def set_default_composer_params(self):
    """Fill in missing composer requirements and metrics with task-based defaults."""
    composer = self._composer
    if not composer.composer_requirements:
        # Any model suitable for the task may serve as a primary or secondary node.
        suitable, _ = ModelTypesRepository().suitable_model(task_type=self.task.task_type)
        composer.composer_requirements = GPComposerRequirements(primary=suitable,
                                                                secondary=suitable)
    if not composer.metrics:
        # RMSE for regression-like tasks, penalised ROC AUC otherwise.
        regression_like = self.task.task_type in (TaskTypesEnum.regression,
                                                  TaskTypesEnum.ts_forecasting)
        metric_id = (RegressionMetricsEnum.RMSE if regression_like
                     else ClassificationMetricsEnum.ROCAUC_penalty)
        composer.metrics = MetricsRepository().metric_by_id(metric_id)
def get_metrics(self,
                target: Union[np.ndarray, pd.Series] = None,
                metric_names: Union[str, List[str]] = None) -> dict:
    """ Get quality metrics for the fitted graph

    :param target: the array with target values of test data
    :param metric_names: the names of required metrics
    :return: the values of quality metrics
    """
    if metric_names is None:
        metric_names = self.metric_name
    if target is not None:
        if self.test_data is None:
            # No test data stored yet: build it around the cached prediction,
            # trimming the supplied target to the prediction length.
            self.test_data = InputData(
                idx=range(len(self.prediction.predict)),
                features=None,
                target=target[:len(self.prediction.predict)],
                task=self.train_data.task,
                data_type=self.train_data.data_type)
        else:
            # Re-use the stored test data, only replacing its target.
            self.test_data.target = target[:len(self.prediction.predict)]
    # NOTE: `real` aliases self.test_data — the mutations below for the
    # ts_forecasting branch modify the stored test data in place.
    real = self.test_data
    # TODO change to sklearn metrics
    if not isinstance(metric_names, List):
        metric_names = [metric_names]
    calculated_metrics = dict()
    for metric_name in metric_names:
        if composer_metrics_mapping[metric_name] is NotImplemented:
            self.log.warn(f'{metric_name} is not available as metric')
        else:
            prediction = self.prediction
            metric_cls = MetricsRepository().metric_class_by_id(
                composer_metrics_mapping[metric_name])
            # Some metrics are evaluated on class labels rather than
            # on raw predictions/probabilities.
            if metric_cls.output_mode == 'labels':
                prediction = self.prediction_labels
            if self.problem.task_type == TaskTypesEnum.ts_forecasting:
                # Drop NaN forecast positions so target and prediction stay aligned.
                real.target = real.target[~np.isnan(prediction.predict)]
                prediction.predict = prediction.predict[
                    ~np.isnan(prediction.predict)]
            # abs() is applied, presumably because minimised metrics are stored
            # with a sign convention by the optimiser — TODO confirm.
            metric_value = abs(
                metric_cls.metric(reference=real, predicted=prediction))
            calculated_metrics[metric_name] = metric_value
    return calculated_metrics
def test_random_composer(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    dataset = request.getfixturevalue(data_fixture)
    # The same dataset is used for both composing and validation in this test.
    compose_data = dataset
    validate_data = dataset

    model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    composer = RandomSearchComposer(iter_num=1)
    requirements = ComposerRequirements(primary=model_types,
                                        secondary=model_types)
    composed_chain = composer.compose_chain(data=compose_data,
                                            initial_chain=None,
                                            composer_requirements=requirements,
                                            metrics=quality_metric)
    composed_chain.fit_from_scratch(input_data=compose_data)

    prediction = composed_chain.predict(validate_data)
    roc_value = roc_auc(y_true=validate_data.target,
                        y_score=prediction.predict)
    assert roc_value > 0.6
def test_composer_cv_correct():
    """ Checks if the composer works correctly when using cross validation for
    time series """
    fold_count = 2
    _, forecast_len, validation_blocks, time_series = configure_experiment()
    primary_ops, secondary_ops = get_available_operations()

    # Composer parameters: tiny population and short timeout keep the test fast.
    requirements = GPComposerRequirements(
        primary=primary_ops, secondary=secondary_ops,
        max_arity=3, max_depth=3, pop_size=2, num_of_generations=2,
        crossover_prob=0.8, mutation_prob=0.8,
        timeout=datetime.timedelta(seconds=5),
        cv_folds=fold_count, validation_blocks=validation_blocks)

    starting_pipeline = get_simple_ts_pipeline()
    rmse_metric = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    composer = GPComposerBuilder(task=time_series.task). \
        with_requirements(requirements). \
        with_metrics(rmse_metric).with_initial_pipeline(starting_pipeline). \
        build()
    result = composer.compose_pipeline(data=time_series, is_visualise=False)

    assert isinstance(result, Pipeline)
def get_model(train_file_path: str, cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    """Compose and fit a classification chain for the dataset in *train_file_path*.

    :param train_file_path: csv file with the training data
    :param cur_lead_time: time budget for the composing process
    :return: the composed and fitted chain
    """
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # The search of the models provided by the framework that can be used
    # as nodes in a chain for the selected task ('simple' models only).
    repository = ModelTypesRepository()
    suitable_types, _ = repository.suitable_model(task_type=task.task_type,
                                                  tags=['simple'])

    quality_metric = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    requirements = GPComposerRequirements(primary=suitable_types,
                                          secondary=suitable_types,
                                          max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that searches for
    # the optimal structure of the composite model.
    composer = GPComposerBuilder(task). \
        with_requirements(requirements). \
        with_metrics(quality_metric). \
        build()

    # Run the search of the best suitable model.
    composed_chain = composer.compose_chain(data=dataset_to_compose,
                                            is_visualise=False)
    composed_chain.fit(input_data=dataset_to_compose)

    return composed_chain
def test_parameter_free_composer_build_chain_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    dataset = request.getfixturevalue(data_fixture)
    # Compose and validate on the same dataset within this test.
    compose_data = dataset
    validate_data = dataset

    model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types, max_arity=2, max_depth=2,
        pop_size=2, num_of_generations=1, crossover_prob=0.4, mutation_prob=0.5)
    # The parameter-free scheme adapts its own evolutionary settings.
    optimiser_params = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    composer = GPComposerBuilder(task=Task(TaskTypesEnum.classification)). \
        with_requirements(requirements). \
        with_metrics(quality_metric). \
        with_optimiser_parameters(optimiser_params). \
        build()
    composed_chain = composer.compose_chain(data=compose_data)
    composed_chain.fit_from_scratch(input_data=compose_data)

    prediction = composed_chain.predict(validate_data)
    roc_value = roc_auc(y_true=validate_data.target,
                        y_score=prediction.predict)
    assert roc_value > 0.6
def _check_so_improvements(self, offspring: List[Any]) -> Tuple[bool, bool]:
    """Report whether the offspring improves fitness and/or reduces complexity.

    :param offspring: candidate individuals of the current generation
    :return: (fitness_improved, complexity_decreased) flags
    """
    node_count = MetricsRepository().metric_by_id(ComplexityMetricsEnum.node_num)
    champion = self.get_best_individual(offspring,
                                        equivalents_from_current_pop=False)
    incumbent = self.best_individual
    # Fitness comparison uses '<' — a smaller value counts as an improvement.
    fitness_improved = champion.fitness < incumbent.fitness
    # Complexity only counts as decreased when fitness has not degraded.
    complexity_decreased = (node_count(champion) < node_count(incumbent)
                            and champion.fitness <= incumbent.fitness)
    return fitness_improved, complexity_decreased
def test_structural_quality_correct():
    chain = default_valid_chain()
    structural_metric = MetricsRepository().metric_by_id(
        ComplexityMetricsEnum.structural)
    # The default valid chain must not exceed the expected structural complexity.
    upper_bound = 13
    assert structural_metric(chain) <= upper_bound
def test_regression_quality_metric(data_setup):
    train, _ = data_setup
    chain = default_valid_chain()
    chain.fit(input_data=train)

    rmse = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)
    rmse_value = rmse(chain=chain, reference_data=train)

    rmse_with_penalty = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE_penalty)
    penalised_value = rmse_with_penalty(chain=chain, reference_data=train)

    assert rmse_value > 0
    assert penalised_value > 0
    # The penalised variant must be strictly larger than the plain RMSE.
    assert rmse_value < penalised_value
def test_classification_quality_metric(data_setup):
    train, _ = data_setup
    chain = default_valid_chain()
    chain.fit(input_data=train)

    roc = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)
    roc_value = roc(chain=chain, reference_data=train)

    roc_with_penalty = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)
    penalised_value = roc_with_penalty(chain=chain, reference_data=train)

    # Both magnitudes should land strictly between chance level and perfect score.
    assert 0.5 < abs(roc_value) < 1.0
    assert 0.5 < abs(penalised_value) < 1.0
    assert roc_value < penalised_value
def test_gp_composer_builder():
    """Check that GPComposerBuilder applies custom parameters and sane defaults."""
    task = Task(TaskTypesEnum.classification)

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=5, num_of_generations=4,
        crossover_prob=0.8, mutation_prob=1,
        max_lead_time=datetime.timedelta(minutes=5))

    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    builder_with_custom_params = GPComposerBuilder(task=task). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function). \
        with_optimiser_parameters(optimiser_parameters)
    composer_with_custom_params = builder_with_custom_params.build()

    # The builder must propagate every custom setting into the composer.
    assert composer_with_custom_params.optimiser.parameters.genetic_scheme_type == scheme_type
    assert composer_with_custom_params.metrics == metric_function
    assert composer_with_custom_params.composer_requirements.pop_size == 5
    assert composer_with_custom_params.composer_requirements.mutation_prob == 1

    builder_with_default_params = GPComposerBuilder(task=task)
    composer_with_default_params = builder_with_default_params.build()

    # BUG FIX: was `ClassificationMetricsEnum.ROCAUC.ROCAUC_penalty` —
    # member-of-member enum access (deprecated in Python 3.11, removed in 3.12).
    # The intended metric id is the ROCAUC_penalty member itself.
    default_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    assert composer_with_default_params.optimiser.parameters.genetic_scheme_type == GeneticSchemeTypesEnum.generational
    assert composer_with_default_params.metrics == default_metric
    assert composer_with_default_params.composer_requirements.pop_size == 20
    assert composer_with_default_params.composer_requirements.mutation_prob == 0.8
def test_regression_quality_metric(data_setup):
    train, _ = data_setup
    pipeline = default_valid_pipeline()
    pipeline.fit(input_data=train)

    # Every regression metric in the repository must yield a positive value.
    for metric_id in RegressionMetricsEnum:
        metric = MetricsRepository().metric_by_id(metric_id)
        assert metric(pipeline=pipeline, reference_data=train) > 0
def test_classification_quality_metric(data_setup):
    train, _ = data_setup
    chain = default_valid_chain()
    chain.fit(input_data=train)

    # Every classification metric should produce a finite non-zero magnitude.
    for metric_id in ClassificationMetricsEnum:
        metric = MetricsRepository().metric_by_id(metric_id)
        value = metric(chain=chain, reference_data=train)
        assert 0 < abs(value) < sys.maxsize
def run_class_scoring_case(sa_class: str, is_composed: bool = False, path_to_save=None):
    """Prepare data, task and metric for the scoring classification SA case.

    NOTE(review): the visible body appears truncated — `case` is assigned but
    never used, and `test_data`/`path_to_save` are unused here; confirm against
    the original source.
    """
    train_data, test_data = get_scoring_data()
    task = Task(TaskTypesEnum.classification)
    # the choice of the metric for the pipeline quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)
    if is_composed:
        case = f'scoring_composed_{sa_class}'
        is_composed = True
def run_regr_case(sa_class: str, is_composed: bool = False, path_to_save=None):
    """Prepare data, task and metric for the cholesterol regression SA case.

    NOTE(review): the visible body appears truncated — `case` is assigned but
    never used, and `test_data`/`path_to_save` are unused here; confirm against
    the original source.
    """
    train_data, test_data = get_cholesterol_data()
    task = Task(TaskTypesEnum.regression)
    # the choice of the metric for the pipeline quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)
    if is_composed:
        case = f'cholesterol_composed_{sa_class}'
        is_composed = True
def test_regression_quality_metric(data_setup):
    train, _ = data_setup
    chain = default_valid_chain()
    chain.fit(input_data=train)

    # All regression metrics must evaluate to a positive value on the train data.
    for metric_id in RegressionMetricsEnum:
        metric = MetricsRepository().metric_by_id(metric_id)
        assert metric(chain=chain, reference_data=train) > 0
def composer_metric(self, metrics,
                    train_data: Union[InputData, MultiModalData],
                    test_data: Union[InputData, MultiModalData],
                    pipeline: Pipeline) -> Optional[Tuple[Any]]:
    """Fit the pipeline (using the cache when available) and evaluate metrics.

    :param metrics: a metric id or callable, or a list of them
    :param train_data: data used to fit the pipeline
    :param test_data: reference data for metric evaluation
    :param pipeline: pipeline to be assessed
    :return: tuple with one value per metric, or None when assessment failed
    """
    try:
        validate(pipeline)
        pipeline.log = self.log

        if type(metrics) is not list:
            metrics = [metrics]

        if self.cache is not None:
            # TODO improve cache
            pipeline.fit_from_cache(self.cache)

        if not pipeline.is_fitted:
            self.log.debug(
                f'Pipeline {pipeline.root_node.descriptive_id} fit started'
            )
            pipeline.fit(input_data=train_data,
                         time_constraint=self.composer_requirements.
                         max_pipeline_fit_time)
            try:
                # Best effort: a failure to persist the fitted pipeline is
                # logged, not raised.
                self.cache.save_pipeline(pipeline)
            except Exception as ex:
                self.log.info(f'Cache can not be saved: {ex}. Continue.')

        evaluated_metrics = ()
        for metric in metrics:
            # A metric may be a ready-to-call function or a repository id.
            if callable(metric):
                metric_func = metric
            else:
                metric_func = MetricsRepository().metric_by_id(metric)
            evaluated_metrics = evaluated_metrics + (metric_func(
                pipeline, reference_data=test_data), )

        self.log.debug(
            f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
        )

        # enforce memory cleaning
        pipeline.unfit()
        gc.collect()
    except Exception as ex:
        # Failures are swallowed deliberately; the None result is presumably
        # treated as a failed evaluation by the caller — confirm.
        self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
        evaluated_metrics = None
    return evaluated_metrics
def run_regr_case(is_composed: bool = False, path_to_save=None):
    """Launch the cholesterol regression analysis case.

    NOTE(review): placement of the `run_analysis_case` call inside the `if`
    branch is inferred from the flattened source (it hard-codes
    is_composed=True) — confirm against the original layout.
    """
    train_data, test_data = get_cholesterol_data()
    task = Task(TaskTypesEnum.regression)
    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)
    if is_composed:
        case = 'cholesterol_composed'
        run_analysis_case(train_data, test_data, case, task,
                          metric=metric_function,
                          is_composed=True, result_path=path_to_save)
def run_class_kc2_case(is_composed: bool = False, path_to_save=None):
    """Launch the kc2 classification analysis case.

    NOTE(review): placement of the `run_analysis_case` call inside the `if`
    branch is inferred from the flattened source (it hard-codes
    is_composed=True) — confirm against the original layout.
    """
    train_data, test_data = get_kc2_data()
    task = Task(TaskTypesEnum.classification)
    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)
    if is_composed:
        case = 'kc2_composed'
        run_analysis_case(train_data, test_data, case, task,
                          metric=metric_function,
                          is_composed=True, result_path=path_to_save)
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               gp_optimiser_params: Optional[GPChainOptimiserParameters] = None,
                               pop_size=None, generations=None):
    """Compose, fit and validate a chain for the credit-scoring problem.

    :param train_file_path: csv file used for composing and fitting
    :param test_file_path: csv file used for validation
    :param max_lead_time: time budget for the composer
    :param gp_optimiser_params: ready-made optimiser parameters; when None,
        a default set of evolutionary operators is used
    :param pop_size: population size for the composer requirements
    :param generations: number of generations for the composer requirements
    :return: tuple (validation ROC AUC, composed chain, composer)
    """
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    available_model_types, _ = ModelTypesRepository(). \
        suitable_model(task_type=TaskTypesEnum.classification)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    if gp_optimiser_params:
        optimiser_parameters = gp_optimiser_params
    else:
        # Default evolutionary operators for the GP search.
        selection_types = [SelectionTypesEnum.tournament]
        crossover_types = [CrossoverTypesEnum.subtree]
        mutation_types = [MutationTypesEnum.simple, MutationTypesEnum.growth, MutationTypesEnum.reduce]
        regularization_type = RegularizationTypesEnum.decremental
        optimiser_parameters = GPChainOptimiserParameters(selection_types=selection_types,
                                                          crossover_types=crossover_types,
                                                          mutation_types=mutation_types,
                                                          regularization_type=regularization_type)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=4, max_depth=3, pop_size=pop_size, num_of_generations=generations,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    # The composition step is the most time-consuming part of the run.
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function,
                                                optimiser_parameters=optimiser_parameters,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed,
                                                            dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')
    return roc_on_valid_evo_composed, chain_evo_composed, composer
def test_composition_time(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    task = Task(TaskTypesEnum.classification)
    model_names = ['mlp', 'knn']
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    # Evolution that is cut off almost immediately by a tiny time budget.
    terminated_requirements = GPComposerRequirements(
        primary=model_names, secondary=model_names, max_arity=2, max_depth=2,
        pop_size=2, num_of_generations=5, crossover_prob=0.9, mutation_prob=0.9,
        max_lead_time=datetime.timedelta(minutes=0.000001))
    terminated_composer = GPComposerBuilder(task). \
        with_requirements(terminated_requirements). \
        with_metrics(quality_metric).build()
    _ = terminated_composer.compose_chain(data=data)

    # Evolution that is allowed to run to completion.
    completed_requirements = GPComposerRequirements(
        primary=model_names, secondary=model_names, max_arity=2, max_depth=2,
        pop_size=2, num_of_generations=2, crossover_prob=0.4, mutation_prob=0.5)
    completed_composer = GPComposerBuilder(task). \
        with_requirements(completed_requirements). \
        with_metrics(quality_metric).build()
    _ = completed_composer.compose_chain(data=data)

    # Both runs should record the same number of chains in their histories.
    assert len(terminated_composer.history.chains) == len(
        completed_composer.history.chains)
def _obtain_metric(task: Task, composer_metric: Union[str, Callable]):
    """Resolve a metric specification into a list of metric functions.

    :param task: task whose type selects the default metric when none is given
    :param composer_metric: a metric name, a callable, or a list of either
    :return: list of callable metric functions
    :raises ValueError: when a metric name is not present in the mapping
    """
    # the choice of the metric for the pipeline quality assessment during composition
    if composer_metric is None:
        composer_metric = MetricByTask(task.task_type).metric_cls.get_value

    # A single name/callable is normalised into a one-element list.
    if isinstance(composer_metric, (str, Callable)):
        composer_metric = [composer_metric]

    metric_function = []
    for specific_metric in composer_metric:
        if isinstance(specific_metric, Callable):
            metric_function.append(specific_metric)
            continue
        metric_id = composer_metrics_mapping.get(specific_metric)
        if metric_id is None:
            raise ValueError(f'Incorrect metric {specific_metric}')
        metric_function.append(MetricsRepository().metric_by_id(metric_id))
    return metric_function
def metric_evaluation(pipeline, train_data: InputData, test_data: InputData,
                      metrics: list, evaluated_metrics: list,
                      vb_number: int = None):
    """ Pipeline training and metrics assessment

    :param pipeline: pipeline for validation
    :param train_data: InputData for train
    :param test_data: InputData for validation
    :param metrics: list with metrics for evaluation
    :param evaluated_metrics: list with metrics values
    :param vb_number: number of validation blocks
    """
    pipeline.fit_from_scratch(train_data)

    for position, metric in enumerate(metrics):
        # A metric may be a ready-to-call function or a repository identifier.
        metric_func = metric if callable(metric) else MetricsRepository().metric_by_id(metric)
        value = metric_func(pipeline, reference_data=test_data,
                            validation_blocks=vb_number)
        evaluated_metrics[position].extend([value])
    return evaluated_metrics
def composer_metric(self, metrics, train_data: InputData,
                    test_data: InputData,
                    chain: Chain) -> Optional[Tuple[Any]]:
    """Fit the chain (using the cache when available) and evaluate metrics.

    :param metrics: a metric id or callable, or a list of them
    :param train_data: data used to fit the chain
    :param test_data: reference data for metric evaluation
    :param chain: chain to be assessed
    :return: tuple with one value per metric, or None when assessment failed
    """
    try:
        validate(chain)
        chain.log = self.log

        if type(metrics) is not list:
            metrics = [metrics]

        if self.cache is not None:
            # TODO improve cache
            chain.fit_from_cache(self.cache)

        if not chain.is_fitted:
            self.log.debug(
                f'Chain {chain.root_node.descriptive_id} fit started')
            chain.fit(input_data=train_data,
                      time_constraint=self.composer_requirements.
                      max_chain_fit_time)
            self.cache.save_chain(chain)

        evaluated_metrics = ()
        for metric in metrics:
            # A metric may be a ready-to-call function or a repository id.
            if callable(metric):
                metric_func = metric
            else:
                metric_func = MetricsRepository().metric_by_id(metric)
            evaluated_metrics = evaluated_metrics + (metric_func(
                chain, reference_data=test_data), )

        self.log.debug(
            f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
        )
    except Exception as ex:
        # Failures are swallowed deliberately; the None result is presumably
        # treated as a failed evaluation by the caller — confirm.
        self.log.info(f'Chain assessment warning: {ex}. Continue.')
        evaluated_metrics = None
    return evaluated_metrics
def test_fixed_structure_composer(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    dataset = request.getfixturevalue(data_fixture)
    compose_data = dataset
    validate_data = dataset

    model_names = ['logit', 'lda', 'knn']
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)
    requirements = GPComposerRequirements(
        primary=model_names, secondary=model_names, pop_size=2,
        num_of_generations=1, crossover_prob=0.4, mutation_prob=0.5,
        add_single_model_chains=False)
    reference_chain = get_class_chain()

    composer = FixedStructureComposerBuilder(
        task=Task(TaskTypesEnum.classification)). \
        with_initial_chain(reference_chain). \
        with_metrics(quality_metric). \
        with_requirements(requirements).build()
    composed = composer.compose_chain(data=compose_data)
    composed.fit_from_scratch(input_data=compose_data)

    prediction = composed.predict(validate_data)
    roc_value = roc_auc(y_true=validate_data.target,
                        y_score=prediction.predict)

    assert roc_value > 0.6
    # Fixed-structure composing must preserve the reference topology.
    assert composed.depth == reference_chain.depth
    assert composed.length == reference_chain.length
def run_ts_forecasting_problem(forecast_length=50,
                               with_visualisation=True) -> None:
    """ Function launch time series task with composing

    :param forecast_length: length of the forecast
    :param with_visualisation: is it needed to show the plots
    """
    file_path = '../cases/data/metocean/metocean_data_test.csv'
    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    # Train/test split: the tail of the series is held out for validation.
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    # Prepare data for train and test
    train_input, predict_input, task = prepare_train_test_input(
        train_part, forecast_length)

    # Get chain with pre-defined structure
    init_chain = get_source_chain()

    # Init check: evaluate the hand-made chain before composing.
    preds = fit_predict_for_chain(chain=init_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)
    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    # Get available_operations type
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=8, pop_size=10, num_of_generations=15,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=10),
        allow_single_operations=False)

    mutation_types = [
        MutationTypesEnum.parameter_change, MutationTypesEnum.simple,
        MutationTypesEnum.reduce
    ]
    optimiser_parameters = GPChainOptimiserParameters(
        mutation_types=mutation_types)

    # MAE guides the chain quality assessment during composition.
    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters).\
        with_requirements(composer_requirements).\
        with_metrics(metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input,
                                            is_visualise=False)

    ################################
    # Obtained chain visualisation #
    ################################
    if with_visualisation:
        visualiser = ChainVisualiser()
        visualiser.visualise(obtained_chain)

    preds = fit_predict_for_chain(chain=obtained_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)

    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    display_chain_info(obtained_chain)
def run_fedot(params: 'ExecutionParams'):
    """Run (or load a previously saved) FEDOT chain for the benchmark case.

    :param params: execution parameters (data paths, case label, task type)
    :return: tuple (validation targets, predicted values, predicted labels)
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    # Pick the quality metric that matches the task type.
    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    metric_func = MetricsRepository().metric_by_id(metric)

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type.name}_{cur_lead_time}_{metric.name}'
    loaded_model = load_fedot_model(saved_model_name)

    # Compose a new chain only when no saved model exists for this configuration.
    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # the search of the models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        # Exclude models that are too slow for the benchmark time budget.
        heavy_models = ['svc', 'multinb', 'tfidf', 'qda']
        available_model_types = [
            model for model in available_model_types
            if model not in heavy_models
        ]

        # the choice and initialisation of the GP search
        composer_requirements = GPComposerRequirements(
            primary=available_model_types, secondary=available_model_types,
            max_arity=3, max_depth=2, pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8, mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time),
            add_single_model_chains=True)

        # Create GP-based composer
        builder = GPComposerBuilder(task).with_requirements(
            composer_requirements).with_metrics(metric_func)
        gp_composer = builder.build()
        chain_gp_composed = gp_composer.compose_chain(data=dataset_to_compose)

        chain_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
        save_fedot_model(chain_gp_composed, saved_model_name)
    else:
        chain_gp_composed = loaded_model

    evo_predicted = chain_gp_composed.predict(dataset_to_validate)
    evo_predicted_labels = chain_gp_composed.predict(dataset_to_validate,
                                                     output_mode='labels')

    return dataset_to_validate.target, evo_predicted.predict, evo_predicted_labels.predict
def run_ts_forecasting_problem(forecast_length=50,
                               with_visualisation=True,
                               cv_folds=None) -> None:
    """ Function launch time series task with composing

    :param forecast_length: length of the forecast
    :param with_visualisation: is it needed to show the plots
    :param cv_folds: is it needed apply cross validation and what number
    of folds to use
    """
    file_path = '../cases/data/metocean/metocean_data_test.csv'
    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    # Train/test split: the tail of the series is held out for validation.
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    # Prepare data for train and test
    train_input, predict_input, task = prepare_train_test_input(
        train_part, forecast_length)

    # Get pipeline with pre-defined structure
    init_pipeline = get_source_pipeline()

    # Init check: evaluate the hand-made pipeline before composing.
    preds = fit_predict_for_pipeline(pipeline=init_pipeline,
                                     train_input=train_input,
                                     predict_input=predict_input)
    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    # Get available_operations type
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=8, pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        timeout=datetime.timedelta(minutes=10),
        cv_folds=cv_folds, validation_blocks=3)

    mutation_types = [
        parameter_change_mutation, MutationTypesEnum.simple,
        MutationTypesEnum.reduce
    ]
    optimiser_parameters = GPGraphOptimiserParameters(
        mutation_types=mutation_types)

    # RMSE guides the pipeline quality assessment during composition.
    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters).\
        with_requirements(composer_requirements).\
        with_metrics(metric_function).with_initial_pipeline(init_pipeline)
    composer = builder.build()

    obtained_pipeline = composer.compose_pipeline(data=train_input,
                                                  is_visualise=False)

    ###################################
    # Obtained pipeline visualisation #
    ###################################
    if with_visualisation:
        obtained_pipeline.show()

    preds = fit_predict_for_pipeline(pipeline=obtained_pipeline,
                                     train_input=train_input,
                                     predict_input=predict_input)

    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    obtained_pipeline.print_structure()
def run_credit_scoring_problem(
        train_file_path, test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False, with_tuning=False):
    """Compose, optionally tune, fit and validate a chain for credit scoring.

    :param train_file_path: csv file used for composing and fitting
    :param test_file_path: csv file used for validation
    :param max_lead_time: time budget for the composer
    :param is_visualise: whether to visualise the history and the best chain
    :param with_tuning: whether to fine-tune the primary nodes after composing
    :return: ROC AUC of the composed chain on the validation dataset
    """
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
        metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    if with_tuning:
        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50, verbose=True)

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.info('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.info('History visualization finished')
        composer.history.write_composer_history_to_csv()

        composer.log.info('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.info('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
def run_river_composer_experiment(file_path, init_pipeline, file_to_save,
                                  iterations=20, tuner=None):
    """ Function launch experiment for river level prediction. Composing and
    tuner processes are available for such experiment.

    :param file_path: path to the file with river level data
    :param init_pipeline: pipeline to start composing process
    :param file_to_save: path to the file and file name to save report
    :param iterations: amount of iterations to process
    :param tuner: if tuning after composing process is required or not.
    tuner - NodesTuner or PipelineTuner.
    """
    # Read dataframe and prepare train and test data
    data = InputData.from_csv(file_path, target_columns='level_station_2',
                              task=Task(TaskTypesEnum.regression),
                              columns_to_drop=['date'])
    train_input, predict_input = train_test_data_setup(data)
    y_data_test = np.array(predict_input.target)

    available_secondary_operations = ['ridge', 'lasso', 'dtreg',
                                      'xgbreg', 'adareg', 'knnreg',
                                      'linear', 'svr', 'poly_features',
                                      'scaling', 'ransac_lin_reg',
                                      'rfe_lin_reg', 'pca',
                                      'ransac_non_lin_reg',
                                      'rfe_non_lin_reg', 'normalization']
    available_primary_operations = ['one_hot_encoding']

    # Report arrays
    obtained_pipelines = []
    depths = []
    maes = []
    for i in range(0, iterations):
        print(f'Iteration {i}\n')

        composer_requirements = GPComposerRequirements(
            primary=available_primary_operations,
            secondary=available_secondary_operations, max_arity=3,
            max_depth=8, pop_size=10, num_of_generations=5,
            crossover_prob=0.8, mutation_prob=0.8,
            timeout=datetime.timedelta(minutes=5))

        # MAE guides the pipeline quality assessment during composition.
        metric_function = MetricsRepository().metric_by_id(
            RegressionMetricsEnum.MAE)
        builder = GPComposerBuilder(task=data.task). \
            with_requirements(composer_requirements). \
            with_metrics(metric_function).with_initial_pipeline(init_pipeline)
        composer = builder.build()

        obtained_pipeline = composer.compose_pipeline(data=train_input,
                                                      is_visualise=False)

        # Display info about obtained pipeline
        obtained_models, depth = get_pipeline_info(pipeline=obtained_pipeline)

        preds = fit_predict_for_pipeline(pipeline=obtained_pipeline,
                                         train_input=train_input,
                                         predict_input=predict_input)

        # squared=False turns MSE into RMSE.
        mse_value = mean_squared_error(y_data_test, preds, squared=False)
        mae_value = mean_absolute_error(y_data_test, preds)

        print(f'Obtained metrics for current iteration {i}:')
        print(f'RMSE - {mse_value:.2f}')
        print(f'MAE - {mae_value:.2f}\n')

        if tuner is not None:
            print(f'Start tuning process ...')
            pipeline_tuner = tuner(pipeline=obtained_pipeline,
                                   task=data.task, iterations=100)
            tuned_pipeline = pipeline_tuner.tune_pipeline(
                input_data=train_input,
                loss_function=mean_absolute_error)

            preds_tuned = fit_predict_for_pipeline(pipeline=tuned_pipeline,
                                                   train_input=train_input,
                                                   predict_input=predict_input)

            mse_value = mean_squared_error(y_data_test, preds_tuned,
                                           squared=False)
            mae_value = mean_absolute_error(y_data_test, preds_tuned)

            print(f'Obtained metrics for current iteration {i} after tuning:')
            print(f'RMSE - {mse_value:.2f}')
            print(f'MAE - {mae_value:.2f}\n')

        # When the tuner ran, the reported MAE is the post-tuning value.
        obtained_pipelines.append(obtained_models)
        maes.append(mae_value)
        depths.append(depth)

    report = pd.DataFrame({'Pipeline': obtained_pipelines,
                           'Depth': depths,
                           'MAE': maes})
    report.to_csv(file_to_save, index=False)