def filter_operations_by_preset(task: Task, preset: str):
    """ Function filter operations by preset, remove "heavy" operations and save
    appropriate ones

    :param task: task for which the operations are requested
    :param preset: name of the preset. The names handled here are 'light',
        'light_tun', 'ultra_light', 'ultra_light_tun' and 'gpu'; for any other
        value the full list of operations is returned unchanged
    :return: operations available for the task after the preset was applied
    """
    # Both "light" presets exclude the same operations, so the collection is
    # declared once and shared between them
    heavy_operations = ('mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean')
    excluded_models_dict = {'light': heavy_operations,
                            'light_tun': heavy_operations}

    # Get data operations and models
    available_operations = get_operations_for_task(task, mode='all')
    available_data_operation = get_operations_for_task(task, mode='data_operation')

    # Exclude "heavy" operations if necessary
    if preset in excluded_models_dict:
        excluded_operations = excluded_models_dict[preset]
        available_operations = [operation for operation in available_operations
                                if operation not in excluded_operations]

    # Save only "light" operations
    if preset in ('ultra_light', 'ultra_light_tun'):
        light_models = ['dt', 'dtreg', 'logit', 'linear', 'lasso', 'ridge', 'knn', 'ar']
        # A set gives constant-time membership checks in the filter below
        included_operations = set(light_models + available_data_operation)
        available_operations = [operation for operation in available_operations
                                if operation in included_operations]

    if preset == 'gpu':
        # The GPU preset ignores the filters above: the result of
        # `suitable_operation` of the GPU repository is returned as is
        repository = OperationTypesRepository().assign_repo('model', 'gpu_models_repository.json')
        available_operations = repository.suitable_operation(task_type=task.task_type)

    return available_operations
def filter_operations_by_preset(task, preset: str):
    """ Function filter operations by preset, remove "heavy" operations and save
    appropriate ones

    :param task: task for which the operations are requested
    :param preset: name of the preset. The names handled here are 'light',
        'light_tun', 'ultra_light' and 'ultra_light_tun'; for any other value
        the full list of operations is returned unchanged
    :return: operations available for the task after the preset was applied
    """
    # Both "light" presets exclude the same operations, so the collection is
    # declared once and shared between them
    heavy_operations = ('mlp', 'svc', 'arima', 'exog', 'text_clean')
    excluded_models_dict = {
        'light': heavy_operations,
        'light_tun': heavy_operations
    }

    # Get data operations and models
    available_operations = get_operations_for_task(task, mode='all')
    available_data_operation = get_operations_for_task(task, mode='data_operations')

    # Exclude "heavy" operations if necessary
    if preset in excluded_models_dict:
        excluded_operations = excluded_models_dict[preset]
        available_operations = [
            operation for operation in available_operations
            if operation not in excluded_operations
        ]

    # Save only "light" operations
    if preset in ('ultra_light', 'ultra_light_tun'):
        light_models = [
            'dt', 'dtreg', 'logit', 'linear', 'lasso', 'ridge', 'knn', 'ar'
        ]
        # A set gives constant-time membership checks in the filter below
        included_operations = set(light_models + available_data_operation)
        available_operations = [
            operation for operation in available_operations
            if operation in included_operations
        ]

    return available_operations
def get_composed_chain(dataset_to_compose, task, metric_function):
    """ Compose a chain for the task with the GP-based composer

    :param dataset_to_compose: data passed to the composer
    :param task: task used to select the models and to create the builder
    :param metric_function: metric passed to the composer builder
    :return: result of `compose_chain` of the built composer
    """
    # Models that can be used as nodes of the chain for the selected task;
    # the same list is used for primary and secondary nodes
    candidate_models = get_operations_for_task(task=task, mode='models')

    # Requirements of the GP search
    requirements = GPComposerRequirements(
        primary=candidate_models,
        secondary=candidate_models,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        allow_single_operations=False)

    # Parameters of the GP optimiser
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state)

    # Configure the builder step by step and create the GP-based composer
    builder = GPComposerBuilder(task=task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(metric_function)
    builder = builder.with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()

    # Chain generation by composition (noted in the original code as the
    # most time-consuming step)
    return composer.compose_chain(data=dataset_to_compose, is_visualise=True)
def test_presets_classification():
    """ Check that presets shrink the set of classification operations:
    'ultra_light' is smaller than 'light', 'light' is smaller than the full set,
    and 'ultra_light' still contains 'dt', 'logit' and 'knn'
    """
    task = Task(TaskTypesEnum.classification)

    all_operations = get_operations_for_task(task, mode='all')
    light_operations = filter_operations_by_preset(task, 'light')
    ultra_light_operations = filter_operations_by_preset(task, 'ultra_light')

    light_count = len(light_operations)
    assert len(ultra_light_operations) < light_count
    assert light_count < len(all_operations)
    assert set(ultra_light_operations).issuperset({'dt', 'logit', 'knn'})
def test_presets_regression():
    """ Check presets for regression: 'ultra_light' is smaller than 'light',
    'light' has as many operations as the full set, and 'ultra_light' still
    contains 'dtreg', 'lasso', 'ridge' and 'linear'
    """
    task = Task(TaskTypesEnum.regression)

    all_operations = get_operations_for_task(task, mode='all')
    light_operations = filter_operations_by_preset(task, 'light')
    ultra_light_operations = filter_operations_by_preset(task, 'ultra_light')

    light_count = len(light_operations)
    assert len(ultra_light_operations) < light_count
    assert light_count == len(all_operations)
    assert set(ultra_light_operations).issuperset({'dtreg', 'lasso', 'ridge', 'linear'})
def _divide_operations(available_operations, task):
    """ Function divide operations for primary and secondary

    :param available_operations: operations allowed for the composing
    :param task: task to solve, its `task_type` defines how the operations are divided
    :return: tuple (primary operations, secondary operations). For time series
        forecasting the primary operations are the data operations with the
        "ts_specific" tag except 'exog_ts_data_source'; for other tasks both
        elements are `available_operations`
    """
    if task.task_type == TaskTypesEnum.ts_forecasting:
        ts_data_operations = get_operations_for_task(task=task, mode='data_operation',
                                                     tags=["ts_specific"])
        # Remove exog data operation from the list. Filtering builds a new list,
        # so the list returned by get_operations_for_task is not modified and a
        # missing 'exog_ts_data_source' does not raise ValueError
        primary_operations = [operation for operation in ts_data_operations
                              if operation != 'exog_ts_data_source']
        return primary_operations, available_operations

    return available_operations, available_operations
def set_default_composer_params(self):
    """ Method set metrics and composer requirements

    Only the values that are not set in the composer yet are filled in:
    requirements are built from all operations available for the task, the
    metric is RMSE for regression and time series forecasting and
    ROCAUC_penalty otherwise.
    """
    if not self._composer.composer_requirements:
        # Every operation available for the task can be a primary and a secondary node
        task_operations = get_operations_for_task(task=self.task, mode='all')

        # Set protected attributes to composer
        self._composer.composer_requirements = GPComposerRequirements(
            primary=task_operations, secondary=task_operations)

    if self._composer.metrics:
        # Metrics are already defined, nothing to do
        return

    tasks_with_rmse = (TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting)
    if self.task.task_type in tasks_with_rmse:
        default_metric = RegressionMetricsEnum.RMSE
    else:
        default_metric = ClassificationMetricsEnum.ROCAUC_penalty

    # Set metric
    self._composer.metrics = [default_metric]
def is_pipeline_contains_ts_operations(pipeline: 'Pipeline'):
    """ Function checks is the model contains operations for time series forecasting

    :param pipeline: pipeline whose nodes are checked
    :return: True if at least one node of the pipeline has an operation type
        with the tag "ts_specific", False otherwise
    """
    # Get time series specific operations with tag "ts_specific"
    ts_operations = get_operations_for_task(task=Task(TaskTypesEnum.ts_forecasting),
                                            tags=["ts_specific"], mode='all')

    # Operation types used in the considered pipeline
    operations_in_pipeline = {node.operation.operation_type for node in pipeline.nodes}

    # The pipeline contains ts operations when the two collections intersect
    return not operations_in_pipeline.isdisjoint(ts_operations)
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False,
                               with_tuning=False,
                               cache_path=None):
    """ Compose a chain for the credit scoring (classification) problem with the
    GP-based composer and estimate it on the validation data

    :param train_file_path: path to the csv file with the data for composing and fitting
    :param test_file_path: path to the csv file with the validation data
    :param max_lead_time: time limit passed to the composer requirements
    :param is_visualise: if True, the composing history and the best chain are visualised
    :param with_tuning: tuning is not implemented yet, a truthy value raises
        NotImplementedError
    :param cache_path: if set, it is passed to the builder with `with_cache`
    :return: value returned by `calculate_validation_metric` for the composed
        chain (printed as ROC AUC)
    :raises NotImplementedError: if `with_tuning` is set
    """
    if with_tuning:
        # TODO Add tuning
        # Fail before any work is done: previously the error was raised only
        # after the whole (time-consuming) composition had finished
        raise NotImplementedError('Tuning is not supported')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice of the metric for the chain quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types, max_arity=3,
        max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)
    builder = (GPComposerBuilder(task=task)
               .with_requirements(composer_requirements)
               .with_metrics(metric_function)
               .with_optimiser_parameters(optimiser_parameters)
               .with_logger(logger=logger))

    if cache_path:
        builder = builder.with_cache(cache_path)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    # NOTE(review): is_visualise is hard-coded to True here regardless of the
    # function argument - confirm whether this is intended
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    chain_evo_composed.fit(input_data=dataset_to_compose)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.debug('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.debug('History visualization finished')

        composer.log.debug('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.debug('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed,
                                                            dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
def compose_fedot_model(train_data: [InputData, MultiModalData],
                        task: Task,
                        logger: Log,
                        max_depth: int,
                        max_arity: int,
                        pop_size: int,
                        num_of_generations: int,
                        available_operations: list = None,
                        composer_metric=None,
                        timeout: float = 5,
                        with_tuning=False,
                        tuner_metric=None,
                        cv_folds: Optional[int] = None,
                        validation_blocks: int = None,
                        initial_pipeline=None
                        ):
    """ Function for composing FEDOT pipeline

    :param train_data: data for composing (and tuning) of the pipeline
    :param task: task to solve
    :param logger: logger used for messages, it is also assigned to the composed pipelines
    :param max_depth: max depth passed to the composer requirements
    :param max_arity: max arity passed to the composer requirements
    :param pop_size: population size passed to the composer requirements
    :param num_of_generations: number of generations passed to the composer requirements
    :param available_operations: operations for composing; if None, the
        operations with mode 'model' available for the task are used
    :param composer_metric: metric passed to `_obtain_metric`
    :param timeout: time limit in minutes; it is split in half between composing
        and tuning when `with_tuning` is set
    :param with_tuning: whether to tune the hyperparameters after composing
    :param tuner_metric: name of the metric for the tuner; if None, the default
        metric for the task type is used
    :param cv_folds: number of folds passed to the composer requirements
    :param validation_blocks: number of validation blocks passed to the composer requirements
    :param initial_pipeline: initial pipeline passed to the composer builder
    :return: tuple (pipeline, best candidates, optimisation history)
    """
    metric_function = _obtain_metric(task, composer_metric)

    if available_operations is None:
        available_operations = get_operations_for_task(task, mode='model')

    logger.message(f'Composition started. Parameters tuning: {with_tuning}. '
                   f'Set of candidate models: {available_operations}. Composing time limit: {timeout} min')

    primary_operations, secondary_operations = _divide_operations(available_operations,
                                                                  task)

    # Half of the time is reserved for the tuning stage when it is requested
    timeout_for_composing = timeout / 2 if with_tuning else timeout
    # the choice and initialisation of the GP composer
    composer_requirements = \
        GPComposerRequirements(primary=primary_operations,
                               secondary=secondary_operations,
                               max_arity=max_arity,
                               max_depth=max_depth,
                               pop_size=pop_size,
                               num_of_generations=num_of_generations,
                               cv_folds=cv_folds,
                               validation_blocks=validation_blocks,
                               timeout=datetime.timedelta(minutes=timeout_for_composing))

    optimizer_parameters = GPGraphOptimiserParameters(genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    # Create GP-based composer
    builder = _get_gp_composer_builder(task=task,
                                       metric_function=metric_function,
                                       composer_requirements=composer_requirements,
                                       optimizer_parameters=optimizer_parameters,
                                       data=train_data,
                                       initial_pipeline=initial_pipeline,
                                       logger=logger)

    gp_composer = builder.build()

    logger.message('Pipeline composition started')
    pipeline_gp_composed = gp_composer.compose_pipeline(data=train_data)

    pipeline_for_return = pipeline_gp_composed

    if isinstance(pipeline_gp_composed, list):
        # Several pipelines were obtained: the first one is returned and the
        # optimiser archive is used as the list of the best candidates
        for pipeline in pipeline_gp_composed:
            pipeline.log = logger
        pipeline_for_return = pipeline_gp_composed[0]
        best_candidates = gp_composer.optimiser.archive
    else:
        best_candidates = [pipeline_gp_composed]
        pipeline_gp_composed.log = logger

    if with_tuning:
        logger.message('Hyperparameters tuning started')

        if tuner_metric is None:
            logger.message('Default loss function was set')
            # Default metric for tuner
            tune_metrics = TunerMetricByTask(task.task_type)
            tuner_loss, loss_params = tune_metrics.get_metric_and_params(train_data)
        else:
            # Get metric and parameters by name
            tuner_loss, loss_params = tuner_metric_by_name(metric_name=tuner_metric,
                                                           train_data=train_data,
                                                           task=task)

        # NOTE(review): a None timeout would already fail on the arithmetic
        # with `timeout` (here and when the composing time limit is built), so
        # the 20-iterations branch looks unreachable - confirm the intent
        iterations = 20 if timeout is None else 1000
        timeout_for_tuning = timeout / 2

        # Tune all nodes in the pipeline
        vb_number = composer_requirements.validation_blocks
        folds = composer_requirements.cv_folds
        # Warn only when cross-validation was actually requested: previously
        # the warning was logged for every non-ts task even if folds was None
        if folds is not None and train_data.task.task_type != TaskTypesEnum.ts_forecasting:
            # TODO remove after implementation of CV for class/regr
            # NOTE(review): the message mentions ts-forecasting, although the
            # condition is true for the other task types - confirm the wording
            logger.warn('Cross-validation is not supported for tuning of ts-forecasting pipeline: '
                        'hold-out validation used instead')
            folds = None
        pipeline_for_return = pipeline_for_return.fine_tune_all_nodes(loss_function=tuner_loss,
                                                                      loss_params=loss_params,
                                                                      input_data=train_data,
                                                                      iterations=iterations,
                                                                      timeout=timeout_for_tuning,
                                                                      cv_folds=folds,
                                                                      validation_blocks=vb_number)

    logger.message('Model composition finished')

    history = gp_composer.optimiser.history

    return pipeline_for_return, best_candidates, history
def compose_fedot_model(train_data: InputData,
                        task: Task,
                        logger: Log,
                        max_depth: int,
                        max_arity: int,
                        pop_size: int,
                        num_of_generations: int,
                        available_operations: list = None,
                        composer_metric=None,
                        learning_time: float = 5,
                        with_tuning=False,
                        tuner_metric=None):
    """ Function for composing FEDOT chain model

    :param train_data: data for composing (and tuning) of the chain
    :param task: task to solve
    :param logger: logger used for messages, it is also assigned to the composed chains
    :param max_depth: max depth passed to the composer requirements
    :param max_arity: max arity passed to the composer requirements
    :param pop_size: population size passed to the composer requirements
    :param num_of_generations: number of generations passed to the composer requirements
    :param available_operations: operations for composing; if None, the
        operations with mode 'models' available for the task are used
    :param composer_metric: metric passed to `_obtain_metric`
    :param learning_time: time limit in minutes; it is split in half between
        composing and tuning when `with_tuning` is set
    :param with_tuning: whether to tune the hyperparameters after composing
    :param tuner_metric: name of the metric for the tuner; if None, the default
        metric for the task type is used
    :return: tuple (chain, best candidates)
    """
    metric_function = _obtain_metric(task, composer_metric)

    if available_operations is None:
        available_operations = get_operations_for_task(task, mode='models')

    logger.message(
        f'Composition started. Parameters tuning: {with_tuning}. '
        f'Set of candidate models: {available_operations}. Composing time limit: {learning_time} min'
    )

    primary_operations, secondary_operations = _divide_operations(
        available_operations, task)

    # Half of the time is kept for the tuning stage when it is requested
    if with_tuning:
        composing_minutes = learning_time / 2
    else:
        composing_minutes = learning_time

    # Requirements of the GP composer
    requirements = GPComposerRequirements(
        primary=primary_operations,
        secondary=secondary_operations,
        max_arity=max_arity,
        max_depth=max_depth,
        pop_size=pop_size,
        num_of_generations=num_of_generations,
        max_lead_time=datetime.timedelta(minutes=composing_minutes),
        allow_single_operations=False)

    # Parameters of the optimiser: genetic scheme and allowed operators
    allowed_mutations = [
        MutationTypesEnum.parameter_change,
        MutationTypesEnum.simple,
        MutationTypesEnum.reduce,
        MutationTypesEnum.growth,
        MutationTypesEnum.local_growth
    ]
    allowed_crossovers = [
        CrossoverTypesEnum.one_point,
        CrossoverTypesEnum.subtree
    ]
    optimiser_params = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free,
        mutation_types=allowed_mutations,
        crossover_types=allowed_crossovers)

    # Create GP-based composer
    gp_composer = _get_gp_composer_builder(
        task=task,
        metric_function=metric_function,
        composer_requirements=requirements,
        optimizer_parameters=optimiser_params,
        logger=logger).build()

    logger.message('Model composition started')
    composed = gp_composer.compose_chain(data=train_data)

    if not isinstance(composed, list):
        # A single chain was obtained
        chain_for_return = composed
        best_candidates = [composed]
        composed.log = logger
    else:
        # Several chains were obtained: the first one is returned and the
        # optimiser archive is used as the list of the best candidates
        for candidate in composed:
            candidate.log = logger
        chain_for_return = composed[0]
        best_candidates = gp_composer.optimiser.archive

    if with_tuning:
        logger.message('Hyperparameters tuning started')

        if tuner_metric is not None:
            # Get metric and parameters by name
            tuner_loss, loss_params = tuner_metric_by_name(
                metric_name=tuner_metric, train_data=train_data, task=task)
        else:
            logger.message('Default loss function was set')
            # Default metric for tuner
            default_tuner_metric = TunerMetricByTask(task.task_type)
            tuner_loss, loss_params = default_tuner_metric.get_metric_and_params(
                train_data)

        if learning_time is None:
            iterations = 20
        else:
            iterations = 1000
        tuning_minutes = learning_time / 2

        # Tune all nodes in the chain
        chain_for_return.fine_tune_all_nodes(
            loss_function=tuner_loss,
            loss_params=loss_params,
            input_data=train_data,
            iterations=iterations,
            max_lead_time=tuning_minutes)

    logger.message('Model composition finished')

    return chain_for_return, best_candidates
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False):
    """ Compose chains for the credit scoring (classification) problem with two
    metrics (ROC AUC and number of nodes), tune and fit every composed chain and
    estimate it on the validation data

    :param train_file_path: path to the csv file with the data for composing and fitting
    :param test_file_path: path to the csv file with the validation data
    :param max_lead_time: time limit passed to the composer requirements
    :param is_visualise: if True, the composed chains and the history are visualised
    :return: the best (maximal) validation metric among the composed chains
    """
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # Models provided by the framework that can be used as nodes of a chain
    candidate_models = get_operations_for_task(task=task, mode='models')

    # Metrics for the composition: quality first, complexity second
    metrics = [ClassificationMetricsEnum.ROCAUC, ComplexityMetricsEnum.node_num]

    # Requirements of the GP search
    requirements = GPComposerRequirements(
        primary=candidate_models,
        secondary=candidate_models,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time,
        start_depth=2,
        allow_single_operations=False)

    # Parameters of the GP optimiser
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free,
        selection_types=[SelectionTypesEnum.spea2])

    # Configure the builder step by step and create the GP-based composer
    builder = GPComposerBuilder(task=task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(metrics)
    builder = builder.with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()

    # Chains generation by composition (noted in the original code as the
    # most time-consuming step)
    chains_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                 is_visualise=True)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        results_visualization(composed_chains=chains_evo_composed,
                              history=composer.history)

    validation_scores = []
    for position, chain in enumerate(chains_evo_composed, start=1):
        chain.fine_tune_primary_nodes(input_data=dataset_to_compose,
                                      iterations=50)
        chain.fit(input_data=dataset_to_compose)

        # Quality assessment of the obtained composite model
        roc_on_valid = calculate_validation_metric(chain, dataset_to_validate)
        validation_scores.append(roc_on_valid)

        # The chain number is printed only when there are several chains
        if len(chains_evo_composed) <= 1:
            print(f'Composed ROC AUC is {round(roc_on_valid, 3)}')
        else:
            print(
                f'Composed ROC AUC of chain {position} is {round(roc_on_valid, 3)}'
            )

    return max(validation_scores)
def __init__(self, task=None):
    """ Store the operations available for the task and initialise the parent class

    :param task: task passed to get_operations_for_task and to the parent initialiser
    """
    # Operations are requested separately for the modes 'model' and
    # 'data_operation'; both attributes are set before the parent
    # initialiser is called
    self.models = get_operations_for_task(task, mode='model')
    self.data_operations = get_operations_for_task(task, mode='data_operation')
    super().__init__(task)
def has_no_data_flow_conflicts_in_ts_pipeline(pipeline: 'Pipeline'):
    """ Function checks the correctness of connection between nodes

    Every "parent -> node" connection of the pipeline is passed to
    `__check_connection` together with the parents that are forbidden for the
    operation of the node. Pipelines without time series specific operations
    are not checked.

    :param pipeline: pipeline to check
    :return: True if the pipeline has no ts operations or all the checks were
        performed (the reaction on a wrong connection is defined by
        `__check_connection`)
    """
    task = Task(TaskTypesEnum.ts_forecasting)
    if not is_pipeline_contains_ts_operations(pipeline):
        return True

    models = get_operations_for_task(task=task, mode='model')
    # Preprocessing not only for time series
    non_ts_data_operations = get_operations_for_task(task=task,
                                                     mode='data_operation',
                                                     forbidden_tags=["ts_specific"])
    ts_data_operations = get_operations_for_task(task=task,
                                                 mode='data_operation',
                                                 tags=["ts_specific"])
    # Remove lagged transformation and exog data source. Filtering builds a new
    # list, so the list returned by get_operations_for_task is not modified and
    # a missing operation does not raise ValueError
    allowed_ts_parents = ('lagged', 'exog_ts_data_source')
    ts_data_operations = [operation for operation in ts_data_operations
                          if operation not in allowed_ts_parents]

    # Parents forbidden for the operations that work with the time series itself
    forbidden_for_ts_operations = models + non_ts_data_operations + ['lagged']
    operations_on_series = ('lagged', 'ar', 'arima')
    # Operations that must not receive the output of ts data operations
    operations_on_tables = ('ridge', 'linear', 'lasso', 'dtreg', 'knnreg', 'scaling',
                            'xgbreg', 'adareg', 'gbr', 'treg', 'rfr', 'svr', 'sgdr',
                            'normalization', 'simple_imputation', 'pca', 'kernel_pca',
                            'poly_features', 'ransac_lin_reg', 'ransac_non_lin_reg',
                            'rfe_lin_reg', 'rfe_non_lin_reg')

    # Dictionary as {'current operation in the node': 'parent operations list'}
    wrong_connections = {operation: forbidden_for_ts_operations
                         for operation in operations_on_series}
    wrong_connections.update({operation: ts_data_operations
                              for operation in operations_on_tables})

    for node in pipeline.nodes:
        # The rule depends only on the operation in the current node, so it is
        # looked up once per node and not once per parent
        forbidden_parents = wrong_connections.get(node.operation.operation_type)
        parent_nodes = node.nodes_from

        if forbidden_parents is None or parent_nodes is None:
            # No rule for the operation or the node has no parents
            continue

        # There are several parents for current node or at least 1
        for parent in parent_nodes:
            __check_connection(parent.operation.operation_type, forbidden_parents)

    return True
def run_multi_modal_case(files_path, is_visualise=False, timeout=datetime.timedelta(minutes=2)):
    """ Compose and fit a pipeline for the multi modal classification case and
    estimate it on the validation sample

    :param files_path: path passed to `prepare_multi_modal_data`
    :param is_visualise: if True, the composed pipeline is shown
    :param timeout: time limit passed to the composer requirements
    :return: value returned by `calculate_validation_metric` (printed as ROC AUC)
    """
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    train_num, test_num, train_img, test_img, train_text, test_text = \
        prepare_multi_modal_data(files_path, task, images_size)

    initial_pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(
        images_size,
        train_num, test_num,
        train_img, test_img,
        train_text, test_text)

    # Models provided by the framework that can be used as nodes of a pipeline
    candidate_models = get_operations_for_task(task=task, mode='model')

    # Requirements of the GP search
    requirements = GPComposerRequirements(
        primary=candidate_models,
        secondary=candidate_models,
        max_arity=3,
        max_depth=3,
        pop_size=5,
        num_of_generations=5,
        crossover_prob=0.8,
        mutation_prob=0.8,
        timeout=timeout)

    # Parameters of the GP optimiser
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    logger = default_log('FEDOT logger', verbose_level=4)

    # Configure the builder step by step; the multi modal template (with data
    # sources) is passed as the initial assumption for the composer
    builder = GPComposerBuilder(task=task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(ClassificationMetricsEnum.ROCAUC_penalty)
    builder = builder.with_optimiser_parameters(optimiser_parameters)
    builder = builder.with_logger(logger=logger)
    builder = builder.with_initial_pipeline(initial_pipeline)
    builder = builder.with_cache('multi_modal_opt.cache')

    # Create GP-based composer
    composer = builder.build()

    # Pipeline generation by composition (noted in the original code as the
    # most time-consuming step)
    composed_pipeline = composer.compose_pipeline(data=fit_data, is_visualise=True)
    composed_pipeline.fit(input_data=fit_data)

    if is_visualise:
        composed_pipeline.show()

    prediction = composed_pipeline.predict(predict_data)

    err = calculate_validation_metric(prediction, test_num)
    print(f'ROC AUC for validation sample is {err}')

    return err