def test_cache_historical_state_using(data_setup):
    cache = OperationsCache()
    train, _ = data_setup
    pipeline = pipeline_first()

    # pipeline fitted, model goes to cache
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    new_node = SecondaryNode(operation_type='logit')
    old_node = pipeline.root_node.nodes_from[0]

    # change the child node to a new one
    pipeline.update_node(old_node=old_node, new_node=new_node)
    # cache is not actual
    assert not cache.get(pipeline.root_node)

    # fit the modified pipeline
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    # cache is actual now
    assert cache.get(pipeline.root_node)

    # change the node back
    pipeline.update_node(old_node=pipeline.root_node.nodes_from[0],
                         new_node=old_node)
    # cache is still actual without re-fitting,
    # because the cached model was saved after the first fit
    assert cache.get(pipeline.root_node)

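# NOTE: `data_setup` and the `pipeline_*` helpers are fixtures defined elsewhere in
# the test suite. Below is a minimal sketch of the pipeline shape these tests rely
# on (three levels, so that `root_node.nodes_from[0].nodes_from[0]` is resolvable);
# the helper name and the concrete operation types here are illustrative
# assumptions, not the exact fixtures:
#
#           xgboost             <- root (secondary) node
#          /       \
#      xgboost     knn          <- intermediate secondary nodes
#      /    \     /    \
#   logit   lda logit  lda      <- primary nodes, fitted directly on input data
#
# from fedot.core.pipelines.node import PrimaryNode, SecondaryNode
# from fedot.core.pipelines.pipeline import Pipeline
def pipeline_first_sketch():
    left = SecondaryNode(operation_type='xgboost',
                         nodes_from=[PrimaryNode(operation_type='logit'),
                                     PrimaryNode(operation_type='lda')])
    right = SecondaryNode(operation_type='knn',
                          nodes_from=[PrimaryNode(operation_type='logit'),
                                      PrimaryNode(operation_type='lda')])
    return Pipeline(SecondaryNode(operation_type='xgboost', nodes_from=[left, right]))
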
def test_cache_actuality_after_subtree_change_to_identical(data_setup):
    """The non-affected nodes have an actual cache after replacing a subtree
    with another pre-fitted subtree"""
    cache = OperationsCache()
    train, _ = data_setup
    pipeline = pipeline_first()
    other_pipeline = pipeline_second()
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    other_pipeline.fit(input_data=train)
    cache.save_pipeline(Pipeline(other_pipeline.root_node.nodes_from[0]))

    pipeline.update_subtree(pipeline.root_node.nodes_from[0],
                            other_pipeline.root_node.nodes_from[0])

    nodes_with_actual_cache = [node for node in pipeline.nodes
                               if node not in [pipeline.root_node]]

    # non-affected nodes of the initial pipeline and fitted nodes of the new subtree are actual
    assert all(cache.get(node) is not None for node in nodes_with_actual_cache)
    # the affected root node has no actual cache
    assert cache.get(pipeline.root_node) is None

def test_cache_actuality_after_primary_node_changed_to_subtree(data_setup):
    """The non-affected nodes have an actual cache after replacing a primary node
    with a pre-fitted subtree"""
    cache = OperationsCache()
    train, _ = data_setup
    pipeline = pipeline_first()
    other_pipeline = pipeline_second()
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    other_pipeline.fit(input_data=train)
    pipeline.update_subtree(pipeline.root_node.nodes_from[0].nodes_from[0],
                            other_pipeline.root_node.nodes_from[0])
    cache.save_pipeline(Pipeline(other_pipeline.root_node.nodes_from[0]))

    root_parent_first = pipeline.root_node.nodes_from[0]

    nodes_with_non_actual_cache = [pipeline.root_node, root_parent_first]
    nodes_with_actual_cache = [node for node in pipeline.nodes
                               if node not in nodes_with_non_actual_cache]

    # non-affected nodes of the initial pipeline and fitted nodes of the new subtree are actual
    assert all(cache.get(node) for node in nodes_with_actual_cache)
    # the affected nodes have no actual cache
    assert not any(cache.get(node) for node in nodes_with_non_actual_cache)

def test_cache_actuality_after_model_change(data_setup):
    """The non-affected nodes have an actual cache after changing the model in one node"""
    cache = OperationsCache()
    pipeline = pipeline_first()
    train, _ = data_setup
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    new_node = SecondaryNode(operation_type='logit')
    pipeline.update_node(old_node=pipeline.root_node.nodes_from[0],
                         new_node=new_node)

    root_parent_first = pipeline.root_node.nodes_from[0]

    nodes_with_non_actual_cache = [pipeline.root_node, root_parent_first]
    nodes_with_actual_cache = [node for node in pipeline.nodes
                               if node not in nodes_with_non_actual_cache]

    # non-affected nodes are actual
    assert all(cache.get(node) is not None for node in nodes_with_actual_cache)
    # the affected nodes have no actual cache
    assert all(cache.get(node) is None for node in nodes_with_non_actual_cache)

def test_multi_pipeline_caching_with_cache(data_setup):
    train, _ = data_setup
    cache = OperationsCache()

    main_pipeline = pipeline_second()
    other_pipeline = pipeline_first()

    # fit other_pipeline, which contains parts identical to main_pipeline
    other_pipeline.fit(input_data=train)
    cache.save_pipeline(other_pipeline)

    nodes_with_non_actual_cache = [main_pipeline.root_node,
                                   main_pipeline.root_node.nodes_from[0]] + \
                                  list(main_pipeline.root_node.nodes_from[0].nodes_from)
    nodes_with_actual_cache = [node for node in main_pipeline.nodes
                               if node not in nodes_with_non_actual_cache]

    # check that fitting other_pipeline makes the identical parts of main_pipeline fitted,
    # even though main_pipeline.fit() was never called
    assert all(cache.get(node) for node in nodes_with_actual_cache)
    # the non-identical parts are still not fitted
    assert not any(cache.get(node) for node in nodes_with_non_actual_cache)

    # check the same case with other pipelines
    cache = OperationsCache()
    main_pipeline = pipeline_fourth()

    prev_pipeline_first = pipeline_third()
    prev_pipeline_second = pipeline_fifth()

    prev_pipeline_first.fit(input_data=train)
    cache.save_pipeline(prev_pipeline_first)
    prev_pipeline_second.fit(input_data=train)
    cache.save_pipeline(prev_pipeline_second)

    nodes_with_non_actual_cache = [main_pipeline.root_node, main_pipeline.root_node.nodes_from[1]]
    nodes_with_actual_cache = list(main_pipeline.root_node.nodes_from[0].nodes_from)

    assert not any(cache.get(node) for node in nodes_with_non_actual_cache)
    assert all(cache.get(node) for node in nodes_with_actual_cache)

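# Why cache entries transfer between distinct Pipeline objects in the test above:
# the cache keys fitted operations by a structural node identifier (the
# `descriptive_id` also used in the composer's log messages below), which encodes
# a node's operation and its ancestry. Structurally identical subtrees therefore
# resolve to the same entry regardless of which pipeline they belong to. A toy
# sketch of that keying scheme, as an assumption about the behaviour; the real
# OperationsCache also handles folds, persistence and clearing:
class ToyOperationsCache:
    def __init__(self):
        self._store = {}

    def save_pipeline(self, pipeline):
        for node in pipeline.nodes:
            if node.fitted_operation is not None:
                self._store[node.descriptive_id] = node.fitted_operation

    def get(self, node):
        # a structurally identical node from another pipeline yields the same key
        return self._store.get(node.descriptive_id)
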
class GPComposer(Composer):
    """
    Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of the found solution
    :param composer_requirements: requirements for the composition process
    :param initial_pipeline: defines the initial state of the population.
        If None, the initial population is random.
    """

    def __init__(self, optimiser=None,
                 composer_requirements: Optional[GPComposerRequirements] = None,
                 metrics: Union[List[MetricsEnum], MetricsEnum] = None,
                 initial_pipeline: Optional[Pipeline] = None,
                 logger: Log = None):

        super().__init__(metrics=metrics, composer_requirements=composer_requirements,
                         initial_pipeline=initial_pipeline)

        self.cache = OperationsCache()

        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False

        if not logger:
            self.log = default_log(__name__)
        else:
            self.log = logger

    def compose_pipeline(self, data: Union[InputData, MultiModalData],
                         is_visualise: bool = False, is_tune: bool = False,
                         on_next_iteration_callback: Optional[Callable] = None) -> Union[Pipeline, List[Pipeline]]:
        """ Function for searching the optimal pipeline structure

        :param data: InputData for pipeline composing
        :param is_visualise: is it needed to visualise
        :param is_tune: is it needed to tune the pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: TODO add description
        :return best_pipeline: the composing result: a single pipeline for single-objective
            optimization; for the multi-objective case, a list of pipelines is returned,
            ordered by descending primary metric (the first is the best)
        """

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if not self.optimiser:
            raise AttributeError('Optimiser for graph composition is not defined')

        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(data)
        else:
            self.log.info('Hold-out validation for graph composing was applied.')
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric, self.metrics,
                                                      train_data, test_data)

        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(self.cache_path,
                                         clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(objective_function_for_pipeline,
                                                on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()

        if is_tune:
            self.tune_pipeline(best_pipeline, data, self.composer_requirements.timeout)
        return best_pipeline

    def _cv_validation_metric_build(self, data):
        """ Prepare a metric evaluation function based on the task """
        if isinstance(data, MultiModalData):
            raise NotImplementedError('Cross-validation is not supported for multi-modal data')
        task_type = data.task.task_type
        if task_type is TaskTypesEnum.ts_forecasting:
            # Perform time series cross-validation
            self.log.info('Time series cross validation for pipeline composing was applied.')
            if self.composer_requirements.validation_blocks is None:
                self.log.info('For ts cross validation, validation_blocks number was changed from None to 3')
                self.composer_requirements.validation_blocks = 3
            metric_function_for_nodes = partial(ts_metric_calculation, data,
                                                self.composer_requirements.cv_folds,
                                                self.composer_requirements.validation_blocks,
                                                self.metrics, log=self.log)
        else:
            self.log.info('KFolds cross validation for pipeline composing was applied.')
            metric_function_for_nodes = partial(table_metric_calculation, data,
                                                self.composer_requirements.cv_folds,
                                                self.metrics, log=self.log)

        return metric_function_for_nodes

    def composer_metric(self, metrics,
                        train_data: Union[InputData, MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any]]:
        try:
            validate(pipeline)
            pipeline.log = self.log

            if type(metrics) is not list:
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(f'Pipeline {pipeline.root_node.descriptive_id} fit started')
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.max_pipeline_fit_time)
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(pipeline, reference_data=test_data),)

            self.log.debug(f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}')

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_pipeline(pipeline: Pipeline, data: InputData, time_limit):
        raise NotImplementedError()

    @property
    def history(self):
        return self.optimiser.history
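
# A typical call sequence for the composer, sketched under the assumption that it
# is constructed via the GPComposerBuilder mentioned in the class docstring; the
# exact builder method names and the metric choice are illustrative:
#
#     builder = GPComposerBuilder(task=train_data.task) \
#         .with_requirements(composer_requirements) \
#         .with_metrics(ClassificationMetricsEnum.ROCAUC)
#     composer = builder.build()
#     best_pipeline = composer.compose_pipeline(data=train_data)
#     best_pipeline.fit(input_data=train_data)
#
# Note that compose_pipeline clears the cache on completion, so the returned
# pipeline must be re-fitted before use.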