def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
) -> Union[Pipeline, List[Pipeline]]:
    """ Function for optimal pipeline structure searching

    :param data: InputData (or MultiModalData) for pipeline composing
    :param is_visualise: is it needed to visualise (not read by this method)
    :param is_tune: is it needed to tune pipeline after composing
        TODO integrate new tuner
    :param on_next_iteration_callback: passed unchanged to ``self.optimiser.optimise``
        TODO add description
    :return best_pipeline: obtained result after composing: one pipeline for
        single-objective optimization; for the multi-objective case, the list
        of the graph is returned. In the list, the pipelines are ordered by
        the descending of primary metric (the first is the best)
    :raises AttributeError: if the composer has no optimiser
    """
    # The optimiser must be validated before any of its attributes is touched;
    # otherwise a missing optimiser surfaces as an opaque
    # "'NoneType' object has no attribute ..." error and this message is unreachable.
    if not self.optimiser:
        raise AttributeError('Optimiser for graph composition is not defined')

    self.optimiser.graph_generation_params.advisor.task = data.task

    if self.composer_requirements.max_pipeline_fit_time:
        set_multiprocess_start_method()

    if self.composer_requirements.cv_folds is not None:
        objective_function_for_pipeline = self._cv_validation_metric_build(data)
    else:
        self.log.info("Hold out validation for graph composing was applied.")
        split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
        train_data, test_data = train_test_data_setup(data, split_ratio)
        objective_function_for_pipeline = partial(self.composer_metric, self.metrics,
                                                  train_data, test_data)

    if self.cache_path is None:
        self.cache.clear()
    else:
        # drop only the temporary part of the current cache, then re-open it at cache_path
        self.cache.clear(tmp_only=True)
        self.cache = OperationsCache(self.cache_path,
                                     clear_exiting=not self.use_existing_cache)

    best_pipeline = self.optimiser.optimise(
        objective_function_for_pipeline,
        on_next_iteration_callback=on_next_iteration_callback)

    self.log.info('GP composition finished')

    self.cache.clear()
    if is_tune:
        self.tune_pipeline(best_pipeline, data, self.composer_requirements.timeout)
    return best_pipeline
def test_cache_historical_state_using(data_setup):
    """A state cached after the first fit is found again once the replaced node is restored"""
    cache = OperationsCache()
    train_part, _ = data_setup
    chain = chain_first()

    def fit_and_store():
        # fit the chain and push its fitted models into the cache
        chain.fit(input_data=train_part)
        cache.save_chain(chain)

    fit_and_store()

    replacement = SecondaryNode(operation_type='logit')
    original_child = chain.root_node.nodes_from[0]

    # swap the first child of the root for the new node
    chain.update_node(old_node=original_child, new_node=replacement)
    # the root is expected to have no actual cache entry now
    assert not cache.get(chain.root_node)

    # after re-fitting and saving the modified chain the root is expected in the cache
    fit_and_store()
    assert cache.get(chain.root_node)

    # put the original child back
    current_child = chain.root_node.nodes_from[0]
    chain.update_node(old_node=current_child, new_node=original_child)
    # no new fitting is done here: the entry saved after the first fit is expected to be found
    assert cache.get(chain.root_node)
def fit_from_cache(self, cache: OperationsCache):
    """ Assign fitted operations to the nodes from the cache

    For every node in ``self.nodes`` the cache is queried with ``cache.get(node)``.
    A truthy result provides ``fitted_operation`` through its ``operation``
    attribute; otherwise ``fitted_operation`` is reset to None.

    :param cache: cache object queried for each node
    """
    for current_node in self.nodes:
        state = cache.get(current_node)
        current_node.fitted_operation = state.operation if state else None
def compose_chain(
        self,
        data: InputData,
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
) -> Union[Chain, List[Chain]]:
    """ Search for the optimal chain structure

    :param data: InputData for chain composing
    :param is_visualise: is it needed to visualise (not read by this method)
    :param is_tune: is it needed to tune chain after composing
        TODO integrate new tuner
    :param on_next_iteration_callback: passed unchanged to ``self.optimiser.optimise``
        TODO add description
    :return best_chain: obtained result after composing: one chain for
        single-objective optimization; for the multi-objective case, the list
        of the chain is returned. In the list, the chains are ordered by the
        descending of primary metric (the first is the best)
    """
    requirements = self.composer_requirements
    if requirements.max_chain_fit_time:
        set_multiprocess_start_method()

    if not self.optimiser:
        raise AttributeError(f'Optimiser for chain composition is not defined')

    # hold-out split with the ratio registered for the task type
    train_part, test_part = train_test_data_setup(
        data, sample_split_ration_for_tasks[data.task.task_type])

    if self.cache_path is not None:
        self.cache = OperationsCache(self.cache_path,
                                     clear_exiting=not self.use_existing_cache)
    else:
        self.cache.clear()

    objective = partial(self.composer_metric, self.metrics, train_part, test_part)
    found = self.optimiser.optimise(
        objective, on_next_iteration_callback=on_next_iteration_callback)

    self.log.info('GP composition finished')

    if is_tune:
        self.tune_chain(found, data, self.composer_requirements.max_lead_time)
    return found
def test_cache_actuality_after_model_change(data_setup):
    """Nodes not affected by a model replacement keep an actual cache, the affected ones do not"""
    cache = OperationsCache()
    chain = chain_first()
    train_part, _ = data_setup

    chain.fit(input_data=train_part)
    cache.save_chain(chain)

    replacement = SecondaryNode(operation_type='logit')
    chain.update_node(old_node=chain.root_node.nodes_from[0],
                      new_node=replacement)

    # the root and its (new) first parent are the nodes touched by the replacement
    first_parent = chain.root_node.nodes_from[0]
    affected = [chain.root_node, first_parent]
    untouched = [node for node in chain.nodes if node not in affected]

    # non-affected nodes are actual
    for node in untouched:
        assert cache.get(node) is not None

    # affected nodes have no actual cache
    for node in affected:
        assert cache.get(node) is None
def __init__(self,
             optimiser=None,
             composer_requirements: Optional[GPComposerRequirements] = None,
             metrics: Union[List[MetricsEnum], MetricsEnum] = None,
             initial_chain: Optional[Chain] = None,
             logger: Log = None):
    """ Set up the composer state

    :param optimiser: optimiser stored in ``self.optimiser``
    :param composer_requirements: forwarded to the parent constructor
    :param metrics: forwarded to the parent constructor
    :param initial_chain: forwarded to the parent constructor
    :param logger: log object; when falsy, ``default_log(__name__)`` is used
    """
    super().__init__(metrics=metrics,
                     composer_requirements=composer_requirements,
                     initial_chain=initial_chain)

    # a fresh cache with no path configured and no reuse of an existing cache
    self.cache = OperationsCache()
    self.optimiser = optimiser
    self.cache_path = None
    self.use_existing_cache = False

    self.log = logger if logger else default_log(__name__)
def test_cache_actuality_after_subtree_change_to_identical(data_setup):
    """Everything except the root keeps an actual cache after a subtree
    is replaced by another pre-fitted (and cached) subtree"""
    cache = OperationsCache()
    train_part, _ = data_setup
    pipeline = pipeline_first()
    donor = pipeline_second()

    pipeline.fit(input_data=train_part)
    cache.save_pipeline(pipeline)

    # fit the donor and cache only the subtree that is going to be transplanted
    donor.fit(input_data=train_part)
    cache.save_pipeline(Pipeline(donor.root_node.nodes_from[0]))

    pipeline.update_subtree(pipeline.root_node.nodes_from[0],
                            donor.root_node.nodes_from[0])

    # non-affected nodes of initial pipeline and fitted nodes of new subtree are actual
    expected_cached = [node for node in pipeline.nodes
                       if node not in [pipeline.root_node]]
    for node in expected_cached:
        assert cache.get(node) is not None

    # affected root node has no actual cache
    assert cache.get(pipeline.root_node) is None
def test_cache_actuality_after_primary_node_changed_to_subtree(data_setup):
    """Non-affected nodes keep an actual cache after a primary node
    is replaced by a pre-fitted subtree"""
    cache = OperationsCache()
    train_part, _ = data_setup
    pipeline = pipeline_first()
    donor = pipeline_second()

    pipeline.fit(input_data=train_part)
    cache.save_pipeline(pipeline)
    donor.fit(input_data=train_part)

    # replace a second-level node of the pipeline with the donor's subtree,
    # then cache that subtree
    replaced_node = pipeline.root_node.nodes_from[0].nodes_from[0]
    pipeline.update_subtree(replaced_node, donor.root_node.nodes_from[0])
    cache.save_pipeline(Pipeline(donor.root_node.nodes_from[0]))

    first_parent = pipeline.root_node.nodes_from[0]
    affected = [pipeline.root_node, first_parent]
    untouched = [node for node in pipeline.nodes if node not in affected]

    # non-affected nodes of initial pipeline and fitted nodes of new subtree are actual
    for node in untouched:
        assert cache.get(node)

    # affected root node and its first parent have no actual cache
    for node in affected:
        assert not cache.get(node)
class GPComposer(Composer):
    """
    Composer that searches for a chain with genetic programming

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of found solution.
    :param composer_requirements: requirements for composition process
    :param initial_chain: defines the initial state of the population.
        If None then initial population is random.
    :param logger: log object; when falsy, ``default_log(__name__)`` is used
    """

    def __init__(self,
                 optimiser=None,
                 composer_requirements: Optional[GPComposerRequirements] = None,
                 metrics: Union[List[MetricsEnum], MetricsEnum] = None,
                 initial_chain: Optional[Chain] = None,
                 logger: Log = None):
        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_chain=initial_chain)

        # a fresh cache with no path configured and no reuse of an existing cache
        self.cache = OperationsCache()
        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False

        self.log = logger if logger else default_log(__name__)

    def compose_chain(
            self,
            data: InputData,
            is_visualise: bool = False,
            is_tune: bool = False,
            on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Chain, List[Chain]]:
        """ Search for the optimal chain structure

        :param data: InputData for chain composing
        :param is_visualise: is it needed to visualise (not read by this method)
        :param is_tune: is it needed to tune chain after composing
            TODO integrate new tuner
        :param on_next_iteration_callback: passed unchanged to the optimiser
            TODO add description
        :return best_chain: obtained result after composing: one chain for
            single-objective optimization; for the multi-objective case, the
            list of the chain is returned. In the list, the chains are ordered
            by the descending of primary metric (the first is the best)
        """
        requirements = self.composer_requirements
        if requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        if not self.optimiser:
            raise AttributeError(f'Optimiser for chain composition is not defined')

        # hold-out split with the ratio registered for the task type
        train_part, test_part = train_test_data_setup(
            data, sample_split_ration_for_tasks[data.task.task_type])

        if self.cache_path is not None:
            self.cache = OperationsCache(self.cache_path,
                                         clear_exiting=not self.use_existing_cache)
        else:
            self.cache.clear()

        objective = partial(self.composer_metric, self.metrics, train_part, test_part)
        found = self.optimiser.optimise(
            objective, on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            self.tune_chain(found, data, self.composer_requirements.max_lead_time)
        return found

    def composer_metric(self, metrics,
                        train_data: InputData,
                        test_data: InputData,
                        chain: Chain) -> Optional[Tuple[Any]]:
        """ Evaluate the chain with the given metrics

        The chain is validated, taken from the cache where possible, fitted on
        ``train_data`` when it is still not fitted, and scored on ``test_data``.

        :param metrics: a single metric or a list of metrics; each is either
            a callable or an id resolved with ``MetricsRepository().metric_by_id``
        :param train_data: data used for fitting
        :param test_data: data passed to the metrics as ``reference_data``
        :param chain: chain to evaluate
        :return: tuple with one value per metric, or None if any exception
            was raised during the assessment
        """
        try:
            validate(chain)
            chain.log = self.log

            metric_list = metrics if type(metrics) is list else [metrics]

            if self.cache is not None:
                # TODO improve cache
                chain.fit_from_cache(self.cache)

            if not chain.is_fitted:
                self.log.debug(f'Chain {chain.root_node.descriptive_id} fit started')
                fit_time_limit = self.composer_requirements.max_chain_fit_time
                chain.fit(input_data=train_data, time_constraint=fit_time_limit)
                self.cache.save_chain(chain)

            collected = []
            for metric in metric_list:
                metric_func = metric if callable(metric) \
                    else MetricsRepository().metric_by_id(metric)
                collected.append(metric_func(chain, reference_data=test_data))
            evaluated_metrics = tuple(collected)

            self.log.debug(f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}')
            return evaluated_metrics
        except Exception as ex:
            # any failure is logged and reported as "no result"
            self.log.info(f'Chain assessment warning: {ex}. Continue.')
            return None

    @staticmethod
    def tune_chain(chain: Chain, data: InputData, time_limit):
        """ Tuning is not implemented: always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser """
        return self.optimiser.history
class GPComposer(Composer):
    """
    Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of found solution.
    :param composer_requirements: requirements for composition process
    :param initial_pipeline: defines the initial state of the population.
        If None then initial population is random.
    :param logger: log object; when falsy, ``default_log(__name__)`` is used
    """

    def __init__(self,
                 optimiser=None,
                 composer_requirements: Optional[GPComposerRequirements] = None,
                 metrics: Union[List[MetricsEnum], MetricsEnum] = None,
                 initial_pipeline: Optional[Pipeline] = None,
                 logger: Log = None):
        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_pipeline=initial_pipeline)

        self.cache = OperationsCache()

        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False

        if not logger:
            self.log = default_log(__name__)
        else:
            self.log = logger

    def compose_pipeline(
            self,
            data: Union[InputData, MultiModalData],
            is_visualise: bool = False,
            is_tune: bool = False,
            on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching

        :param data: InputData (or MultiModalData) for pipeline composing
        :param is_visualise: is it needed to visualise (not read by this method)
        :param is_tune: is it needed to tune pipeline after composing
            TODO integrate new tuner
        :param on_next_iteration_callback: passed unchanged to the optimiser
            TODO add description
        :return best_pipeline: obtained result after composing: one pipeline
            for single-objective optimization; for the multi-objective case,
            the list of the graph is returned. In the list, the pipelines are
            ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if the composer has no optimiser
        """
        # The optimiser must be validated before any of its attributes is touched;
        # otherwise a missing optimiser surfaces as an opaque
        # "'NoneType' object has no attribute ..." error and this message is unreachable.
        if not self.optimiser:
            raise AttributeError('Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(data)
        else:
            self.log.info("Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric, self.metrics,
                                                      train_data, test_data)

        if self.cache_path is None:
            self.cache.clear()
        else:
            # drop only the temporary part of the current cache, then re-open it at cache_path
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(self.cache_path,
                                         clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        self.cache.clear()
        if is_tune:
            self.tune_pipeline(best_pipeline, data, self.composer_requirements.timeout)
        return best_pipeline

    def _cv_validation_metric_build(self, data):
        """ Prepare function for metric evaluation based on task

        :param data: data for composing; MultiModalData is rejected
        :return: partial of ``ts_metric_calculation`` for time series forecasting
            tasks, partial of ``table_metric_calculation`` otherwise
        :raises NotImplementedError: if ``data`` is MultiModalData
        """
        if isinstance(data, MultiModalData):
            raise NotImplementedError('Cross-validation is not supported for multi-modal data')

        task_type = data.task.task_type
        if task_type is TaskTypesEnum.ts_forecasting:
            # Perform time series cross validation
            self.log.info("Time series cross validation for pipeline composing was applied.")
            if self.composer_requirements.validation_blocks is None:
                # the requirements object itself is updated with the fallback value
                self.log.info('For ts cross validation validation_blocks number was changed from None to 3 blocks')
                self.composer_requirements.validation_blocks = 3
            metric_function_for_nodes = partial(ts_metric_calculation, data,
                                                self.composer_requirements.cv_folds,
                                                self.composer_requirements.validation_blocks,
                                                self.metrics,
                                                log=self.log)
        else:
            self.log.info("KFolds cross validation for pipeline composing was applied.")
            metric_function_for_nodes = partial(table_metric_calculation, data,
                                                self.composer_requirements.cv_folds,
                                                self.metrics,
                                                log=self.log)
        return metric_function_for_nodes

    def composer_metric(self, metrics,
                        train_data: Union[InputData, MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any]]:
        """ Evaluate the pipeline with the given metrics

        The pipeline is validated, taken from the cache where possible, fitted
        on ``train_data`` when it is still not fitted, scored on ``test_data``
        and finally unfitted to release memory.

        :param metrics: a single metric or a list of metrics; each is either
            a callable or an id resolved with ``MetricsRepository().metric_by_id``
        :param train_data: data used for fitting
        :param test_data: data passed to the metrics as ``reference_data``
        :param pipeline: pipeline to evaluate
        :return: tuple with one value per metric, or None if any exception
            was raised during the assessment
        """
        try:
            validate(pipeline)
            pipeline.log = self.log

            # isinstance (rather than an exact type comparison) keeps list
            # subclasses from being wrapped into a one-element list
            if not isinstance(metrics, list):
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(f'Pipeline {pipeline.root_node.descriptive_id} fit started')
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.max_pipeline_fit_time)
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    # caching is best-effort: a failed save must not discard the fitted pipeline
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(pipeline, reference_data=test_data),)

            self.log.debug(f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}')

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            # any failure is logged and reported as "no result"
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None
        return evaluated_metrics

    @staticmethod
    def tune_pipeline(pipeline: Pipeline, data: InputData, time_limit):
        """ Tuning is not implemented: always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser """
        return self.optimiser.history
def test_multi_chain_caching_with_cache(data_setup):
    """Parts of a never-fitted chain that are identical to parts of cached
    chains are found in the cache, the remaining parts are not"""
    train_part, _ = data_setup

    cache = OperationsCache()
    main_chain = chain_second()
    other_chain = chain_first()

    # fit other_chain that contains the parts identical to main_chain
    other_chain.fit(input_data=train_part)
    cache.save_chain(other_chain)

    root = main_chain.root_node
    first_parent = root.nodes_from[0]
    missing_in_cache = [root, first_parent] + list(first_parent.nodes_from)
    present_in_cache = [node for node in main_chain.nodes
                        if node not in missing_in_cache]

    # check that using of other_chain makes the identical parts of main_chain fitted,
    # despite main_chain.fit() was not called
    for node in present_in_cache:
        assert cache.get(node)

    # the non-identical parts are still not fitted
    for node in missing_in_cache:
        assert not cache.get(node)

    # check the same case with other chains and a fresh cache
    cache = OperationsCache()
    main_chain = chain_fourth()
    prev_chain_first = chain_third()
    prev_chain_second = chain_fifth()

    for fitted_before in (prev_chain_first, prev_chain_second):
        fitted_before.fit(input_data=train_part)
        cache.save_chain(fitted_before)

    root = main_chain.root_node
    missing_in_cache = [root, root.nodes_from[1]]
    present_in_cache = list(root.nodes_from[0].nodes_from)

    for node in missing_in_cache:
        assert not cache.get(node)
    for node in present_in_cache:
        assert cache.get(node)