Exemple #1
0
    def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching

        :param data: InputData (or MultiModalData) for pipeline composing
        :param is_visualise: is it needed to visualise (currently not read by this method)
        :param is_tune: is it needed to tune pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable passed through unchanged to ``self.optimiser.optimise``
        :return best_pipeline: obtained result after composing: one pipeline for single-objective optimization;
            For the multi-objective case, the list of the graph is returned.
            In the list, the pipelines are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if the optimiser is not defined
        """

        # Validate the optimiser before touching any of its attributes;
        # otherwise a missing optimiser fails with an unrelated 'NoneType'
        # attribute error instead of this explicit message.
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if self.composer_requirements.cv_folds is not None:
            # cross-validation: the objective is built from the whole dataset
            objective_function_for_pipeline = self._cv_validation_metric_build(
                data)
        else:
            # hold-out: split once and bind both parts to the metric function
            self.log.info(
                "Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric,
                                                      self.metrics, train_data,
                                                      test_data)

        if self.cache_path is None:
            # no explicit cache location: start from an emptied current cache
            self.cache.clear()
        else:
            # an explicit location is set: clear the current cache with
            # tmp_only=True and switch to a cache bound to cache_path
            # (presumably kept as-is when use_existing_cache is set - TODO confirm)
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()
        if is_tune:
            self.tune_pipeline(best_pipeline, data,
                               self.composer_requirements.timeout)
        return best_pipeline
Exemple #2
0
def test_cache_historical_state_using(data_setup):
    """States saved by earlier fits stay retrievable after a node is swapped out and back in."""
    train, _ = data_setup
    cache = OperationsCache()
    chain = chain_first()

    # first fit: the fitted models are stored in the cache
    chain.fit(input_data=train)
    cache.save_chain(chain)

    original_child = chain.root_node.nodes_from[0]
    replacement = SecondaryNode(operation_type='logit')

    # swap the child of the root for the new node
    chain.update_node(old_node=original_child, new_node=replacement)
    # the root is no longer found in the cache
    root_state = cache.get(chain.root_node)
    assert not root_state

    # refit the modified chain and store it as well
    chain.fit(input_data=train)
    cache.save_chain(chain)
    # now the root is found again
    root_state = cache.get(chain.root_node)
    assert root_state

    # put the original child back in place
    current_child = chain.root_node.nodes_from[0]
    chain.update_node(old_node=current_child, new_node=original_child)
    # no refit is needed here: the state saved after the very first fit
    # is still available in the cache
    root_state = cache.get(chain.root_node)
    assert root_state
Exemple #3
0
 def fit_from_cache(self, cache: OperationsCache):
     """Take fitted operations for the nodes from the cache.

     For every node in ``self.nodes`` the cache is queried with the node;
     a truthy result supplies ``fitted_operation`` (its ``operation``
     attribute), anything else resets ``fitted_operation`` to None.

     :param cache: object queried via ``cache.get(node)``
     """
     for current in self.nodes:
         state = cache.get(current)
         current.fitted_operation = state.operation if state else None
Exemple #4
0
    def compose_chain(
        self,
        data: InputData,
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Chain, List[Chain]]:
        """ Search for the optimal chain structure

        :param data: InputData for chain composing
        :param is_visualise: is it needed to visualise (not read by this method)
        :param is_tune: is it needed to tune chain after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable handed over unchanged to the optimiser

        :return best_chain: result of the optimiser run: one chain for single-objective optimization;
            For the multi-objective case, the list of the chain is returned.
            In the list, the chains are ordered by the descending of primary metric (the first is the best)
        """

        if self.composer_requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        if not self.optimiser:
            raise AttributeError(
                'Optimiser for chain composition is not defined')

        # hold-out split with the ratio registered for the task type
        split_ratio = sample_split_ration_for_tasks[data.task.task_type]
        train_data, test_data = train_test_data_setup(data, split_ratio)

        if self.cache_path is not None:
            # an explicit location is set: switch to a cache bound to it
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)
        else:
            self.cache.clear()

        # the optimiser only has to supply the chain to be assessed
        metric_function_for_nodes = partial(
            self.composer_metric, self.metrics, train_data, test_data)

        best_chain = self.optimiser.optimise(
            metric_function_for_nodes,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            time_limit = self.composer_requirements.max_lead_time
            self.tune_chain(best_chain, data, time_limit)
        return best_chain
Exemple #5
0
def test_cache_actuality_after_model_change(data_setup):
    """Nodes untouched by a model replacement keep their cached state; the replaced node and the root do not."""
    train, _ = data_setup
    cache = OperationsCache()
    chain = chain_first()

    chain.fit(input_data=train)
    cache.save_chain(chain)

    # replace the first parent of the root with a new, unfitted node
    replacement = SecondaryNode(operation_type='logit')
    replaced = chain.root_node.nodes_from[0]
    chain.update_node(old_node=replaced, new_node=replacement)

    # the root and its (new) first parent are the nodes expected to miss
    nodes_with_non_actual_cache = [
        chain.root_node,
        chain.root_node.nodes_from[0],
    ]
    nodes_with_actual_cache = []
    for node in chain.nodes:
        if node not in nodes_with_non_actual_cache:
            nodes_with_actual_cache.append(node)

    # every non-affected node is still served by the cache
    found = [cache.get(node) is not None for node in nodes_with_actual_cache]
    assert all(found)
    # the affected nodes have no state in the cache
    missing = [cache.get(node) is None for node in nodes_with_non_actual_cache]
    assert all(missing)
Exemple #6
0
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_chain: Optional[Chain] = None,
            logger: Log = None):
        """ Set up the composer state

        :param optimiser: stored as ``self.optimiser``
        :param composer_requirements: forwarded to the parent constructor
        :param metrics: forwarded to the parent constructor
        :param initial_chain: forwarded to the parent constructor
        :param logger: log to use; a default log named after this module is created when it is falsy
        """
        super().__init__(initial_chain=initial_chain,
                         composer_requirements=composer_requirements,
                         metrics=metrics)

        # a fresh cache; no cache path is configured at construction time
        self.cache = OperationsCache()

        self.optimiser = optimiser
        self.use_existing_cache = False
        self.cache_path = None

        self.log = logger if logger else default_log(__name__)
Exemple #7
0
def test_cache_actuality_after_subtree_change_to_identical(data_setup):
    """After a subtree is replaced by a pre-fitted, cached subtree only the root misses the cache."""
    train, _ = data_setup
    cache = OperationsCache()

    pipeline = pipeline_first()
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)

    # fit the donor pipeline and cache only the subtree that will be transplanted
    other_pipeline = pipeline_second()
    other_pipeline.fit(input_data=train)
    donor_subtree = other_pipeline.root_node.nodes_from[0]
    cache.save_pipeline(Pipeline(donor_subtree))

    pipeline.update_subtree(pipeline.root_node.nodes_from[0],
                            other_pipeline.root_node.nodes_from[0])

    root = pipeline.root_node
    nodes_with_actual_cache = [
        node for node in pipeline.nodes if node not in [root]
    ]

    # untouched nodes and the nodes of the transplanted subtree are found
    found = [cache.get(node) is not None for node in nodes_with_actual_cache]
    assert all(found)
    # the root sits above the change, so it has no cached state
    assert cache.get(root) is None
Exemple #8
0
def test_cache_actuality_after_primary_node_changed_to_subtree(data_setup):
    """After a primary node is replaced by a pre-fitted subtree, only the nodes above the change miss the cache."""
    train, _ = data_setup
    cache = OperationsCache()

    pipeline = pipeline_first()
    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)

    other_pipeline = pipeline_second()
    other_pipeline.fit(input_data=train)

    # transplant the fitted subtree in place of a second-level node,
    # then store that subtree in the cache
    target = pipeline.root_node.nodes_from[0].nodes_from[0]
    pipeline.update_subtree(target, other_pipeline.root_node.nodes_from[0])
    cache.save_pipeline(Pipeline(other_pipeline.root_node.nodes_from[0]))

    # the root and its first parent are the nodes expected to miss
    nodes_with_non_actual_cache = [
        pipeline.root_node,
        pipeline.root_node.nodes_from[0],
    ]
    nodes_with_actual_cache = []
    for node in pipeline.nodes:
        if node not in nodes_with_non_actual_cache:
            nodes_with_actual_cache.append(node)

    # untouched nodes and the nodes of the transplanted subtree are found
    found_states = [cache.get(node) for node in nodes_with_actual_cache]
    assert all(found_states)
    # the nodes above the change have no cached state
    stale_states = [cache.get(node) for node in nodes_with_non_actual_cache]
    assert not any(stale_states)
Exemple #9
0
class GPComposer(Composer):
    """
    Genetic programming based composer
    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of found solution.
    :param composer_requirements: requirements for composition process
    :param initial_chain: defines the initial state of the population. If None then initial population is random.
    :param logger: log to use; a default log named after this module is created when it is not given
    """
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_chain: Optional[Chain] = None,
            logger: Log = None):

        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_chain=initial_chain)

        # cache of fitted operations shared by all chain assessments
        self.cache = OperationsCache()

        self.optimiser = optimiser
        # no cache location and no cache reuse are configured by default
        self.cache_path = None
        self.use_existing_cache = False

        if not logger:
            self.log = default_log(__name__)
        else:
            self.log = logger

    def compose_chain(
        self,
        data: InputData,
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Chain, List[Chain]]:
        """ Function for optimal chain structure searching

        :param data: InputData for chain composing
        :param is_visualise: is it needed to visualise (currently not read by this method)
        :param is_tune: is it needed to tune chain after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable passed through unchanged to ``self.optimiser.optimise``

        :return best_chain: obtained result after composing: one chain for single-objective optimization;
            For the multi-objective case, the list of the chain is returned.
            In the list, the chains are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if the optimiser is not defined
        """

        if self.composer_requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        if not self.optimiser:
            raise AttributeError(
                'Optimiser for chain composition is not defined')

        # hold-out split with the ratio registered for the task type
        train_data, test_data = train_test_data_setup(
            data, sample_split_ration_for_tasks[data.task.task_type])
        if self.cache_path is None:
            self.cache.clear()
        else:
            # an explicit location is set: switch to a cache bound to it
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        # the optimiser only has to supply the chain to be assessed
        metric_function_for_nodes = partial(self.composer_metric, self.metrics,
                                            train_data, test_data)

        best_chain = self.optimiser.optimise(
            metric_function_for_nodes,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            self.tune_chain(best_chain, data,
                            self.composer_requirements.max_lead_time)
        return best_chain

    def composer_metric(self, metrics, train_data: InputData,
                        test_data: InputData,
                        chain: Chain) -> Optional[Tuple[Any]]:
        """ Fit the chain (reusing cached operations when available) and evaluate the metrics

        :param metrics: single metric or list of metrics; each one is either a callable
            or an id resolved through MetricsRepository
        :param train_data: data the chain is fitted on
        :param test_data: data passed to every metric as ``reference_data``
        :param chain: chain to assess
        :return: tuple with one value per metric, or None when validation, fitting
            or metric evaluation raised an exception
        """
        try:
            validate(chain)
            chain.log = self.log

            if not isinstance(metrics, list):
                metrics = [metrics]

            cache_enabled = self.cache is not None
            if cache_enabled:
                # TODO improve cache
                chain.fit_from_cache(self.cache)

            if not chain.is_fitted:
                self.log.debug(
                    f'Chain {chain.root_node.descriptive_id} fit started')
                chain.fit(input_data=train_data,
                          time_constraint=self.composer_requirements.
                          max_chain_fit_time)
                # The cache is optional when reading, so it must be optional
                # when writing too; otherwise a missing cache raises here and
                # the chain is reported as failed by the handler below.
                if cache_enabled:
                    self.cache.save_chain(chain)

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics += (metric_func(
                    chain, reference_data=test_data), )

            self.log.debug(
                f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

        except Exception as ex:
            # best-effort assessment: a failing chain must not stop the
            # composition, so the error is logged and None is returned
            self.log.info(f'Chain assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_chain(chain: Chain, data: InputData, time_limit):
        """ Placeholder for chain tuning; always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser """
        return self.optimiser.history
Exemple #10
0
class GPComposer(Composer):
    """
    Genetic programming based composer
    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of found solution.
    :param composer_requirements: requirements for composition process
    :param initial_pipeline: defines the initial state of the population. If None then initial population is random.
    :param logger: log to use; a default log named after this module is created when it is not given
    """
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_pipeline: Optional[Pipeline] = None,
            logger: Log = None):

        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_pipeline=initial_pipeline)

        # cache of fitted operations shared by all pipeline assessments
        self.cache = OperationsCache()

        self.optimiser = optimiser
        # no cache location and no cache reuse are configured by default
        self.cache_path = None
        self.use_existing_cache = False

        if not logger:
            self.log = default_log(__name__)
        else:
            self.log = logger

    def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching

        :param data: InputData (or MultiModalData) for pipeline composing
        :param is_visualise: is it needed to visualise (currently not read by this method)
        :param is_tune: is it needed to tune pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable passed through unchanged to ``self.optimiser.optimise``
        :return best_pipeline: obtained result after composing: one pipeline for single-objective optimization;
            For the multi-objective case, the list of the graph is returned.
            In the list, the pipelines are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if the optimiser is not defined
        """

        # Validate the optimiser before touching any of its attributes;
        # otherwise a missing optimiser fails with an unrelated 'NoneType'
        # attribute error instead of this explicit message.
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if self.composer_requirements.cv_folds is not None:
            # cross-validation: the objective is built from the whole dataset
            objective_function_for_pipeline = self._cv_validation_metric_build(
                data)
        else:
            # hold-out: split once and bind both parts to the metric function
            self.log.info(
                "Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric,
                                                      self.metrics, train_data,
                                                      test_data)

        if self.cache_path is None:
            self.cache.clear()
        else:
            # an explicit location is set: clear the current cache with
            # tmp_only=True and switch to a cache bound to cache_path
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()
        if is_tune:
            self.tune_pipeline(best_pipeline, data,
                               self.composer_requirements.timeout)
        return best_pipeline

    def _cv_validation_metric_build(self, data):
        """ Prepare function for metric evaluation based on task

        :param data: dataset bound into the returned function
        :return: partial of ``ts_metric_calculation`` for time series forecasting tasks,
            partial of ``table_metric_calculation`` for every other task type
        :raises NotImplementedError: if data is MultiModalData
        """
        if isinstance(data, MultiModalData):
            raise NotImplementedError(
                'Cross-validation is not supported for multi-modal data')
        task_type = data.task.task_type
        if task_type is TaskTypesEnum.ts_forecasting:
            # Perform time series cross validation
            self.log.info(
                "Time series cross validation for pipeline composing was applied."
            )
            if self.composer_requirements.validation_blocks is None:
                # the default is written back into the shared requirements object
                self.log.info(
                    'For ts cross validation validation_blocks number was changed from None to 3 blocks'
                )
                self.composer_requirements.validation_blocks = 3
            metric_function_for_nodes = partial(
                ts_metric_calculation,
                data,
                self.composer_requirements.cv_folds,
                self.composer_requirements.validation_blocks,
                self.metrics,
                log=self.log)
        else:
            self.log.info(
                "KFolds cross validation for pipeline composing was applied.")
            metric_function_for_nodes = partial(
                table_metric_calculation,
                data,
                self.composer_requirements.cv_folds,
                self.metrics,
                log=self.log)

        return metric_function_for_nodes

    def composer_metric(self, metrics, train_data: Union[InputData,
                                                         MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any]]:
        """ Fit the pipeline (reusing cached operations when available) and evaluate the metrics

        :param metrics: single metric or list of metrics; each one is either a callable
            or an id resolved through MetricsRepository
        :param train_data: data the pipeline is fitted on
        :param test_data: data passed to every metric as ``reference_data``
        :param pipeline: pipeline to assess; it is unfitted again after a successful evaluation
        :return: tuple with one value per metric, or None when validation, fitting
            or metric evaluation raised an exception
        """
        try:
            validate(pipeline)
            pipeline.log = self.log

            if not isinstance(metrics, list):
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(
                    f'Pipeline {pipeline.root_node.descriptive_id} fit started'
                )
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.
                             max_pipeline_fit_time)
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    # a cache failure must not discard an otherwise valid fit
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics += (metric_func(
                    pipeline, reference_data=test_data), )

            self.log.debug(
                f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            # best-effort assessment: a failing pipeline must not stop the
            # composition, so the error is logged and None is returned
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_pipeline(pipeline: Pipeline, data: InputData, time_limit):
        """ Placeholder for pipeline tuning; always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser """
        return self.optimiser.history
Exemple #11
0
def test_multi_chain_caching_with_cache(data_setup):
    """A cache filled by other fitted chains serves the identical parts of a chain that was never fitted."""
    train, _ = data_setup

    # --- scenario 1: one donor chain shares some parts with the main chain
    cache = OperationsCache()
    main_chain = chain_second()
    other_chain = chain_first()

    other_chain.fit(input_data=train)
    cache.save_chain(other_chain)

    root = main_chain.root_node
    first_parent = main_chain.root_node.nodes_from[0]
    nodes_with_non_actual_cache = [
        root,
        first_parent,
        *main_chain.root_node.nodes_from[0].nodes_from,
    ]
    nodes_with_actual_cache = []
    for node in main_chain.nodes:
        if node not in nodes_with_non_actual_cache:
            nodes_with_actual_cache.append(node)

    # parts identical to other_chain are found in the cache
    # although main_chain.fit() was never called
    shared_states = [cache.get(node) for node in nodes_with_actual_cache]
    assert all(shared_states)
    # the parts that differ are not found
    differing_states = [
        cache.get(node) for node in nodes_with_non_actual_cache
    ]
    assert not any(differing_states)

    # --- scenario 2: two donor chains, checked with a fresh cache
    cache = OperationsCache()
    main_chain = chain_fourth()
    prev_chain_first = chain_third()
    prev_chain_second = chain_fifth()

    for donor in (prev_chain_first, prev_chain_second):
        donor.fit(input_data=train)
        cache.save_chain(donor)

    nodes_with_non_actual_cache = [
        main_chain.root_node,
        main_chain.root_node.nodes_from[1],
    ]
    nodes_with_actual_cache = list(
        main_chain.root_node.nodes_from[0].nodes_from)

    differing_states = [
        cache.get(node) for node in nodes_with_non_actual_cache
    ]
    assert not any(differing_states)
    shared_states = [cache.get(node) for node in nodes_with_actual_cache]
    assert all(shared_states)