Code example #1
    def _operation_to_template(self, node: Node, operation_id: int, nodes_from: list):
        from fedot.core.pipelines.template import PipelineTemplate

        self.operation_id = operation_id
        self.operation_type = node.operation.operation_type
        self.nodes_from = nodes_from
        self.pipeline_template = PipelineTemplate(node.operation.pipeline)
        self.atomized_model_json_path = 'nested_' + str(self.operation_id)
Code example #2
    def load(self, path: str):
        """
        Load the pipeline from the json representation with pickled fitted operations.

        :param path: path to json file with operation
        """
        self.nodes = []
        self.template = PipelineTemplate(self, self.log)
        self.template.import_pipeline(path)
Code example #3
    def save(self, path: str):
        """
        Save the pipeline to the json representation with pickled fitted operations.

        :param path: path to json file with operation
        :return: json containing a composite operation description
        """
        if not self.template:
            self.template = PipelineTemplate(self, self.log)
        json_object = self.template.export_pipeline(path)
        return json_object
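
A minimal round-trip sketch of the two methods above (hedged: `fitted_pipeline`, the target name and the location of the exported json file are illustrative assumptions, not code from the source):

# Hypothetical usage: export a pipeline and load it back into a fresh instance.
# `fitted_pipeline` is assumed to be a Pipeline that has already been fit elsewhere.
saved_json = fitted_pipeline.save('saved_pipeline')    # returns the json description of the pipeline
restored_pipeline = Pipeline()
restored_pipeline.load('saved_pipeline.json')          # illustrative path to the exported json file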
Code example #4
def test_empty_pipeline_to_json_correctly():
    json_path_load = create_correct_path('test_empty_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline_template = PipelineTemplate(pipeline)
    json_actual = pipeline_template.convert_to_dict()

    with open(json_path_load, 'r') as json_file:
        json_expected = json.load(json_file)

    assert json.dumps(json_actual) == json.dumps(json_expected)
Code example #5
def test_pipeline_template_as_nx_graph():
    pipeline = pipeline_first()
    pipeline_template = PipelineTemplate(pipeline)
    graph, node_labels = pipeline_template_as_nx_graph(pipeline=pipeline_template)

    assert len(graph.nodes) == len(pipeline.nodes)  # check node quantity
    assert node_labels[0] == str(pipeline.root_node)  # check root node
Code example #6
    def __init__(self, node: Node = None, operation_id: int = None, nodes_from: list = None, path: str = None):
        # Imports are placed inside the class to avoid circular imports.
        from fedot.core.pipelines.pipeline import Pipeline
        from fedot.core.pipelines.template import PipelineTemplate
        from fedot.core.operations.atomized_model import AtomizedModel

        super().__init__()
        self.atomized_model_json_path = None
        self.next_pipeline_template = None
        self.pipeline_template = None

        if path:
            pipeline = Pipeline()
            pipeline.load(path)
            self.next_pipeline_template = AtomizedModel(pipeline)
            self.pipeline_template = PipelineTemplate(pipeline)

        if node:
            self._operation_to_template(node, operation_id, nodes_from)
Code example #7
def test_data_model_types_forecasting_pipeline_fit():
    train_data, test_data = get_ts_data(forecast_length=10)

    pipeline = get_multiscale_pipeline()
    pipeline.fit(train_data)
    pipeline.save('data_model_forecasting')

    expected_len_nodes = len(pipeline.nodes)
    actual_len_nodes = len(PipelineTemplate(pipeline).operation_templates)

    assert actual_len_nodes == expected_len_nodes
Code example #8
def test_import_json_template_to_pipeline_correctly():
    json_path_load = create_correct_path('test_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline_template = PipelineTemplate(pipeline)
    pipeline_template.import_pipeline(json_path_load)
    json_actual = pipeline_template.convert_to_dict()

    pipeline_expected = create_pipeline()
    pipeline_expected_template = PipelineTemplate(pipeline_expected)
    json_expected = pipeline_expected_template.convert_to_dict()

    assert json.dumps(json_actual) == json.dumps(json_expected)
Code example #9
def test_data_model_type_classification_pipeline_fit():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    pipeline = create_classification_pipeline_with_preprocessing()
    pipeline.fit(train_data)
    pipeline.save('data_model_classification')

    expected_len_nodes = len(pipeline.nodes)
    actual_len_nodes = len(PipelineTemplate(pipeline).operation_templates)

    assert actual_len_nodes == expected_len_nodes
Code example #10
def test_hierarchy_pos():
    pipeline = pipeline_first()
    real_hierarchy_levels_y = {0: ['xgboost'], 1: ['xgboost', 'knn'],
                               2: ['logit', 'lda', 'logit', 'lda']}
    real_hierarchy_levels_x = {0: ['logit'], 1: ['xgboost'], 2: ['lda'],
                               3: ['xgboost'], 4: ['logit'], 5: ['knn'], 6: ['lda']}
    pipeline_template = PipelineTemplate(pipeline)
    graph, node_labels = pipeline_template_as_nx_graph(pipeline=pipeline_template)
    pos = hierarchy_pos(graph.to_undirected(), root=0)
    comparable_lists_y = make_comparable_lists(pos, real_hierarchy_levels_y,
                                               node_labels, 1, reverse=True)
    comparable_lists_x = make_comparable_lists(pos, real_hierarchy_levels_x,
                                               node_labels, 0, reverse=False)
    assert comparable_lists_y[0] == comparable_lists_y[1]  # check nodes hierarchy by y axis
    assert comparable_lists_x[0] == comparable_lists_x[1]  # check nodes hierarchy by x axis
Code example #11
def test_extract_subtree_root():
    pipeline = create_four_depth_pipeline()
    pipeline_template = PipelineTemplate(pipeline)

    expected_types = ['knn', 'logit', 'knn', 'lda', 'xgboost']
    new_root_node_id = 4

    root_node = extract_subtree_root(root_operation_id=new_root_node_id,
                                     pipeline_template=pipeline_template)

    sub_pipeline = Pipeline(root_node)
    actual_types = [node.operation.operation_type for node in sub_pipeline.nodes]

    assertion_list = [expected_types[index] == actual_types[index]
                      for index in range(len(expected_types))]
    assert all(assertion_list)
Code example #12
class Pipeline(Graph):
    """
    Base class used for composite model structure definition

    :param nodes: Node object(s)
    :param log: Log object to record messages
    :param tag: unique part of the repository filename

    .. note::
        fitted_on_data stores characteristics of the data used in the last pipeline fitting (empty if the pipeline
        hasn't been fitted yet)
    """
    def __init__(self,
                 nodes: Optional[Union[Node, List[Node]]] = None,
                 log: Log = None):

        self.computation_time = None
        self.template = None
        self.fitted_on_data = {}

        # Use the provided logger or fall back to the default one
        self.log = log if log else default_log(__name__)
        super().__init__(nodes)

    def fit_from_scratch(self,
                         input_data: Union[InputData, MultiModalData] = None):
        """
        Method used for training the pipeline without using saved information

        :param input_data: data used for operation training
        """
        # Clean all saved states and fit all operations
        self.log.info('Fit pipeline from scratch')
        self.unfit()
        self.fit(input_data, use_fitted=False)

    def update_fitted_on_data(self, data: InputData):
        characteristics = input_data_characteristics(data=data, log=self.log)
        self.fitted_on_data['data_type'] = characteristics[0]
        self.fitted_on_data['features_hash'] = characteristics[1]
        self.fitted_on_data['target_hash'] = characteristics[2]

    def _fitted_status_if_new_data(self, new_input_data: InputData,
                                   fitted_status: bool):
        new_data_params = input_data_characteristics(new_input_data,
                                                     log=self.log)
        if fitted_status and self.fitted_on_data:
            params_names = ('data_type', 'features_hash', 'target_hash')
            are_data_params_different = any([
                new_data_param != self.fitted_on_data[param_name]
                for new_data_param, param_name in zip(new_data_params,
                                                      params_names)
            ])
            if are_data_params_different:
                info = 'Fitted operations are no longer valid because a new dataset is used for training. ' \
                       'Parameter use_fitted is changed to False'
                self.log.info(info)
                fitted_status = False
        return fitted_status

    def _fit_with_time_limit(
        self,
        input_data: Optional[InputData] = None,
        use_fitted_operations=False,
        time: timedelta = timedelta(minutes=3)
    ) -> Manager:
        """
        Run the training process with a time limit in a separate process.

        :param input_data: data used for operation training
        :param use_fitted_operations: flag defining whether to use saved information about previous executions or not,
        default False
        :param time: time constraint for the operation fitting process
        """
        time = int(time.total_seconds())
        manager = Manager()
        process_state_dict = manager.dict()
        fitted_operations = manager.list()
        p = Process(target=self._fit,
                    args=(input_data, use_fitted_operations,
                          process_state_dict, fitted_operations),
                    kwargs={})
        p.start()
        p.join(time)
        if p.is_alive():
            p.terminate()
            raise TimeoutError(
                'Pipeline fitness evaluation time limit has expired')

        self.fitted_on_data = process_state_dict['fitted_on_data']
        self.computation_time = process_state_dict['computation_time']
        for node_num, node in enumerate(self.nodes):
            self.nodes[node_num].fitted_operation = fitted_operations[node_num]
        return process_state_dict['train_predicted']

    def _fit(self,
             input_data: InputData,
             use_fitted_operations=False,
             process_state_dict: Manager = None,
             fitted_operations: Manager = None):
        """
        Run training process in all nodes in pipeline starting with root.

        :param input_data: data used for operation training
        :param use_fitted_operations: flag defining whether to use saved information about previous executions or not,
        default False
        :param process_state_dict: this dictionary is used for saving required pipeline parameters (which were changed
        inside the process) in the case of fit time control (when a separate process is created)
        :param fitted_operations: this list is used for saving fitted operations of pipeline nodes
        """

        # InputData was set directly to the primary nodes
        if input_data is None:
            use_fitted_operations = False
        else:
            use_fitted_operations = self._fitted_status_if_new_data(
                new_input_data=input_data, fitted_status=use_fitted_operations)

            if not use_fitted_operations or not self.fitted_on_data:
                # Don't use previous information
                self.unfit()
                self.update_fitted_on_data(input_data)

        with Timer(log=self.log) as t:
            computation_time_update = not use_fitted_operations or not self.root_node.fitted_operation or \
                                      self.computation_time is None

            train_predicted = self.root_node.fit(input_data=input_data)
            if computation_time_update:
                self.computation_time = round(t.minutes_from_start, 3)

        if process_state_dict is None:
            return train_predicted
        else:
            process_state_dict['train_predicted'] = train_predicted
            process_state_dict['computation_time'] = self.computation_time
            process_state_dict['fitted_on_data'] = self.fitted_on_data
            for node in self.nodes:
                fitted_operations.append(node.fitted_operation)

    def fit(self,
            input_data: Union[InputData, MultiModalData],
            use_fitted=True,
            time_constraint: Optional[timedelta] = None):
        """
        Run training process in all nodes in pipeline starting with root.

        :param input_data: data used for operation training
        :param use_fitted: flag defining whether to use saved information about previous executions or not,
            default True
        :param time_constraint: time constraint for the operation fitting process
        """
        if not use_fitted:
            self.unfit()

        # Make copy of the input data to avoid performing inplace operations
        copied_input_data = copy(input_data)
        copied_input_data = self._assign_data_to_nodes(copied_input_data)

        if time_constraint is None:
            train_predicted = self._fit(input_data=copied_input_data,
                                        use_fitted_operations=use_fitted)
        else:
            train_predicted = self._fit_with_time_limit(
                input_data=copied_input_data,
                use_fitted_operations=use_fitted,
                time=time_constraint)
        return train_predicted

    @property
    def is_fitted(self):
        return all([(node.fitted_operation is not None)
                    for node in self.nodes])

    def unfit(self):
        """
        Remove fitted operations for all nodes.
        """
        for node in self.nodes:
            node.unfit()

    def fit_from_cache(self, cache: OperationsCache):
        for node in self.nodes:
            cached_state = cache.get(node)
            if cached_state:
                node.fitted_operation = cached_state.operation
            else:
                node.fitted_operation = None

    def predict(self,
                input_data: Union[InputData, MultiModalData],
                output_mode: str = 'default'):
        """
        Run the predict process in all nodes in pipeline starting with root.

        :param input_data: data for prediction
        :param output_mode: desired form of output for operations. Available options are:
                'default' (as is),
                'labels' (class labels as numbers - for classification),
                'probs' (class probabilities - for classification, same as 'default'),
                'full_probs' (all class probabilities - for binary classification).
        :return: OutputData with prediction
        """

        if not self.is_fitted:
            ex = 'Pipeline is not fitted yet'
            self.log.error(ex)
            raise ValueError(ex)

        # Make copy of the input data to avoid performing inplace operations
        copied_input_data = copy(input_data)
        copied_input_data = self._assign_data_to_nodes(copied_input_data)

        result = self.root_node.predict(input_data=copied_input_data,
                                        output_mode=output_mode)
        return result

    def fine_tune_all_nodes(self,
                            loss_function: Callable,
                            loss_params: Callable = None,
                            input_data: Union[InputData,
                                              MultiModalData] = None,
                            iterations=50,
                            timeout: int = 5,
                            cv_folds: int = None,
                            validation_blocks: int = 3) -> 'Pipeline':
        """ Tune all hyperparameters of nodes simultaneously via black-box
            optimization using PipelineTuner. For details, see
        :meth:`~fedot.core.pipelines.tuning.unified.PipelineTuner.tune_pipeline`
        """
        # Make copy of the input data to avoid performing inplace operations
        copied_input_data = copy(input_data)

        timeout = timedelta(minutes=timeout)
        pipeline_tuner = PipelineTuner(pipeline=self,
                                       task=copied_input_data.task,
                                       iterations=iterations,
                                       timeout=timeout)
        self.log.info('Start tuning of all nodes')

        tuned_pipeline = pipeline_tuner.tune_pipeline(
            input_data=copied_input_data,
            loss_function=loss_function,
            loss_params=loss_params,
            cv_folds=cv_folds,
            validation_blocks=validation_blocks)
        self.log.info('Tuning was finished')

        return tuned_pipeline

    def save(self, path: str):
        """
        Save the pipeline to the json representation with pickled fitted operations.

        :param path: path to json file with operation
        :return: json containing a composite operation description
        """
        if not self.template:
            self.template = PipelineTemplate(self, self.log)
        json_object = self.template.export_pipeline(path)
        return json_object

    def load(self, path: str):
        """
        Load the pipeline from the json representation with pickled fitted operations.

        :param path: path to json file with operation
        """
        self.nodes = []
        self.template = PipelineTemplate(self, self.log)
        self.template.import_pipeline(path)

    def __eq__(self, other) -> bool:
        return self.root_node.descriptive_id == other.root_node.descriptive_id

    def __str__(self):
        description = {
            'depth': self.depth,
            'length': self.length,
            'nodes': self.nodes,
        }
        return f'{description}'

    @property
    def root_node(self) -> Optional[Node]:
        if len(self.nodes) == 0:
            return None
        root = [
            node for node in self.nodes
            if not any(self.operator.node_children(node))
        ]
        if len(root) > 1:
            raise ValueError(
                f'{ERROR_PREFIX} More than 1 root_nodes in pipeline')
        return root[0]

    def _assign_data_to_nodes(self, input_data) -> Optional[InputData]:
        if isinstance(input_data, MultiModalData):
            for node in [n for n in self.nodes if isinstance(n, PrimaryNode)]:
                if node.operation.operation_type in input_data.keys():
                    node.node_data = input_data[node.operation.operation_type]
                    node.direct_set = True
                else:
                    raise ValueError(f'No data for primary node {node}')
            return None
        return input_data

    def print_structure(self):
        """ Method print information about pipeline """
        print('Pipeline structure:')
        print(self.__str__())
        for node in self.nodes:
            print(f'{node.operation.operation_type} - {node.custom_params}')
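
A short usage sketch for the Pipeline class above (hedged: the operation name, the prepared `train_data`/`test_data` and the import paths follow the layout seen in the other snippets and are assumptions, not code from this section):

from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

# Hypothetical end-to-end usage: build a single-node pipeline, fit it, predict and persist it.
# `train_data` and `test_data` are assumed to be InputData instances prepared elsewhere.
pipeline = Pipeline(PrimaryNode('logit'))        # operation name is illustrative
pipeline.fit(train_data, use_fitted=False)
prediction = pipeline.predict(test_data, output_mode='labels')
pipeline.save('logit_pipeline')                  # serializes structure and fitted operations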
Code example #13
    def _convert_pipeline_to_template(self, pipeline):
        pipeline_template = PipelineTemplate(pipeline)
        return pipeline_template
Code example #14
class AtomizedModelTemplate(OperationTemplateAbstract):
    def __init__(self, node: Node = None, operation_id: int = None, nodes_from: list = None, path: str = None):
        # Imports are placed inside the class to avoid circular imports.
        from fedot.core.pipelines.pipeline import Pipeline
        from fedot.core.pipelines.template import PipelineTemplate
        from fedot.core.operations.atomized_model import AtomizedModel

        super().__init__()
        self.atomized_model_json_path = None
        self.next_pipeline_template = None
        self.pipeline_template = None

        if path:
            pipeline = Pipeline()
            pipeline.load(path)
            self.next_pipeline_template = AtomizedModel(pipeline)
            self.pipeline_template = PipelineTemplate(pipeline)

        if node:
            self._operation_to_template(node, operation_id, nodes_from)

    def _operation_to_template(self, node: Node, operation_id: int, nodes_from: list):
        from fedot.core.pipelines.template import PipelineTemplate

        self.operation_id = operation_id
        self.operation_type = node.operation.operation_type
        self.nodes_from = nodes_from
        self.pipeline_template = PipelineTemplate(node.operation.pipeline)
        self.atomized_model_json_path = 'nested_' + str(self.operation_id)

    def convert_to_dict(self) -> dict:

        operation_object = {
            'operation_id': self.operation_id,
            'operation_type': self.operation_type,
            'nodes_from': self.nodes_from,
            'atomized_model_json_path': self.atomized_model_json_path
        }

        return operation_object

    def _create_nested_path(self, path: str) -> Tuple[str, str]:
        """
        Create a folder for the nested JSON operation and prepare paths for saving the JSONs.
        :param path: path where the parent JSON operation is saved
        :return: absolute and relative paths to save nested JSON operation
        """

        relative_path = os.path.join('fitted_operations', 'nested_' + str(self.operation_id))
        absolute_path = os.path.join(path, relative_path)

        if not os.path.exists(absolute_path):
            os.makedirs(absolute_path)

        return absolute_path, relative_path

    def export_operation(self, path: str):
        absolute_path = os.path.join(path, self.atomized_model_json_path)
        _check_existing_path(absolute_path)
        self.pipeline_template.export_pipeline(absolute_path)

    def import_json(self, operation_object: dict):
        required_fields = ['operation_id', 'operation_type', 'nodes_from', 'atomized_model_json_path']
        self._validate_json_operation_template(operation_object, required_fields)

        self.operation_id = operation_object['operation_id']
        self.operation_type = operation_object['operation_type']
        self.nodes_from = operation_object['nodes_from']
        self.atomized_model_json_path = operation_object['atomized_model_json_path']
Code example #15
    def restore_as_template(self, opt_graph: OptGraph):
        pipeline = self.restore(opt_graph)
        return PipelineTemplate(pipeline)