Code example #1
def test_save_load_fitted_atomized_pipeline_correctly():
    pipeline = create_pipeline_with_several_nested_atomized_model()

    train_data, test_data = create_data_for_train()
    pipeline.fit(train_data)

    json_actual = pipeline.save(
        'test_save_load_fitted_atomized_pipeline_correctly')

    json_path_load = create_correct_path(
        'test_save_load_fitted_atomized_pipeline_correctly')

    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)
    json_expected = pipeline_loaded.save(
        'test_save_load_fitted_atomized_pipeline_correctly_loaded')

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json_expected

    before_save_predicted = pipeline.predict(test_data)

    pipeline_loaded.fit(train_data)
    after_save_predicted = pipeline_loaded.predict(test_data)

    bfr_tun_mse = mean_squared_error(y_true=test_data.target,
                                     y_pred=before_save_predicted.predict)
    aft_tun_mse = mean_squared_error(y_true=test_data.target,
                                     y_pred=after_save_predicted.predict)

    assert aft_tun_mse <= bfr_tun_mse
Code example #2
def run_import_export_example(pipeline_path):
    # Prepare data to train the model
    train_data, test_data = get_scoring_data()

    # Get pipeline and fit it
    pipeline = get_three_depth_manual_class_pipeline()
    pipeline.fit_from_scratch(train_data)

    predicted_output = pipeline.predict(test_data)
    prediction_before_export = np.array(predicted_output.predict)
    print(f'Before export {prediction_before_export[:4]}')

    NodesAnalysis(
        pipeline,
        train_data,
        test_data,
        approaches=[NodeDeletionAnalyze,
                    NodeReplaceOperationAnalyze]).analyze()

    # Export it
    pipeline.save(path=pipeline_path)

    # Import pipeline
    json_path_load = create_correct_path(pipeline_path)
    new_pipeline = Pipeline()
    new_pipeline.load(json_path_load)

    predicted_output_after_export = new_pipeline.predict(test_data)
    prediction_after_export = np.array(predicted_output_after_export.predict)

    print(f'After import {prediction_after_export[:4]}')
Code example #3
def test_import_json_to_fitted_pipeline_correctly():
    json_path_load = create_correct_path('test_fitted_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    json_actual = pipeline.save('test_import_json_to_fitted_pipeline_correctly')

    with open(json_path_load, 'r') as json_file:
        json_expected = json.load(json_file)

    assert json_actual == json.dumps(json_expected, indent=4)
Code example #4
def test_import_json_to_pipeline_correctly():
    json_path_load = create_correct_path('test_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    json_actual = pipeline.save('test_import_json_to_pipeline_correctly_1')

    pipeline_expected = create_pipeline()
    json_expected = pipeline_expected.save('test_import_json_to_pipeline_correctly_2')

    assert json.dumps(json_actual) == json.dumps(json_expected)
Code example #5
def test_import_custom_json_object_to_pipeline_and_fit_correctly_no_exception():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/test_custom_json_template.json'
    json_path_load = os.path.join(test_file_path, file)

    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    pipeline.fit(train_data)

    pipeline.save('test_import_custom_json_object_to_pipeline_and_fit_correctly_no_exception')
Code example #6
def test_save_load_atomized_pipeline_correctly():
    pipeline = create_pipeline_with_several_nested_atomized_model()

    json_actual = pipeline.save('test_save_load_atomized_pipeline_correctly')

    json_path_load = create_correct_path(
        'test_save_load_atomized_pipeline_correctly')

    with open(json_path_load, 'r') as json_file:
        json_expected = json.load(json_file)

    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json.dumps(json_expected, indent=4)
Code example #7
def test_fitted_pipeline_cache_correctness_after_export_and_import():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    pipeline = create_classification_pipeline_with_preprocessing()
    pipeline.fit(train_data)
    pipeline.save('test_fitted_pipeline_cache_correctness_after_export_and_import')
    prediction = pipeline.predict(test_data)

    new_pipeline = Pipeline()
    new_pipeline.load(create_correct_path('test_fitted_pipeline_cache_correctness_after_export_and_import'))

    new_prediction = new_pipeline.predict(test_data)

    assert np.array_equal(prediction.predict, new_prediction.predict)
    assert new_pipeline.is_fitted
Code example #8
def run_import_export_example(pipeline_path):
    features_options = {'informative': 1, 'bias': 0.0}
    samples_amount = 100
    features_amount = 2
    x_train, y_train, x_test, y_test = get_regression_dataset(
        features_options, samples_amount, features_amount)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.table)

    # Get pipeline and fit it
    pipeline = get_pipeline()
    pipeline.fit_from_scratch(train_input)

    predicted_output = pipeline.predict(predict_input)
    prediction_before_export = np.array(predicted_output.predict)
    print(f'Before export {prediction_before_export[:4]}')

    # Export it
    pipeline.save(path=pipeline_path)

    # Import pipeline
    json_path_load = create_correct_path(pipeline_path)
    new_pipeline = Pipeline()
    new_pipeline.load(json_path_load)

    predicted_output_after_export = new_pipeline.predict(predict_input)
    prediction_after_export = np.array(predicted_output_after_export.predict)

    print(f'After import {prediction_after_export[:4]}')
Code example #9
    def __init__(self, node: Node = None, operation_id: int = None, nodes_from: list = None, path: str = None):
        # The imports are placed inside the method to avoid circular imports.
        from fedot.core.pipelines.pipeline import Pipeline
        from fedot.core.pipelines.template import PipelineTemplate
        from fedot.core.operations.atomized_model import AtomizedModel

        super().__init__()
        self.atomized_model_json_path = None
        self.next_pipeline_template = None
        self.pipeline_template = None

        if path:
            pipeline = Pipeline()
            pipeline.load(path)
            self.next_pipeline_template = AtomizedModel(pipeline)
            self.pipeline_template = PipelineTemplate(pipeline)

        if node:
            self._operation_to_template(node, operation_id, nodes_from)
Code example #10
def run_oil_forecasting(path_to_file, path_to_file_crm, len_forecast,
                        len_forecast_full, ax, well_id, timeout):
    if timeout is None:
        timeout = 1
    df = pd.read_csv(path_to_file, sep=' *, *')
    df_crm = pd.read_csv(path_to_file_crm, sep=' *, *')

    len_forecast_for_split = len_forecast_full
    dates, target_train, data_fit, data_predict, input_data_fit, input_data_predict, test_data, \
    train_data, time_series = prepare_dataset(df, len_forecast, len_forecast_for_split, well_id)

    dates, target_train_crm, data_fit_crm, data_predict_crm, input_data_fit_crm, input_data_predict_crm, test_data_crm, \
    train_data, time_series = prepare_dataset(df_crm, len_forecast, len_forecast_for_split, well_id)

    task_parameters = TsForecastingParams(forecast_length=len_forecast)

    if not os.path.exists(f'pipeline_{well_id}/pipeline_{well_id}.json'):
        model = Fedot(problem='ts_forecasting',
                      task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light',
                      verbose_level=4)

        # run AutoML model design in the same way
        pipeline = model.fit(features=data_fit, target=target_train)
        pipeline.save(f'pipeline_{well_id}')  # , datetime_in_path=False)
    else:
        pipeline = Pipeline()
        pipeline.load(f'pipeline_{well_id}/pipeline_{well_id}.json')

    if not os.path.exists(
            f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json'):
        model = Fedot(problem='ts_forecasting',
                      task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light',
                      verbose_level=4)

        # run AutoML model design in the same way
        pipeline_crm = model.fit(features=data_fit_crm,
                                 target=target_train_crm)
        pipeline_crm.save(
            f'pipeline_crm_{well_id}')  # , datetime_in_path=False)
    else:
        pipeline_crm = Pipeline()
        pipeline_crm.load(
            f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json')

    sources = dict(
        (f'data_source_ts/{data_part_key}', data_part)
        for (data_part_key, data_part) in input_data_predict.items())
    input_data_predict_mm = MultiModalData(sources)

    sources_crm = dict(
        (f'data_source_ts/{data_part_key}', data_part)
        for (data_part_key, data_part) in input_data_predict_crm.items())
    input_data_predict_mm_crm = MultiModalData(sources_crm)

    forecast = in_sample_ts_forecast(pipeline,
                                     input_data_predict_mm,
                                     horizon=len_forecast_full)
    forecast_crm = in_sample_ts_forecast(pipeline_crm,
                                         input_data_predict_mm_crm,
                                         horizon=len_forecast_full)

    predicted = np.ravel(np.array(forecast))
    predicted_crm = np.ravel(np.array(forecast_crm))
    predicted_only_crm = np.asarray(
        df_crm[f'crm_{well_id}'][-len_forecast_full:])

    test_data = np.ravel(test_data)

    print('CRM')
    predicted_only_crm[np.isnan(predicted_only_crm)] = 0
    mse_before = mean_squared_error(test_data,
                                    predicted_only_crm,
                                    squared=False)
    mae_before = mean_absolute_error(test_data, predicted_only_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('ML')
    mse_before = mean_squared_error(test_data, predicted, squared=False)
    mae_before = mean_absolute_error(test_data, predicted)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('AutoML+CRM')
    mse_before = mean_squared_error(test_data, predicted_crm, squared=False)
    mae_before = mean_absolute_error(test_data, predicted_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    if ax:
        x_for = range(len(train_data), len(time_series))
        ax.plot(x_for,
                time_series[-len_forecast_full:],
                label='Actual time series',
                linewidth=0.5)
        ax.plot(x_for, predicted_crm, label='AutoML+CRM', linewidth=0.5)
        ax.plot(x_for, predicted_only_crm, label='CRM', linewidth=0.5)

        ci_crm = t_conf_interval(np.std(predicted_crm), 0.975,
                                 len(predicted_crm)) * 1.96
        ax.fill_between(x_for, (predicted_crm - ci_crm),
                        (predicted_crm + ci_crm),
                        color='orange',
                        alpha=.5)

        ci_crmonly = t_conf_interval(np.std(predicted_only_crm), 0.975,
                                     len(predicted_only_crm)) * 1.96
        ax.fill_between(x_for, (predicted_only_crm - ci_crmonly),
                        (predicted_only_crm + ci_crmonly),
                        color='green',
                        alpha=.5)

        ax.set(xlabel='Days from 2013.06.01', ylabel='Oil volume, m3')
        if well_id == '5351':
            ax.legend()
        ax.set_title(well_id)
        ax.plot()
Code example #11
class Fedot:
    """
    Main class for FEDOT API

    :param problem: the name of the modelling problem to solve:
        - classification
        - regression
        - ts_forecasting
        - clustering
    :param preset: name of preset for model building (e.g. 'light', 'ultra-light')
    :param timeout: time for model design (in minutes)
    :param composer_params: parameters of pipeline optimisation
        The possible parameters are:
            'max_depth' - max depth of the pipeline
            'max_arity' - max arity of the pipeline nodes
            'pop_size' - population size for composer
            'num_of_generations' - number of generations for composer
            'timeout' - composing time (minutes)
            'available_operations' - list of model names to use
            'with_tuning' - allow hyperparameter tuning for the model
            'cv_folds' - number of folds for cross-validation
            'validation_blocks' - number of validation blocks for time series forecasting
    :param task_params: additional parameters of the task
    :param seed: value for fixed random seed
    :param verbose_level: level of the output detailing
        (-1 - nothing, 0 - errors, 1 - messages,
        2 - warnings and info, 3-4 - basic and detailed debug)
    """
    def __init__(self,
                 problem: str,
                 preset: str = None,
                 timeout: Optional[float] = None,
                 composer_params: dict = None,
                 task_params: TaskParams = None,
                 seed=None,
                 verbose_level: int = 1):
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)

        # metainfo
        self.problem = problem
        self.composer_params = composer_params
        self.task_params = task_params

        # model to use
        self.current_pipeline = None

        # best models for multi-objective case
        self.best_models = None

        # composer history
        self.history = None

        # datasets
        self.train_data = None
        self.test_data = None
        self.prediction = None
        self.prediction_labels = None  # classification-only
        self.target_name = None

        self.log = default_log('FEDOT logger', verbose_level=verbose_level)

        if self.composer_params is None:
            self.composer_params = default_evo_params(self.problem)
        else:
            self.composer_params = {
                **default_evo_params(self.problem),
                **self.composer_params
            }

        self.metric_to_compose = None
        if 'metric' in self.composer_params:
            self.composer_params['composer_metric'] = self.composer_params[
                'metric']
            del self.composer_params['metric']
            self.metric_to_compose = self.composer_params['composer_metric']

        if timeout is not None:
            self.composer_params['timeout'] = timeout
            self.composer_params[
                'num_of_generations'] = 10000  # the number of generations is limited by the timeout now

        if self.problem == 'ts_forecasting' and task_params is None:
            self.task_params = TsForecastingParams(forecast_length=30)

        task_dict = {
            'regression':
            Task(TaskTypesEnum.regression, task_params=self.task_params),
            'classification':
            Task(TaskTypesEnum.classification, task_params=self.task_params),
            'clustering':
            Task(TaskTypesEnum.clustering, task_params=self.task_params),
            'ts_forecasting':
            Task(TaskTypesEnum.ts_forecasting, task_params=self.task_params)
        }

        if self.problem == 'clustering':
            raise ValueError(
                'This type of task is not supported by the API now')

        self.metric_name = default_test_metric_dict[self.problem]
        self.problem = task_dict[self.problem]

        if preset is None and 'preset' in self.composer_params:
            preset = self.composer_params['preset']

        if 'preset' in self.composer_params:
            del self.composer_params['preset']

        if preset is not None:
            available_operations = filter_operations_by_preset(
                self.problem, preset)
            self.composer_params['available_operations'] = available_operations
            self.composer_params[
                'with_tuning'] = '_tun' in preset or preset is None

    def _get_params(self):
        param_dict = {
            'train_data': self.train_data,
            'task': self.problem,
            'logger': self.log
        }
        return {**param_dict, **self.composer_params}

    def _obtain_model(self, is_composing_required: bool = True):
        execution_params = self._get_params()
        if is_composing_required:
            self.current_pipeline, self.best_models, self.history = compose_fedot_model(
                **execution_params)

        if isinstance(self.best_models, tools.ParetoFront):
            self.best_models.__class__ = ParetoFront
            self.best_models.objective_names = self.metric_to_compose

        self.current_pipeline.fit_from_scratch(self.train_data)

        return self.current_pipeline

    def clean(self):
        """
        Cleans fitted model and obtained predictions
        """
        self.prediction = None
        self.prediction_labels = None
        self.current_pipeline = None

    def fit(self,
            features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
            target: Union[str, np.ndarray, pd.Series] = 'target',
            predefined_model: Union[str, Pipeline] = None):
        """
        Fit the graph with a predefined structure or compose and fit the new graph

        :param features: the array with features of train data
        :param target: the array with target values of train data
        :param predefined_model: the name of the atomic model or Pipeline instance
        :return: Pipeline object
        """

        self.target_name = target
        self.train_data = _define_data(ml_task=self.problem,
                                       features=features,
                                       target=target,
                                       is_predict=False)

        is_composing_required = True
        if self.current_pipeline is not None:
            is_composing_required = False

        if predefined_model is not None:
            is_composing_required = False
            if isinstance(predefined_model, Pipeline):
                self.current_pipeline = predefined_model
            elif isinstance(predefined_model, str):
                self.current_pipeline = Pipeline(PrimaryNode(predefined_model))
            else:
                raise ValueError(
                    f'{type(predefined_model)} is not supported as a Fedot model'
                )

        return self._obtain_model(is_composing_required)

    def predict(self,
                features: Union[str, np.ndarray, pd.DataFrame, InputData,
                                dict],
                save_predictions: bool = False):
        """
        Predict new target using already fitted model

        :param features: the array with features of test data
        :param save_predictions: if True, save predictions as a csv file in the working directory.
        :return: the array with prediction values
        """
        if self.current_pipeline is None:
            raise ValueError(NOT_FITTED_ERR_MSG)

        self.test_data = _define_data(ml_task=self.problem,
                                      target=self.target_name,
                                      features=features,
                                      is_predict=True)

        if self.problem.task_type == TaskTypesEnum.classification:
            self.prediction_labels = self.current_pipeline.predict(
                self.test_data, output_mode='labels')
            self.prediction = self.current_pipeline.predict(
                self.test_data, output_mode='probs')
            output_prediction = self.prediction
        elif self.problem.task_type == TaskTypesEnum.ts_forecasting:
            # Convert forecast into one-dimensional array
            self.prediction = self.current_pipeline.predict(self.test_data)
            forecast = np.ravel(np.array(self.prediction.predict))
            self.prediction.predict = forecast
            output_prediction = self.prediction
        else:
            self.prediction = self.current_pipeline.predict(self.test_data)
            output_prediction = self.prediction

        if save_predictions:
            save_predict(self.prediction)
        return output_prediction.predict

    def predict_proba(self,
                      features: Union[str, np.ndarray, pd.DataFrame, InputData,
                                      dict],
                      save_predictions: bool = False,
                      probs_for_all_classes: bool = False):
        """
        Predict the probability of new target using already fitted classification model

        :param features: the array with features of test data
        :param save_predictions: if True, save predictions as a csv file in the working directory.
        :param probs_for_all_classes: return probability for each class even for binary case
        :return: the array with prediction values
        """

        if self.current_pipeline is None:
            raise ValueError(NOT_FITTED_ERR_MSG)

        if self.problem.task_type == TaskTypesEnum.classification:
            self.test_data = _define_data(ml_task=self.problem,
                                          target=self.target_name,
                                          features=features,
                                          is_predict=True)

            mode = 'full_probs' if probs_for_all_classes else 'probs'

            self.prediction = self.current_pipeline.predict(self.test_data,
                                                            output_mode=mode)
            self.prediction_labels = self.current_pipeline.predict(
                self.test_data, output_mode='labels')

            if save_predictions:
                save_predict(self.prediction)
        else:
            raise ValueError(
                'Probabilities of predictions are available only for classification'
            )

        return self.prediction.predict

    def forecast(self,
                 pre_history: Union[str, Tuple[np.ndarray, np.ndarray],
                                    InputData, dict],
                 forecast_length: int = 1,
                 save_predictions: bool = False):
        """
        Forecast the new values of time series

        :param pre_history: the array with features for pre-history of the forecast
        :param forecast_length: num of steps to forecast
        :param save_predictions: if True, save predictions as a csv file in the working directory.
        :return: the array with prediction values
        """

        # TODO use forecast length

        if self.current_pipeline is None:
            raise ValueError(NOT_FITTED_ERR_MSG)

        if self.problem.task_type != TaskTypesEnum.ts_forecasting:
            raise ValueError(
                'Forecasting can be used only for the time series')

        self.problem = self.train_data.task

        self.test_data = _define_data(ml_task=self.problem,
                                      target=self.target_name,
                                      features=pre_history,
                                      is_predict=True)

        self.current_pipeline = Pipeline(self.current_pipeline.root_node)
        # TODO add incremental forecast
        self.prediction = self.current_pipeline.predict(self.test_data)
        if len(self.prediction.predict.shape) > 1:
            self.prediction.predict = np.squeeze(self.prediction.predict)

        if save_predictions:
            save_predict(self.prediction)
        return self.prediction.predict

    def load(self, path):
        """
        Load saved graph from disk

        :param path: path to the json file with the saved model
        """
        self.current_pipeline.load(path)

    def plot_prediction(self):
        """
        Plot the prediction obtained from graph
        """
        if self.prediction is not None:
            if self.problem.task_type == TaskTypesEnum.ts_forecasting:
                plot_forecast(pre_history=self.train_data,
                              forecast=self.prediction)
            else:
                # TODO implement other visualizations
                self.log.error('Not supported yet')

        else:
            self.log.error('No prediction to visualize')

    def get_metrics(self,
                    target: Union[np.ndarray, pd.Series] = None,
                    metric_names: Union[str, List[str]] = None) -> dict:
        """
        Get quality metrics for the fitted graph

        :param target: the array with target values of test data
        :param metric_names: the names of required metrics
        :return: the values of quality metrics
        """
        if metric_names is None:
            metric_names = self.metric_name

        if target is not None:
            if self.test_data is None:
                self.test_data = InputData(
                    idx=range(len(self.prediction.predict)),
                    features=None,
                    target=target[:len(self.prediction.predict)],
                    task=self.train_data.task,
                    data_type=self.train_data.data_type)
            else:
                self.test_data.target = target[:len(self.prediction.predict)]

        real = self.test_data

        # TODO change to sklearn metrics
        if not isinstance(metric_names, List):
            metric_names = [metric_names]

        calculated_metrics = dict()
        for metric_name in metric_names:
            if composer_metrics_mapping[metric_name] is NotImplemented:
                self.log.warn(f'{metric_name} is not available as metric')
            else:
                prediction = self.prediction
                metric_cls = MetricsRepository().metric_class_by_id(
                    composer_metrics_mapping[metric_name])
                if metric_cls.output_mode == 'labels':
                    prediction = self.prediction_labels
                if self.problem.task_type == TaskTypesEnum.ts_forecasting:
                    real.target = real.target[~np.isnan(prediction.predict)]
                    prediction.predict = prediction.predict[
                        ~np.isnan(prediction.predict)]

                metric_value = abs(
                    metric_cls.metric(reference=real, predicted=prediction))
                calculated_metrics[metric_name] = metric_value

        return calculated_metrics
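
Usage sketch (added for illustration, not taken from the snippets above): a minimal, hedged example of the Fedot API documented by the class docstring. The import paths, the csv file names and the 'light' preset are assumptions and may differ between FEDOT versions.

# Minimal usage sketch of the Fedot API shown above.
# Assumptions: the import paths, csv file names and preset name are
# illustrative only; adjust them to your FEDOT version and data.
import numpy as np

from fedot.api.main import Fedot
from fedot.core.data.data import InputData

# hypothetical csv files with a 'target' column
train_data = InputData.from_csv('scoring_train.csv')
test_data = InputData.from_csv('scoring_test.csv')

# compose and fit a classification pipeline within a one-minute budget
model = Fedot(problem='classification',
              timeout=1,
              preset='light',
              verbose_level=1)
model.fit(features=train_data)

# predict on the hold-out data and evaluate the default quality metric
prediction = model.predict(features=test_data)
metrics = model.get_metrics(target=test_data.target)

print(np.asarray(prediction)[:4])
print(metrics)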