Example 1: _generate_splits with per-fold caching of the static-step outputs
    def _generate_splits(self, X, y, readings):
        if self._preprocessing:
            pipeline = MLPipeline(self.template)
            LOGGER.debug('Running %s preprocessing steps', self._preprocessing)
            context = pipeline.fit(X=X,
                                   y=y,
                                   readings=readings,
                                   output_=self._preprocessing - 1)
            del context['X']
            del context['y']
        else:
            context = {'readings': readings}

        splits = list()
        for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
            LOGGER.debug('Running static steps for fold %s', fold)
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = MLPipeline(self.template)
            fit = pipeline.fit(X_train,
                               y_train,
                               output_=self._static - 1,
                               start_=self._preprocessing,
                               **context)
            predict = pipeline.predict(X_test,
                                       output_=self._static - 1,
                                       start_=self._preprocessing,
                                       **context)

            splits.append((fold, pipeline, fit, predict, y_test))

        return splits
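
Each tuple returned above caches the ``fit`` and ``predict`` context produced by the static steps, so later scoring only needs to re-run the tunable tail of the pipeline. A minimal consumption sketch, modeled on the ``cross_validate`` methods shown in the later examples; ``static`` (the index of the first tunable block) and ``metric`` (any scoring callable) are assumptions here, not part of this snippet:

import numpy as np

scores = []
for fold, pipeline, fit, predict, y_test in splits:
    # Resume execution at the first tunable step, reusing the cached context.
    pipeline.fit(start_=static, **fit)
    predictions = pipeline.predict(start_=static, **predict)
    scores.append(metric(y_test, predictions))

cv_score = np.mean(scores)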
Example 2: validating the MLBlocks primitive JSON annotations
def test_jsons():
    """Validate MLBlocks primitive jsons"""

    primitives = (f for f in os.listdir(PRIMITIVES_PATH)
                  if f.endswith('.json'))
    for primitive_filename in primitives:
        try:
            primitive_path = os.path.join(PRIMITIVES_PATH, primitive_filename)
            with open(primitive_path, 'r') as f:
                primitive = json.load(f)

            primitive_name = primitive['name']
            fixed_hyperparameters = primitive.get('hyperparameters',
                                                  dict()).get('fixed', dict())

            init_hyperparameters = dict()
            for name, hyperparameter in fixed_hyperparameters.items():
                if 'default' not in hyperparameter:
                    type_ = hyperparameter.get('type')
                    init_hyperparameters[name] = HYPERPARAMETER_DEFAULTS.get(
                        type_)

            block_name = primitive_name + '#1'
            mlpipeline = MLPipeline([primitive_name],
                                    {block_name: init_hyperparameters})

            # Validate methods
            mlblock = mlpipeline.blocks[block_name]
            if mlblock._class:
                fit = primitive.get('fit')
                if fit:
                    assert hasattr(mlblock.instance, fit['method'])

                produce = primitive['produce']
                assert hasattr(mlblock.instance, produce['method'])

            # Run pipeline, when possible
            validation_dataset = primitive.get('validation_dataset')
            if validation_dataset:
                X, y = DATASETS[validation_dataset]
                mlpipeline.fit(X, y)
                mlpipeline.predict(X)

        except Exception:
            raise ValueError(
                "Invalid JSON primitive: {}".format(primitive_filename))
Example 3: _generate_splits with optional on-disk caching of the splits
    def _generate_splits(self, template_name, target_times, readings, turbines=None):
        template = self._template_dicts.get(template_name)
        pipeline = MLPipeline(template)
        preprocessing = self._preprocessing.get(template_name)
        static = self._count_static_steps(pipeline)
        X = target_times[['turbine_id', 'cutoff_time']]
        y = target_times['target']

        if preprocessing:
            if preprocessing > static:
                raise ValueError('Preprocessing cannot be bigger than static')

            LOGGER.debug('Running %s preprocessing steps', preprocessing)
            context = pipeline.fit(X=X, y=y, readings=readings,
                                   turbines=turbines, output_=preprocessing - 1)
            del context['X']
            del context['y']
            gc.collect()

        else:
            context = {
                'readings': readings,
                'turbines': turbines,
            }

        splits = list()
        for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
            LOGGER.debug('Running static steps for fold %s', fold)
            gc.collect()
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = MLPipeline(template)
            fit = pipeline.fit(X_train, y_train, output_=static - 1,
                               start_=preprocessing, **context)
            predict = pipeline.predict(X_test, output_=static - 1,
                                       start_=preprocessing, **context)

            split = (fold, pipeline, fit, predict, y_test, static)

            if self._cache_path:
                split_name = '{}_{}.pkl'.format(template_name, fold)
                split_path = os.path.join(self._cache_path, split_name)
                os.makedirs(os.path.dirname(split_path), exist_ok=True)

                with open(split_path, 'wb') as split_file:
                    pickle.dump(split, split_file)

                split = split_path

            splits.append(split)

        gc.collect()
        return splits
Example 4: k_fold_validation scoring helper of the Modeler class
    def k_fold_validation(self, hyperparameters, X, y, scoring=None):
        """Score the pipeline through k-fold validation with the given scoring function.

        Args:
            hyperparameters (dict or None):
                A dictionary of hyper-parameters for each primitive in the target pipeline.
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            scoring (str):
                The name of the scoring function.

        Returns:
            np.float64:
                The average score in the k-fold validation.
        """
        model_instance = MLPipeline(self._pipeline)
        X = pd.DataFrame(X)
        y = pd.Series(y)

        if hyperparameters:
            model_instance.set_hyperparameters(hyperparameters)

        if self._problem_type == 'regression':
            scorer = self.regression_metrics[scoring or 'R2 Score']
        else:
            scorer = self.classification_metrics[scoring or 'F1 Macro']

        scores = []
        kf = KFold(n_splits=10, random_state=None, shuffle=True)
        for train_index, test_index in kf.split(X):
            model_instance.fit(X.iloc[train_index], y.iloc[train_index])
            y_pred = model_instance.predict(X.iloc[test_index])
            scores.append(scorer(y.iloc[test_index], y_pred))

        return np.mean(scores)
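
A hedged usage sketch for this method, assuming the full ``Modeler`` class shown in a later example and that the MLPrimitives annotation for the chosen primitive is installed (the primitive name and metric are illustrative):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

# Build a single-primitive MLBlocks pipeline wrapped in a Modeler.
modeler = Modeler(['sklearn.ensemble.RandomForestClassifier'], 'classification')

# Score the default hyperparameters with 10-fold cross validation.
score = modeler.k_fold_validation(None, X, y, scoring='Accuracy')
print(score)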
Example 5: GreenGuardPipeline with BTBSession-based template selection and tuning
class GreenGuardPipeline(object):
    """Main Machine Learning component in the GreenGuard project.

    The ``GreenGuardPipeline`` represents the abstraction of a Machine
    Learning pipeline architecture specialized on the GreenGuard data
    format.

    In order to use it, an MLBlocks pipeline template needs to be given,
    alongside information about how to evaluate its performance using
    cross validation.

    Attributes:
        template (MLPipeline):
            MLPipeline instance used as the template for tuning.
        template_name:
            Name of the template being used.
        fitted (bool):
            Whether this GreenGuardPipeline has already been fitted or not.
        steps (list):
            List of primitives that compose this template.
        preprocessing (list):
            List of preprocessing steps. These steps have no learning stage
            and are executed only once on the complete training dataset, before
            partitioning it for cross validation.
        static (list):
            List of static steps. These are all the steps in the pipeline that
            come after the preprocessing ones but have no hyperparameters.
            These are executed on each cross validation split only once, when
            the data is partitioned, and their output is cached to be reused
            later on at every tuning iteration.
        tunable (list):
            List of steps that have hyperparameters and will be tuned during
            the tuning loop.

    Args:
        templates (str, MLPipeline or list):
            Template to use. If a ``str`` is given, load the corresponding
            ``MLPipeline``. Also can be a list combining both.
        metric (str or function):
            Metric to use. If a ``str`` is given, it must be one of the metrics
            defined in the ``greenguard.metrics.METRICS`` dictionary.
        cost (bool):
            Whether the metric is a cost function (the lower the better) or not.
            Defaults to ``False``.
        init_params (dict or list):
            There are three possible values for init_params:

                * ``dict`` of init params: it will be used for all templates.
                * ``dict`` with the name of each template as a key and a dictionary of
                  its init params as the value.
                * ``list``: each value will be assigned to the corresponding position of
                  self.templates.

            Defaults to ``None``.
        stratify (bool):
            Whether to stratify the data when partitioning for cross validation.
            Defaults to ``True``.
        cv_splits (int):
            Number of cross validation folds to use. Defaults to ``5``.
        shuffle (bool):
            Whether to shuffle the data when partitioning for cross validation.
            Defaults to ``True``.
        random_state (int or RandomState):
            random state to use for the cross validation partitioning.
            Defaults to ``0``.
        preprocessing (int, dict or list):
            There are three possible values for preprocessing:

                * ``int``: the value will be used for all templates.
                * ``dict`` with the template name as a key and a number as a value, will
                  be used for that template.
                * ``list``: each value will be assigned to the corresponding position of
                  self.templates.

            Defaults to ``0``.
        cache_path (str):
            If given, cache the generated cross validation splits in this folder.
            Defaults to ``None``.
    """

    template = None
    template_name = None
    fitted = False
    cv_score = None

    _cv_class = None
    _metric = None
    _cost = False
    _tuner = None
    _pipeline = None
    _static = None
    _init_params = None
    _preprocessing = None

    def _get_cv(self, stratify, cv_splits, shuffle, random_state):
        if stratify:
            cv_class = StratifiedKFold
        else:
            cv_class = KFold

        return cv_class(n_splits=cv_splits, shuffle=shuffle, random_state=random_state)

    def _set_hyperparameters(self, new_hyperparameters):
        self._hyperparameters = deepcopy(new_hyperparameters)

    def _set_template(self, template_name):
        self.template_name = template_name
        self.template = self._template_dicts[self.template_name]

    @staticmethod
    def _update_params(old, new):
        for name, params in new.items():
            if '#' not in name:
                name = name + '#1'

            block_params = old.setdefault(name, dict())
            for param, value in params.items():
                block_params[param] = value

    def _count_static_steps(self, pipeline):
        tunable_hyperparams = pipeline.get_tunable_hyperparameters()
        for index, block_name in enumerate(pipeline.blocks.keys()):
            if tunable_hyperparams[block_name]:
                return index

        return 0

    def _get_templates(self, templates):
        template_dicts = dict()
        template_names = list()
        for template in templates:
            if isinstance(template, str):
                template_name = template
                template = load_pipeline(template_name)
            else:
                template_name = md5(json.dumps(template).encode()).hexdigest()
            template_dicts[template_name] = template
            template_names.append(template_name)

        return template_names, template_dicts

    def _generate_init_params(self, init_params):
        if not init_params:
            self._init_params = {}
        elif isinstance(init_params, list):
            self._init_params = dict(zip(self._template_names, init_params))
        elif any(name in init_params for name in self._template_names):
            self._init_params = init_params
        else:
            # Plain ``init_params`` dict: use it as the default for every template.
            self._init_params = {}
            self._default_init_params = init_params

    def _generate_preprocessing(self, preprocessing):
        if isinstance(preprocessing, int):
            self._preprocessing = {name: preprocessing for name in self._template_names}
        else:
            if isinstance(preprocessing, list):
                preprocessing = dict(zip(self._template_names, preprocessing))

            self._preprocessing = {
                name: preprocessing.get(name, 0)
                for name in self._template_names
            }

    def _build_pipeline(self):
        self._pipeline = MLPipeline(self.template)

        if self._hyperparameters:
            self._pipeline.set_hyperparameters(self._hyperparameters)

        self.fitted = False

    def __init__(self, templates, metric='accuracy', cost=False, init_params=None, stratify=True,
                 cv_splits=5, shuffle=True, random_state=0, preprocessing=0, cache_path=None):

        if isinstance(metric, str):
            metric, cost = METRICS[metric]

        self._metric = metric
        self._cost = cost
        self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state)
        self.cv_score = np.inf if cost else -np.inf

        if not isinstance(templates, list):
            templates = [templates]

        self.templates = templates
        self._template_names, self._template_dicts = self._get_templates(templates)
        self._default_init_params = {}
        self._generate_init_params(init_params)

        for name, template in self._template_dicts.items():
            init_params = self._init_params.get(name, self._default_init_params)
            template_params = template.setdefault('init_params', {})
            self._update_params(template_params, init_params)

        self._generate_preprocessing(preprocessing)
        self._set_template(self._template_names[0])
        self._hyperparameters = dict()
        self._build_pipeline()
        self._cache_path = cache_path
        if cache_path:
            os.makedirs(cache_path, exist_ok=True)

    def get_hyperparameters(self):
        """Get the current hyperparameters.

        Returns:
            dict:
                Current hyperparameters.
        """
        return deepcopy(self._hyperparameters)

    def _is_better(self, score):
        if self._cost:
            return score < self.cv_score

        return score > self.cv_score

    def _generate_splits(self, template_name, target_times, readings, turbines=None):
        template = self._template_dicts.get(template_name)
        pipeline = MLPipeline(template)
        preprocessing = self._preprocessing.get(template_name)
        static = self._count_static_steps(pipeline)
        X = target_times[['turbine_id', 'cutoff_time']]
        y = target_times['target']

        if preprocessing:
            if preprocessing > static:
                raise ValueError('Preprocessing cannot be bigger than static')

            LOGGER.debug('Running %s preprocessing steps', preprocessing)
            context = pipeline.fit(X=X, y=y, readings=readings,
                                   turbines=turbines, output_=preprocessing - 1)
            del context['X']
            del context['y']
            gc.collect()

        else:
            context = {
                'readings': readings,
                'turbines': turbines,
            }

        splits = list()
        for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
            LOGGER.debug('Running static steps for fold %s', fold)
            gc.collect()
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = MLPipeline(template)
            fit = pipeline.fit(X_train, y_train, output_=static - 1,
                               start_=preprocessing, **context)
            predict = pipeline.predict(X_test, output_=static - 1,
                                       start_=preprocessing, **context)

            split = (fold, pipeline, fit, predict, y_test, static)

            if self._cache_path:
                split_name = '{}_{}.pkl'.format(template_name, fold)
                split_path = os.path.join(self._cache_path, split_name)

                with open(split_path, 'wb') as split_file:
                    pickle.dump(split, split_file)

                split = split_path

            splits.append(split)

        gc.collect()
        return splits

    def _cross_validate(self, template_splits, hyperparams):
        scores = []
        for split in template_splits:
            gc.collect()
            if self._cache_path:
                with open(split, 'rb') as split_file:
                    split = pickle.load(split_file)

            fold, pipeline, fit, predict, y_test, static = split

            LOGGER.debug('Scoring fold %s', fold)
            pipeline.set_hyperparameters(hyperparams)
            pipeline.fit(start_=static, **fit)
            predictions = pipeline.predict(start_=static, **predict)

            score = self._metric(y_test, predictions)
            LOGGER.debug('Fold %s score: %s', fold, score)
            scores.append(score)

        return np.mean(scores)

    def _make_btb_scorer(self, target_times, readings, turbines):
        splits = {}

        def scorer(template_name, config):
            template_splits = splits.get(template_name)
            if template_splits is None:
                template_splits = self._generate_splits(
                    template_name, target_times, readings, turbines)

                splits[template_name] = template_splits

            cv_score = self._cross_validate(template_splits, config)
            if self._is_better(cv_score):
                _config = '\n'.join('      {}: {}'.format(n, v) for n, v in config.items())
                LOGGER.info(('New configuration found:\n'
                             '  Template: %s \n'
                             '    Hyperparameters: \n'
                             '%s'), template_name, _config)

                self.cv_score = cv_score
                self._set_template(template_name)
                self._set_hyperparameters(config)
                self._build_pipeline()

            return cv_score

        return scorer

    def cross_validate(self, target_times, readings, turbines,
                       template_name=None, hyperparams=None):
        """Compute cross validation score using the given data.

        The data is partitioned using the indicated cross validation parameters
        and processed once with the pipeline static steps. The outputs of those
        static steps are then reused to fit and score the tunable part of the
        pipeline with the given hyperparameters.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.
            template_name (str):
                Name of the template to validate. If not given, the current
                template is used.
            hyperparams (dict):
                Hyperparameter values to use. If not given, the current
                hyperparameters are used when scoring the current template,
                or an empty dict otherwise.

        Returns:
            float:
                Computed cross validation score. This score is the average
                of the scores obtained across all the cross validation folds.
        """
        if not template_name:
            template_name = self.template_name
            if hyperparams is None:
                hyperparams = self.get_hyperparameters()

        elif hyperparams is None:
            hyperparams = {}

        template_splits = self._generate_splits(template_name, target_times, readings, turbines)
        return self._cross_validate(template_splits, hyperparams)

    @classmethod
    def _get_tunables(cls, template_dicts):
        tunables = {}
        for name, template in template_dicts.items():
            pipeline = MLPipeline(template)
            pipeline_tunables = pipeline.get_tunable_hyperparameters(flat=True)
            tunables[name] = Tunable.from_dict(pipeline_tunables)

        return tunables

    def tune(self, target_times, readings, turbines=None):
        """Create a tuning session object that tunes and selects the templates.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
                Only needed if the splits have not been previously computed.
            readings (pandas.DataFrame):
                ``readings`` table. Only needed if the splits have not been
                previously computed.
            turbines (pandas.DataFrame):
                ``turbines`` table. Only needed if the splits have not been
                previously computed.
        """
        scoring_function = self._make_btb_scorer(target_times, readings, turbines)
        tunables = self._get_tunables(self._template_dicts)
        return BTBSession(tunables, scoring_function, maximize=not self._cost)

    def fit(self, target_times, readings, turbines=None):
        """Fit this pipeline to the given data.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.
        """
        X = target_times[['turbine_id', 'cutoff_time']]
        y = target_times['target']
        self._pipeline.fit(X, y, readings=readings, turbines=turbines)
        self.fitted = True

    def predict(self, target_times, readings, turbines=None):
        """Make predictions using this pipeline.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.

        Returns:
            numpy.ndarray:
                Vector of predictions.
        """
        if not self.fitted:
            raise NotFittedError()

        X = target_times[['turbine_id', 'cutoff_time']]
        return self._pipeline.predict(X, readings=readings, turbines=turbines)

    def save(self, path):
        """Serialize and save this pipeline using cloudpickle.

        Args:
            path (str):
                Path to the file where the pipeline will be saved.
        """
        with open(path, 'wb') as pickle_file:
            cloudpickle.dump(self, pickle_file)

    @classmethod
    def load(cls, path):
        """Load a previously saved pipeline from a file.

        Args:
            path (str):
                Path to the file where the pipeline is saved.

        Returns:
            GreenGuardPipeline:
                Loaded GreenGuardPipeline instance.
        """
        with open(path, 'rb') as pickle_file:
            return cloudpickle.load(pickle_file)
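
A hedged end-to-end usage sketch for this class. The demo data loader and the template name follow the GreenGuard documentation but are assumptions here and may differ between versions:

from greenguard.demo import load_demo

target_times, readings = load_demo()

templates = ['resample_600s_normalize_dfs_1d_xgb_classifier']
pipeline = GreenGuardPipeline(templates, metric='accuracy', cv_splits=3)

# Create a BTBSession that tunes hyperparameters and selects among templates.
session = pipeline.tune(target_times, readings)
session.run(10)

pipeline.fit(target_times, readings)
predictions = pipeline.predict(target_times, readings)
pipeline.save('pipeline.pkl')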
Example 6: nested sigpro.SigPro primitives
def test_SigPro_nested_pipeline():
    """Test nested sigpro primitive."""
    # setup
    aggregations = [
        {
            'primitive': 'sigpro.SigPro',
            'init_params': {
                'keep_columns': True,
                'input_is_dataframe': False,
                'values_column_name': 'amplitude_values',
                'transformations': [{
                    'primitive': 'sigpro.transformations.frequency.band.frequency_band',
                    'init_params': {'low': 100, 'high': 200},
                }],
                'aggregations': [{
                    'primitive': 'sigpro.aggregations.amplitude.statistical.mean',
                }],
            },
        },
        {
            'primitive': 'sigpro.SigPro',
            'init_params': {
                'input_is_dataframe': False,
                'values_column_name': 'amplitude_values',
                'transformations': [{
                    'primitive': 'sigpro.transformations.frequency.band.frequency_band',
                    'init_params': {'low': 3000, 'high': 4000},
                }],
                'aggregations': [{
                    'name': 'band_3k_4k_mean',
                    'primitive': 'sigpro.aggregations.amplitude.statistical.mean',
                }],
            },
        },
        {
            'primitive': 'sigpro.aggregations.amplitude.statistical.mean',
        },
    ]

    pipeline = MLPipeline({
        'primitives': ['sigpro.SigPro'],
        'init_params': {
            'sigpro.SigPro#1': {
                'transformations': [{
                    'primitive': 'sigpro.transformations.frequency.fft.fft_real',
                }],
                'aggregations': aggregations,
            },
        },
    })

    data = pd.DataFrame({
        'timestamp': pd.to_datetime(['2020-01-01 00:00:00']),
        'values': [[1, 2, 3, 4, 5, 6]],
        'sampling_frequency': [10000],
        'dummy': [1],
    })

    # run
    output = pipeline.predict(readings=data)
    outputs = dict(zip(pipeline.get_output_names(), output))

    # assert
    expected_features = [
        'fft_real.SigPro.frequency_band.mean.mean_value',
        'fft_real.SigPro.frequency_band.band_3k_4k_mean.mean_value',
        'fft_real.mean.mean_value'
    ]

    assert outputs['feature_columns'] == expected_features
    expected_readings = pd.DataFrame({
        'fft_real.SigPro.frequency_band.mean.mean_value': [float('nan')],
        'fft_real.SigPro.frequency_band.band_3k_4k_mean.mean_value': [-3.0],
        'fft_real.mean.mean_value': [1.0],
    })

    pd.testing.assert_frame_equal(expected_readings, outputs['readings'])
Example 7: chained sigpro.SigPro primitives
def test_SigPro():
    # setup
    pipeline = MLPipeline({
        'primitives': [
            'sigpro.SigPro',
            'sigpro.SigPro',
        ],
        'init_params': {
            'sigpro.SigPro#1': {
                'values_column_name': 'signal_values',
                'keep_columns': True,
                'transformations': [
                    {
                        'name': 'identity',
                        'primitive': 'sigpro.transformations.amplitude.identity.identity',
                    },
                    {
                        'name': 'fft',
                        'primitive': 'sigpro.transformations.frequency.fft.fft_real',
                    },
                ],
                'aggregations': [
                    {
                        'name': 'mean',
                        'primitive': 'sigpro.aggregations.amplitude.statistical.mean',
                    },
                    {
                        'name': 'rms',
                        'primitive': 'sigpro.aggregations.amplitude.statistical.rms',
                    },
                ],
            },
            'sigpro.SigPro#2': {
                'values_column_name': 'signal_values',
                'keep_columns': ['dummy'],
                'transformations': [
                    {
                        'name': 'identity',
                        'primitive': 'sigpro.transformations.amplitude.identity.identity',
                    },
                ],
                'aggregations': [
                    {
                        'name': 'std',
                        'primitive': 'sigpro.aggregations.amplitude.statistical.std',
                    },
                ],
            },
        },
    })

    data = pd.DataFrame({
        'timestamp': pd.to_datetime(['2020-01-01 00:00:00']),
        'signal_values': [[1, 2, 3, 4]],
        'sampling_frequency': [1000],
        'dummy': [1],
    })

    # run
    output = pipeline.predict(readings=data)
    outputs = dict(zip(pipeline.get_output_names(), output))

    # assert
    expected_features = [
        'identity.fft.mean.mean_value', 'identity.fft.rms.rms_value',
        'identity.std.std_value'
    ]
    assert outputs['feature_columns'] == expected_features

    expected_readings = pd.DataFrame({
        'dummy': [1],
        'identity.fft.mean.mean_value': [1.0],
        'identity.fft.rms.rms_value': [5.291503],
        'identity.std.std_value': [1.118034],
    })
    pd.testing.assert_frame_equal(expected_readings, outputs['readings'])
Example 8: Modeler class wrapping an MLBlocks pipeline
class Modeler:
    """A class responsible for executing various Machine Learning Pipelines using MLBlocks."""

    _regression_metrics = {
        'Explained Variance Score': sklearn.metrics.explained_variance_score,
        'Mean Absolute Error': sklearn.metrics.mean_absolute_error,
        'Mean Squared Error': sklearn.metrics.mean_squared_error,
        'Mean Squared Log Error': sklearn.metrics.mean_squared_log_error,
        'Median Absolute Error': sklearn.metrics.median_absolute_error,
        'R2 Score': sklearn.metrics.r2_score
    }

    _classification_metrics = {
        'Accuracy': sklearn.metrics.accuracy_score,
        'F1 Macro': lambda y_true, y_pred: sklearn.metrics.f1_score(
            y_true, y_pred, average="macro"),
        'Precision': lambda y_true, y_pred: sklearn.metrics.precision_score(
            y_true, y_pred, average="macro"),
        'Recall': lambda y_true, y_pred: sklearn.metrics.recall_score(
            y_true, y_pred, average="macro"),
        'Confusion Matrix': sklearn.metrics.confusion_matrix,
    }

    def __init__(self, pipeline, problem_type):
        self._pipeline = MLPipeline(pipeline)
        self._problem_type = problem_type

    @staticmethod
    def train_test_split(X, y, test_size=0.2, shuffle=True):
        """Split the training dataset and the testing dataset.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            test_size (float):
                The proportion of the dataset to include in the test dataset.
            shuffle (bool):
                Whether or not to shuffle the data before splitting.

        Returns:
            list:
                List containing the train-test split of the inputs and targets.
        """
        return train_test_split(X, y, test_size=test_size, shuffle=shuffle)

    @property
    def regression_metrics(self):
        """Supported regression metrics functions.

        Returns:
            dict:
                A dictionary for regression metric functions.
        """
        return self._regression_metrics

    @property
    def classification_metrics(self):
        """Supported classification metrics functions.

        Returns:
            dict:
                A dictionary for classification metric functions.
        """
        return self._classification_metrics

    @property
    def target_metrics(self):
        """Supported metrics functions for the given problem type.

        Returns:
            dict:
                A dictionary for metric functions.
        """
        if self._problem_type == 'classification':
            return self._classification_metrics
        else:
            return self._regression_metrics

    @property
    def pipeline(self):
        """Pipeline.

        Returns:
            MLPipeline:
                The pipeline in the modeler.
        """
        return MLPipeline(self._pipeline)

    def k_fold_validation(self, hyperparameters, X, y, scoring=None):
        """Score the pipeline through k-fold validation with the given scoring function.

        Args:
            hyperparameters (dict or None):
                A dictionary of hyper-parameters for each primitive in the target pipeline.
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            scoring (str):
                The name of the scoring function.

        Returns:
            np.float64:
                The average score in the k-fold validation.
        """
        model_instance = MLPipeline(self._pipeline)
        X = pd.DataFrame(X)
        y = pd.Series(y)

        if hyperparameters:
            model_instance.set_hyperparameters(hyperparameters)

        if self._problem_type == 'regression':
            scorer = self.regression_metrics[scoring or 'R2 Score']
        else:
            scorer = self.classification_metrics[scoring or 'F1 Macro']

        scores = []
        kf = KFold(n_splits=10, random_state=None, shuffle=True)
        for train_index, test_index in kf.split(X):
            model_instance.fit(X.iloc[train_index], y.iloc[train_index])
            y_pred = model_instance.predict(X.iloc[test_index])
            scores.append(scorer(y.iloc[test_index], y_pred))

        return np.mean(scores)

    def tune(self, X, y, max_evals=10, scoring=None, verbose=False):
        """ Tune the pipeline hyper-parameters and select the optimized model.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function.
            verbose (bool):
                Whether to log information during processing.
        """
        tunables = {'0': self._pipeline.get_tunable_hyperparameters(flat=True)}

        session = BTBSession(tunables,
                             lambda _, hyparam: self.k_fold_validation(
                                 hyparam, X=X, y=y, scoring=scoring),
                             max_errors=max_evals,
                             verbose=verbose)

        best_proposal = session.run(max_evals)
        self._pipeline.set_hyperparameters(best_proposal['config'])

    def fit(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False):
        """Fit and select the pipelines.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            tune (bool):
                Whether to optimize hyper-parameters of the pipelines.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function used in the hyper-parameter optimization.
            verbose (bool):
                Whether to log information during processing.
        """
        if tune:
            # tune and select pipeline
            self.tune(X,
                      y,
                      max_evals=max_evals,
                      scoring=scoring,
                      verbose=verbose)

        # fit pipeline
        self._pipeline.fit(X, y)

    def predict(self, X):
        """Predict the input data

        Args:
            X (pandas.DataFrame or ndarray):
                Testing data, inputs to the pipeline.

        Returns:
            pandas.Series or ndarray:
                Predictions to the input data.
        """
        return self._pipeline.predict(X)

    def test(self, X, y, scoring=None):
        """Test the trained pipeline.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            scoring (str):
                The name of the scoring function.

        Returns:
            float:
                The score of the trained pipeline on the inputs.
        """
        if self._problem_type == 'regression':
            scorer = self.regression_metrics[scoring or 'R2 Score']
        else:
            scorer = self.classification_metrics[scoring or 'F1 Macro']
        return scorer(y, self.predict(X))

    def fit_predict(self,
                    X,
                    y,
                    tune=False,
                    max_evals=10,
                    scoring=None,
                    verbose=False):
        """Fit the pipeline and make predictions

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            tune (bool):
                Whether to optimize hyper-parameters of the pipelines.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function used in the hyper-parameter optimization.
            verbose(bool):
                Whether to log information during processing.

        Returns:
            pandas.Series or ndarray:
                Predictions to the input data.
        """
        self.fit(X,
                 y,
                 tune=tune,
                 max_evals=max_evals,
                 scoring=scoring,
                 verbose=verbose)
        return self.predict(X)

    def evaluate(self,
                 X,
                 y,
                 test_size=0.2,
                 shuffle=True,
                 tune=False,
                 max_evals=10,
                 scoring=None,
                 metrics=None,
                 verbose=False):
        """Evaluate the pipelines.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            test_size (float):
                The proportion of the dataset to include in the test dataset.
            shuffle (bool):
                Whether or not to shuffle the data before splitting.
            tune (bool):
                Whether to optimize hyper-parameters of the pipelines.
            max_evals (int):
                Maximum number of hyper-parameter optimization iterations.
            scoring (str):
                The name of the scoring function used in the hyper-parameter optimization.
            metrics (list):
                A list of scoring function names. The scoring functions should be consistent
                with the problem type.
            verbose (bool):
                Whether to log information during processing.
        """
        X_train, X_test, y_train, y_test = self.train_test_split(
            X, y, test_size=test_size, shuffle=shuffle)
        metrics = metrics or self.target_metrics.keys()

        scores = {}
        self.fit(X_train,
                 y_train,
                 tune=tune,
                 max_evals=max_evals,
                 scoring=scoring,
                 verbose=verbose)
        for metric in metrics:
            scores[metric] = self.test(X_test, y_test, scoring=metric)
        return scores

    def save(self, path):
        """Save the object in a pickle file.

        Args:
            path (str): The path to store the modeler.
        """
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    @staticmethod
    def load(path):
        """Load a Modeler object from a pickle file

        Args:
            path (str): The path to load the modeler.

        Returns:
            Modeler:
                A Modeler instance.
        """
        with open(path, 'rb') as pickle_file:
            obj = pickle.load(pickle_file)
        if not isinstance(obj, Modeler):
            raise ValueError('Serialized object is not a Modeler instance')
        return obj
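
A hedged usage sketch tying the ``Modeler`` methods together. The pipeline specification is illustrative and assumes the corresponding MLPrimitives annotation is available:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
modeler = Modeler(['sklearn.ensemble.RandomForestClassifier'], 'classification')

# Hold out 20% of the data, fit (optionally tuning first) and report several metrics.
scores = modeler.evaluate(X, y, test_size=0.2, tune=False,
                          metrics=['Accuracy', 'F1 Macro'])
print(scores)

# Persist and restore the whole Modeler.
modeler.save('results/modeler.pkl')
restored = Modeler.load('results/modeler.pkl')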
Example 9: GreenGuardPipeline with a GP tuner and a single template
class GreenGuardPipeline(object):
    """Main Machine Learning component in the GreenGuard project.

    The ``GreenGuardPipeline`` represents the abstraction of a Machine
    Learning pipeline architecture specialized on the GreenGuard data
    format.

    In order to use it, an MLBlocks pipeline template needs to be given,
    alongside information about how to evaluate its performance using
    cross validation.

    Attributes:
        template (MLPipeline):
            MLPipeline instance used as the template for tuning.
        template_name:
            Name of the template being used.
        fitted (bool):
            Whether this GreenGuardPipeline has already been fitted or not.
        steps (list):
            List of primitives that compose this template.
        preprocessing (list):
            List of preprocessing steps. These steps have no learning stage
            and are executed only once on the complete training dataset, before
            partitioning it for cross validation.
        static (list):
            List of static steps. These are all the steps in the pipeline that
            come after the preprocessing ones but have no hyperparameters.
            These are executed on each cross validation split only once, when
            the data is partitioned, and their output is cached to be reused
            later on at every tuning iteration.
        tunable (list):
            List of steps that have hyperparameters and will be tuned during
            the tuning loop.

    Args:
        template (str or MLPipeline):
            Template to use. If a ``str`` is given, load the corresponding
            ``MLPipeline``.
        metric (str or function):
            Metric to use. If a ``str`` is given, it must be one of the metrics
            defined in the ``greenguard.metrics.METRICS`` dictionary.
        cost (bool):
            Whether the metric is a cost function (the lower the better) or not.
            Defaults to ``False``.
        init_params (dict):
            Initial parameters to pass to the underlying MLPipeline if something
            other than the defaults need to be used.
            Defaults to ``None``.
        stratify (bool):
            Whether to stratify the data when partitioning for cross validation.
            Defaults to ``True``.
        cv_splits (int):
            Number of cross validation folds to use. Defaults to ``5``.
        shuffle (bool):
            Whether to shuffle the data when partitioning for cross validation.
            Defaults to ``True``.
        random_state (int or RandomState):
            random state to use for the cross validation partitioning.
            Defaults to ``0``.
        preprocessing (int):
            Number of steps to execute during the preprocessing stage.
            The number of preprocessing steps cannot be higher than the
            number of static steps in the given template.
            Defaults to ``0``.
    """

    template = None
    template_name = None
    fitted = False
    cv_score = None

    _cv_class = None
    _metric = None
    _cost = False
    _tuner = None
    _pipeline = None
    _splits = None
    _static = None

    def _get_cv(self, stratify, cv_splits, shuffle, random_state):
        if stratify:
            cv_class = StratifiedKFold
        else:
            cv_class = KFold

        return cv_class(n_splits=cv_splits,
                        shuffle=shuffle,
                        random_state=random_state)

    def _count_static_steps(self):
        tunable_hyperparams = self._pipeline.get_tunable_hyperparameters()
        for index, block_name in enumerate(self._pipeline.blocks.keys()):
            if tunable_hyperparams[block_name]:
                return index

        return 0

    def _build_pipeline(self):
        self._pipeline = MLPipeline(self.template)
        if self._hyperparameters:
            self._pipeline.set_hyperparameters(self._hyperparameters)

        self.fitted = False

    @staticmethod
    def _update_params(old, new):
        for name, params in new.items():
            if '#' not in name:
                name = name + '#1'

            block_params = old.setdefault(name, dict())
            for param, value in params.items():
                block_params[param] = value

    def set_init_params(self, init_params):
        """Set new init params for the template and pipeline.

        Args:
            init_params (dict):
                New init_params to use.
        """
        template_params = self.template['init_params']
        self._update_params(template_params, init_params)
        self._build_pipeline()

    def __init__(self,
                 template,
                 metric='accuracy',
                 cost=False,
                 init_params=None,
                 stratify=True,
                 cv_splits=5,
                 shuffle=True,
                 random_state=0,
                 preprocessing=0):

        self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state)

        if isinstance(metric, str):
            metric, cost = METRICS[metric]

        self._metric = metric
        self._cost = cost

        if isinstance(template, str):
            self.template_name = template
            self.template = load_pipeline(template)
        else:
            self.template = template

        # Make sure to have block number in all init_params names
        template_params = self.template.setdefault('init_params', dict())
        for name, params in list(template_params.items()):
            if '#' not in name:
                template_params[name + '#1'] = template_params.pop(name)

        self._hyperparameters = dict()
        if init_params:
            self.set_init_params(init_params)
        else:
            self._build_pipeline()

        self._static = self._count_static_steps()
        self._preprocessing = preprocessing

        self.steps = self._pipeline.primitives.copy()
        self.preprocessing = self.steps[:self._preprocessing]
        self.static = self.steps[self._preprocessing:self._static]
        self.tunable = self.steps[self._static:]

        if self._preprocessing and (self._preprocessing > self._static):
            raise ValueError('Preprocessing cannot be bigger than static')

    def __repr__(self):
        return ("GreenGuardPipeline({})\n"
                "  preprocessing:\n{}\n"
                "  static:\n{}\n"
                "  tunable:\n{}\n").format(
                    self.template_name,
                    '\n'.join('    {}'.format(step)
                              for step in self.preprocessing),
                    '\n'.join('    {}'.format(step) for step in self.static),
                    '\n'.join('    {}'.format(step) for step in self.tunable),
                )

    def get_hyperparameters(self):
        """Get the current hyperparameters.

        Returns:
            dict:
                Current hyperparameters.
        """
        return deepcopy(self._hyperparameters)

    def set_hyperparameters(self, hyperparameters):
        """Set new hyperparameters for this pipeline instance.

        The template ``init_params`` remain unmodified.

        Args:
            hyperparameters (dict):
                New hyperparameters to use.
        """
        self._update_params(self._hyperparameters, hyperparameters)
        self._build_pipeline()

    @staticmethod
    def _clone_pipeline(pipeline):
        return MLPipeline.from_dict(pipeline.to_dict())

    def _is_better(self, score):
        if self._cost:
            return score < self.cv_score

        return score > self.cv_score

    def _generate_splits(self, X, y, readings, turbines=None):
        if self._preprocessing:
            pipeline = MLPipeline(self.template)
            LOGGER.debug('Running %s preprocessing steps', self._preprocessing)
            context = pipeline.fit(X=X,
                                   y=y,
                                   readings=readings,
                                   turbines=turbines,
                                   output_=self._preprocessing - 1)
            del context['X']
            del context['y']
        else:
            context = {
                'readings': readings,
                'turbines': turbines,
            }

        splits = list()
        for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
            LOGGER.debug('Running static steps for fold %s', fold)
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = MLPipeline(self.template)
            fit = pipeline.fit(X_train,
                               y_train,
                               output_=self._static - 1,
                               start_=self._preprocessing,
                               **context)
            predict = pipeline.predict(X_test,
                                       output_=self._static - 1,
                                       start_=self._preprocessing,
                                       **context)

            splits.append((fold, pipeline, fit, predict, y_test))

        return splits

    def cross_validate(self,
                       X=None,
                       y=None,
                       readings=None,
                       turbines=None,
                       params=None):
        """Compute cross validation score using the given data.

        If the splits have not been previously computed, compute them now.
        During this computation, the data is partitioned using the indicated
        cross validation parameters and later on processed using the
        pipeline static steps.

        The results of the fit and produce executions are cached and reused
        in subsequent calls to this method.

        Args:
            X (pandas.DataFrame):
                ``target_times`` data, without the ``target`` column.
                Only needed if the splits have not been previously computed.
            y (pandas.Series or numpy.ndarray):
                ``target`` vector corresponding to the passed ``target_times``.
                Only needed if the splits have not been previously computed.
            readings (pandas.DataFrame):
                ``readings`` table. Only needed if the splits have not been
                previously computed.
            turbines (pandas.DataFrame):
                ``turbines`` table. Only needed if the splits have not been
                previously computed.
            params (dict):
                hyperparameter values to use.

        Returns:
            float:
                Computed cross validation score. This score is the average
                of the scores obtained across all the cross validation folds.
        """

        if self._splits is None:
            LOGGER.info('Running static steps before cross validation')
            self._splits = self._generate_splits(X, y, readings, turbines)

        scores = []
        for fold, pipeline, fit, predict, y_test in self._splits:
            LOGGER.debug('Scoring fold %s', fold)

            if params:
                pipeline.set_hyperparameters(params)
            else:
                pipeline.set_hyperparameters(
                    self._pipeline.get_hyperparameters())

            pipeline.fit(start_=self._static, **fit)
            predictions = pipeline.predict(start_=self._static, **predict)

            score = self._metric(y_test, predictions)

            LOGGER.debug('Fold %s score: %s', fold, score)
            scores.append(score)

        cv_score = np.mean(scores)
        if self.cv_score is None:
            self.cv_score = cv_score

        return cv_score

    def _to_dicts(self, hyperparameters):
        params_tree = defaultdict(dict)
        for (block, hyperparameter), value in hyperparameters.items():
            if isinstance(value, np.integer):
                value = int(value)

            elif isinstance(value, np.floating):
                value = float(value)

            elif isinstance(value, np.ndarray):
                value = value.tolist()

            elif value == 'None':
                value = None

            params_tree[block][hyperparameter] = value

        return params_tree

    def _to_tuples(self, params_tree, tunable_keys):
        param_tuples = defaultdict(dict)
        for block_name, params in params_tree.items():
            for param, value in params.items():
                key = (block_name, param)
                if key in tunable_keys:
                    param_tuples[key] = 'None' if value is None else value

        return param_tuples

    def _get_tunables(self):
        tunables = []
        tunable_keys = []
        tunable_hyperparameters = self._pipeline.get_tunable_hyperparameters()
        for block_name, params in tunable_hyperparameters.items():
            for param_name, param_details in params.items():
                key = (block_name, param_name)
                param_type = param_details['type']
                param_type = 'string' if param_type == 'str' else param_type

                if param_type == 'bool':
                    param_range = [True, False]
                else:
                    param_range = param_details.get(
                        'range') or param_details.get('values')

                value = HyperParameter(param_type, param_range)
                tunables.append((key, value))
                tunable_keys.append(key)

        return tunables, tunable_keys

    def _get_tuner(self):
        tunables, tunable_keys = self._get_tunables()
        tuner = GP(tunables)

        # Inform the tuner about the score that the default hyperparameters obtained
        param_tuples = self._to_tuples(self._pipeline.get_hyperparameters(),
                                       tunable_keys)
        tuner.add(param_tuples, self.cv_score)

        return tuner

    def tune(self,
             target_times=None,
             readings=None,
             turbines=None,
             iterations=10):
        """Tune this pipeline for the indicated number of iterations.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
                Only needed if the splits have not been previously computed.
            readings (pandas.DataFrame):
                ``readings`` table. Only needed if the splits have not been
                previously computed.
            turbines (pandas.DataFrame):
                ``turbines`` table. Only needed if the splits have not been
                previously computed.
            iterations (int):
                Number of iterations to perform.
        """
        if not self._tuner:
            LOGGER.info('Scoring the default pipeline')
            X = target_times[['turbine_id', 'cutoff_time']]
            y = target_times['target']
            self.cv_score = self.cross_validate(X, y, readings, turbines)

            LOGGER.info('Default Pipeline score: %s', self.cv_score)

            self._tuner = self._get_tuner()

        for i in range(iterations):
            LOGGER.info('Scoring pipeline %s', i + 1)

            params = self._tuner.propose(1)
            param_dicts = self._to_dicts(params)

            try:
                score = self.cross_validate(params=param_dicts)

                LOGGER.info('Pipeline %s score: %s', i + 1, score)

                if self._is_better(score):
                    self.cv_score = score
                    self.set_hyperparameters(param_dicts)

                self._tuner.add(params, score)

            except Exception:
                failed = '\n'.join('{}: {}'.format(k, v)
                                   for k, v in params.items())
                LOGGER.exception(
                    "Caught an exception scoring pipeline %s with params:\n%s",
                    i + 1, failed)
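
# Hedged usage sketch for ``tune``. The ``load_demo`` helper, the import path,
# the template name and the 'f1' metric key are assumptions not shown in this
# snippet; the constructor is assumed to match the one in the listing below.
from greenguard.demo import load_demo
from greenguard.pipeline import GreenGuardPipeline

target_times, readings = load_demo()
pipeline = GreenGuardPipeline('resample_unstack_dfs_xgb_classifier', 'f1')

pipeline.tune(target_times, readings, iterations=5)
print(pipeline.cv_score)                 # best cross validation score found
print(pipeline.get_hyperparameters())    # hyperparameters that achieved it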

    def fit(self, target_times, readings, turbines=None):
        """Fit this pipeline to the given data.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.
        """
        X = target_times[['turbine_id', 'cutoff_time']]
        y = target_times['target']
        self._pipeline.fit(X, y, readings=readings, turbines=turbines)
        self.fitted = True

    def predict(self, target_times, readings, turbines=None):
        """Make predictions using this pipeline.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
                and ``target`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.

        Returns:
            numpy.ndarray:
                Vector of predictions.
        """
        if not self.fitted:
            raise NotFittedError()

        X = target_times[['turbine_id', 'cutoff_time']]
        return self._pipeline.predict(X, readings=readings, turbines=turbines)
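
# Hedged usage sketch for ``fit`` / ``predict``, continuing the tuning sketch
# above: ``target_times`` carries the ``turbine_id``, ``cutoff_time`` and
# ``target`` columns described in the docstrings, and ``readings`` is the raw
# sensor table.
train = target_times.sample(frac=0.75, random_state=0)
test = target_times.drop(train.index)

pipeline.fit(train, readings)
predictions = pipeline.predict(test, readings)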

    def save(self, path):
        """Serialize and save this pipeline using cloudpickle.

        Args:
            path (str):
                Path to the file where the pipeline will be saved.
        """
        with open(path, 'wb') as pickle_file:
            cloudpickle.dump(self, pickle_file)

    @classmethod
    def load(cls, path):
        """Load a previously saved pipeline from a file.

        Args:
            path (str):
                Path to the file where the pipeline is saved.

        Returns:
            GreenGuardPipeline:
                Loaded GreenGuardPipeline instance.
        """
        with open(path, 'rb') as pickle_file:
            return cloudpickle.load(pickle_file)
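
# Hedged round-trip sketch for ``save`` / ``load``, continuing the sketches
# above: cloudpickle serializes the whole object, so the file can be reloaded
# later (ideally with the same library versions) and used without refitting.
pipeline.save('pipeline.pkl')

loaded = GreenGuardPipeline.load('pipeline.pkl')
new_predictions = loaded.predict(test, readings)
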
Esempio n. 10
class GreenGuardPipeline(object):

    template = None
    template_name = None
    fitted = False
    cv_score = None

    _cv_class = None
    _metric = None
    _cost = False
    _tuner = None
    _pipeline = None
    _splits = None
    _static = None

    def _get_cv(self, stratify, cv_splits, shuffle, random_state):
        if stratify:
            cv_class = StratifiedKFold
        else:
            cv_class = KFold

        return cv_class(n_splits=cv_splits,
                        shuffle=shuffle,
                        random_state=random_state)

    def _count_static_steps(self):
        tunable_hyperparams = self._pipeline.get_tunable_hyperparameters()
        for index, block_name in enumerate(self._pipeline.blocks.keys()):
            if tunable_hyperparams[block_name]:
                return index

        return 0
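
    # Illustration with a hypothetical template: if the pipeline blocks are
    #   0: pandas.DataFrame.resample#1    (no tunable hyperparameters)
    #   1: featuretools.dfs#1             (no tunable hyperparameters)
    #   2: xgboost.XGBClassifier#1        (tunable hyperparameters)
    # this method returns 2, so blocks 0 and 1 are treated as static steps that
    # only need to run once per fold, instead of once per tuning iteration.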

    def _build_pipeline(self):
        self._pipeline = MLPipeline(self.template)
        if self._hyperparameters:
            self._pipeline.set_hyperparameters(self._hyperparameters)

        self.fitted = False

    @staticmethod
    def _update_params(old, new):
        for name, params in new.items():
            if '#' not in name:
                name = name + '#1'

            block_params = old.setdefault(name, dict())
            for param, value in params.items():
                block_params[param] = value

    def set_init_params(self, init_params):
        template_params = self.template['init_params']
        self._update_params(template_params, init_params)
        self._build_pipeline()
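
    # Illustration: ``set_init_params({'xgboost.XGBClassifier': {'n_estimators': 150}})``
    # (hypothetical block name) is stored by ``_update_params`` under the
    # block-numbered key ``'xgboost.XGBClassifier#1'`` and the pipeline is rebuilt.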

    def __init__(self,
                 template,
                 metric,
                 cost=False,
                 init_params=None,
                 stratify=True,
                 cv_splits=5,
                 shuffle=True,
                 random_state=0,
                 preprocessing=0):

        self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state)

        if isinstance(metric, str):
            metric, cost = METRICS[metric]

        self._metric = metric
        self._cost = cost

        if isinstance(template, str):
            self.template_name = template
            self.template = load_pipeline(template)
        else:
            self.template = template

        # Make sure all init_params names include the block number
        template_params = self.template.setdefault('init_params', dict())
        for name, params in list(template_params.items()):
            if '#' not in name:
                template_params[name + '#1'] = template_params.pop(name)

        self._hyperparameters = dict()
        if init_params:
            self.set_init_params(init_params)
        else:
            self._build_pipeline()

        self._static = self._count_static_steps()
        self._preprocessing = preprocessing

        self.steps = self._pipeline.primitives.copy()
        self.preprocessing = self.steps[:self._preprocessing]
        self.static = self.steps[self._preprocessing:self._static]
        self.tunable = self.steps[self._static:]

        if self._preprocessing and (self._preprocessing > self._static):
            raise ValueError('Preprocessing steps cannot outnumber the static steps')
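
# Hedged construction sketch for this class. The template name, the 'f1'
# metric key and the block name inside ``init_params`` are hypothetical.
pipeline = GreenGuardPipeline(
    'resample_unstack_dfs_xgb_classifier',
    'f1',
    init_params={'xgboost.XGBClassifier': {'n_estimators': 100}},
    cv_splits=3,
    preprocessing=0,
)
print(pipeline)    # __repr__ lists the preprocessing, static and tunable steps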

    def __repr__(self):
        return ("GreenGuardPipeline({})\n"
                "  preprocessing:\n{}\n"
                "  static:\n{}\n"
                "  tunable:\n{}\n").format(
                    self.template_name,
                    '\n'.join('    {}'.format(step)
                              for step in self.preprocessing),
                    '\n'.join('    {}'.format(step) for step in self.static),
                    '\n'.join('    {}'.format(step) for step in self.tunable),
                )

    def get_hyperparameters(self):
        return deepcopy(self._hyperparameters)

    def set_hyperparameters(self, hyperparameters):
        self._update_params(self._hyperparameters, hyperparameters)
        self._build_pipeline()

    @staticmethod
    def _clone_pipeline(pipeline):
        return MLPipeline.from_dict(pipeline.to_dict())

    def _is_better(self, score):
        if self._cost:
            return score < self.cv_score

        return score > self.cv_score

    def _generate_splits(self, X, y, readings):
        if self._preprocessing:
            pipeline = MLPipeline(self.template)
            LOGGER.debug('Running %s preprocessing steps', self._preprocessing)
            context = pipeline.fit(X=X,
                                   y=y,
                                   readings=readings,
                                   output_=self._preprocessing - 1)
            del context['X']
            del context['y']
        else:
            context = {'readings': readings}

        splits = list()
        for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
            LOGGER.debug('Running static steps for fold %s', fold)
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = MLPipeline(self.template)
            fit = pipeline.fit(X_train,
                               y_train,
                               output_=self._static - 1,
                               start_=self._preprocessing,
                               **context)
            predict = pipeline.predict(X_test,
                                       output_=self._static - 1,
                                       start_=self._preprocessing,
                                       **context)

            splits.append((fold, pipeline, fit, predict, y_test))

        return splits
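
    # Each split is the tuple ``(fold, pipeline, fit_context, predict_context, y_test)``,
    # so the preprocessing and static steps run only once per fold;
    # ``cross_validate`` later resumes each fold's pipeline from
    # ``start_=self._static`` with the candidate hyperparameters.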

    def cross_validate(self, X=None, y=None, readings=None, params=None):
        if self._splits is None:
            LOGGER.info('Running static steps before cross validation')
            self._splits = self._generate_splits(X, y, readings)

        scores = []
        for fold, pipeline, fit, predict, y_test in self._splits:
            LOGGER.debug('Scoring fold %s', fold)

            if params:
                pipeline.set_hyperparameters(params)
            else:
                pipeline.set_hyperparameters(
                    self._pipeline.get_hyperparameters())

            pipeline.fit(start_=self._static, **fit)
            predictions = pipeline.predict(start_=self._static, **predict)

            score = self._metric(y_test, predictions)

            LOGGER.debug('Fold %s score: %s', fold, score)
            scores.append(score)

        cv_score = np.mean(scores)
        if self.cv_score is None:
            self.cv_score = cv_score

        return cv_score

    def _to_dicts(self, hyperparameters):
        params_tree = defaultdict(dict)
        for (block, hyperparameter), value in hyperparameters.items():
            if isinstance(value, np.integer):
                value = int(value)

            elif isinstance(value, np.floating):
                value = float(value)

            elif isinstance(value, np.ndarray):
                value = value.tolist()

            elif value == 'None':
                value = None

            params_tree[block][hyperparameter] = value

        return params_tree

    def _to_tuples(self, params_tree, tunable_keys):
        param_tuples = defaultdict(dict)
        for block_name, params in params_tree.items():
            for param, value in params.items():
                key = (block_name, param)
                if key in tunable_keys:
                    param_tuples[key] = 'None' if value is None else value

        return param_tuples

    def _get_tunables(self):
        tunables = []
        tunable_keys = []
        tunable_hyperparameters = self._pipeline.get_tunable_hyperparameters()
        for block_name, params in tunable_hyperparameters.items():
            for param_name, param_details in params.items():
                key = (block_name, param_name)
                param_type = param_details['type']
                param_type = 'string' if param_type == 'str' else param_type

                if param_type == 'bool':
                    param_range = [True, False]
                else:
                    param_range = param_details.get(
                        'range') or param_details.get('values')

                value = HyperParameter(param_type, param_range)
                tunables.append((key, value))
                tunable_keys.append(key)

        return tunables, tunable_keys

    def _get_tuner(self):
        tunables, tunable_keys = self._get_tunables()
        tuner = GP(tunables)

        # Inform the tuner about the score obtained by the default hyperparameters
        param_tuples = self._to_tuples(self._pipeline.get_hyperparameters(),
                                       tunable_keys)
        tuner.add(param_tuples, self.cv_score)

        return tuner

    def tune(self, X=None, y=None, readings=None, iterations=10):
        if not self._tuner:
            LOGGER.info('Scoring the default pipeline')
            self.cv_score = self.cross_validate(X, y, readings)

            LOGGER.info('Default Pipeline score: %s', self.cv_score)

            self._tuner = self._get_tuner()

        for i in range(iterations):
            LOGGER.info('Scoring pipeline %s', i + 1)

            params = self._tuner.propose(1)
            param_dicts = self._to_dicts(params)

            try:
                score = self.cross_validate(params=param_dicts)

                LOGGER.info('Pipeline %s score: %s', i + 1, score)

                if self._is_better(score):
                    self.cv_score = score
                    self.set_hyperparameters(param_dicts)

                self._tuner.add(params, score)

            except Exception:
                failed = '\n'.join('{}: {}'.format(k, v)
                                   for k, v in params.items())
                LOGGER.exception(
                    "Caught an exception scoring pipeline %s with params:\n%s",
                    i + 1, failed)

    def fit(self, X, y, readings):
        self._pipeline.fit(X, y, readings=readings)
        self.fitted = True

    def predict(self, X, readings):
        if not self.fitted:
            raise NotFittedError()

        return self._pipeline.predict(X, readings=readings)

    def save(self, path):
        with open(path, 'wb') as pickle_file:
            cloudpickle.dump(self, pickle_file)

    @classmethod
    def load(cls, path):
        with open(path, 'rb') as pickle_file:
            return cloudpickle.load(pickle_file)