    def _resample(self,
                  data_x: pd.DataFrame,
                  data_y: np.ndarray,
                  hyper_params: HyperParamsBase = None) -> ResamplerResults:

        result_evaluators = list()
        training_indexes, test_indexes = self._stratified_splitter.split_monte_carlo(target_values=data_y,
                                                                                     samples=self._repeats,
                                                                                     seed=42)
        for train_ind, test_ind in zip(training_indexes, test_indexes):
            train_x_not_transformed, holdout_x_not_transformed = data_x[train_ind], data_x[test_ind]
            train_y, test_y = data_y[train_ind], data_y[test_ind]

            pipeline = TransformerPipeline(transformations=self._transformations)
            train_x_transformed = pipeline.fit_transform(data_x=train_x_not_transformed)
            holdout_x_transformed = pipeline.transform(data_x=holdout_x_not_transformed)

            if self._train_callback is not None:
                self._train_callback(train_x_transformed, data_y, hyper_params)

            model_copy = self._model.clone()  # need to reuse this object type for each fold/repeat
            # train on the *transformed* training data; predictions below are made on the transformed
            # holdout data, so the model must be fit on the same feature space
            model_copy.train(data_x=train_x_transformed, data_y=train_y, hyper_params=hyper_params)

            # for each evaluator, add the metric name/value to a dict to add to the ResamplerResults
            fold_evaluators = list()
            for evaluator in self._scores:
                evaluator_copy = evaluator.clone()  # need to reuse this object type for each fold/repeat
                evaluator_copy.calculate(actual_values=test_y,
                                         predicted_values=model_copy.predict(data_x=holdout_x_transformed))  # noqa
                fold_evaluators.append(evaluator_copy)
            result_evaluators.append(fold_evaluators)

        return ResamplerResults(scores=result_evaluators, decorators=None)
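
# The loop above follows a common resampling pattern: repeated stratified train/holdout splits, with the
# transformations fit on the training fold only and merely applied to the holdout fold. Below is a minimal,
# standalone sketch of that same pattern using scikit-learn (not this library's API); the dataset and the
# choice of scaler/model are illustrative assumptions.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)  # 5 "repeats"

scores = []
for train_ind, holdout_ind in splitter.split(X, y):
    train_x, holdout_x = X.iloc[train_ind], X.iloc[holdout_ind]
    train_y, holdout_y = y.iloc[train_ind], y.iloc[holdout_ind]

    scaler = StandardScaler()                             # "transformation" fit on the training fold only
    train_x_transformed = scaler.fit_transform(train_x)
    holdout_x_transformed = scaler.transform(holdout_x)   # transform (don't re-fit) the holdout fold

    model = LogisticRegression(max_iter=5000).fit(train_x_transformed, train_y)
    scores.append(roc_auc_score(holdout_y, model.predict_proba(holdout_x_transformed)[:, 1]))

print(np.mean(scores), np.std(scores))
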
    def hierarchical_dendogram_plot(
            data: pd.DataFrame,
            transformations: List[TransformerBase] = None,
            linkage: ClusteringHierarchicalLinkage = ClusteringHierarchicalLinkage.WARD,
            color_threshold=None,
            figure_size: tuple = (22, 18),
    ):
        """

        :param data: dataset to cluster on
        :param transformations: transformations to apply before clustering
        :param linkage: the type of clustering to apply
        :param color_threshold: the value on the y-axis at which to apply the 'horizontal cutoff' for
            coloring the clusters. You'll likely want to pass `None` first, view the resulting dendrogram,
            and then choose a more appropriate value.

            For a more precise description, see
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html
        :param figure_size: size of the figure
        """
        transformed_data = TransformerPipeline(
            transformations=transformations).fit_transform(data)
        # Specify the linkage type. Scipy accepts 'ward', 'complete', 'average', as well as other values
        linkage_matrix = hierarchy.linkage(transformed_data.values,
                                           linkage.value)
        plt.figure(figsize=figure_size)
        hierarchy.dendrogram(linkage_matrix, color_threshold=color_threshold)
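
# A minimal, standalone illustration of the same scipy calls (`hierarchy.linkage` / `hierarchy.dendrogram`)
# and of how `color_threshold` acts as the horizontal cutoff; the random data and the threshold value are
# illustrative assumptions, not part of this library.
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster import hierarchy

rng = np.random.RandomState(42)
points = np.vstack([rng.normal(0, 1, size=(20, 2)),     # two loose clusters of 2-D points
                    rng.normal(5, 1, size=(20, 2))])

linkage_matrix = hierarchy.linkage(points, 'ward')       # same 'ward' default as above
plt.figure(figsize=(10, 6))
# links merged below y=10 keep their cluster color; links above it are drawn in the default color
hierarchy.dendrogram(linkage_matrix, color_threshold=10)
plt.show()
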
def train_aggregator(args):
    model_info = args[0]
    data_x_local = args[1]
    data_y_local = args[2]

    if model_info.transformations:
        # ensure none of the Transformers have been used.
        assert all([x.state is None for x in model_info.transformations])

    # the fitted Pipeline is returned so it can be cached (one per base model) and reused in `predict()`
    pipeline = TransformerPipeline(model_info.transformations)
    # fit/transform with current pipeline
    transformed_data_x_local = pipeline.fit_transform(data_x=data_x_local)
    model_info.model.train(data_x=transformed_data_x_local,
                           data_y=data_y_local,
                           hyper_params=model_info.hyper_params)
    return model_info, pipeline
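
# Usage note (illustrative; the names below are assumptions): `train_aggregator` is written to be mapped over
# one (model_info, data_x, data_y) tuple per base model, e.g.
#     results = list(map(train_aggregator, [(info, data_x, data_y) for info in base_model_infos]))
# or via `multiprocessing.Pool.map` for parallel training; each result pairs the trained model with the
# pipeline that was fit on the training data, so the same (fitted) transformations can be reused at predict time.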
    def search(self, data: pd.DataFrame):
        data = data.copy()
        if self._global_transformations is not None:
            # noinspection PyTypeChecker
            data = TransformerPipeline(transformations=self._global_transformations).\
                fit_transform(data_x=data)

        searcher_results = []
        for model_index in range(len(self._models)):
            local_model = self._models[model_index]
            local_model_description = self._model_descriptions[model_index]
            local_transformations = self._model_transformations[model_index]
            local_hyper_params_object = self._model_hyper_params_objects[model_index]
            local_hyper_params_grid = self._model_hyper_params_grids[model_index]

            # could pass transformations to the fitter, but since we are iterating, let's just transform once
            # per model; use a local copy so one model's transformations don't leak into the next model's data
            local_data = data
            if local_transformations is not None:
                local_data = TransformerPipeline(transformations=[x.clone() for x in local_transformations]).\
                    fit_transform(data_x=local_data)

            score_results = []
            resulting_num_clusters = []
            params_grid = local_hyper_params_grid.params_grid
            for params_index in range(len(params_grid)):
                trainer = ModelTrainer(model=local_model.clone(),
                                       scores=[x.clone() for x in self._scores])
                # get current hyper parameter combination
                hyper_params = local_hyper_params_object.clone()
                hyper_params.update_dict(params_grid.iloc[params_index].to_dict())
                clusters = trainer.train_predict_eval(data=local_data, hyper_params=hyper_params)
                score_results.append(trainer.training_scores)  # appending list to list
                resulting_num_clusters.append(len(np.unique(clusters)))  # get number of unique clusters

            # noinspection PyTypeChecker
            searcher_results.append(ClusteringSearcherResult(model_description=local_model_description,
                                                             hyper_params_grid=params_grid,
                                                             scores=score_results,
                                                             resulting_num_clusters=resulting_num_clusters))

        return ClusteringSearcherResults(searcher_results=searcher_results)
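
# A standalone sketch of the same idea -- iterate a hyper-parameter grid, fit a clustering model per
# combination, and record a score plus the number of resulting clusters -- using scikit-learn rather than
# this library's Searcher/Trainer classes (the data and the grid are illustrative assumptions).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid

X, _ = make_blobs(n_samples=300, centers=4, random_state=42)
params_grid = ParameterGrid({'n_clusters': [2, 3, 4, 5, 6]})

score_results, resulting_num_clusters = [], []
for params in params_grid:
    clusters = KMeans(n_init=10, random_state=42, **params).fit_predict(X)
    score_results.append(silhouette_score(X, clusters))
    resulting_num_clusters.append(len(np.unique(clusters)))

best_index = int(np.argmax(score_results))
print(list(params_grid)[best_index], score_results[best_index])
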
    def get_final_datasets(data, target_variable, splitter, transformations):

        # if we have a splitter, split into training and holdout, else just do transformations on all data
        if splitter:
            training_indexes, holdout_indexes = splitter.split(
                target_values=data[target_variable])
        else:
            training_indexes, holdout_indexes = range(len(data)), []

        training_y = data.iloc[training_indexes][target_variable]
        training_x = data.iloc[training_indexes].drop(columns=target_variable)

        holdout_y = data.iloc[holdout_indexes][target_variable]
        holdout_x = data.iloc[holdout_indexes].drop(columns=target_variable)

        # transform on training data
        if transformations is not None:
            # before we train on the data, we actually want to 'snoop' at what the expected columns will be
            # with ALL the data. The reason is that if we do some sort of dummy encoding, but not all the
            # categories are included in the training set (i.e. maybe only a small number of observations
            # have the categorical value), then we can still ensure that we will be giving the same expected
            # columns/encodings to the predict method with the holdout set.
            # noinspection PyTypeChecker
            expected_columns = TransformerPipeline.get_expected_columns(
                data=data.drop(columns=target_variable),  # noqa
                transformations=transformations)
            transformer = StatelessTransformer(
                custom_function=lambda x_df: x_df.reindex(
                    columns=expected_columns,  # noqa
                    fill_value=0))
            transformations = transformations + [transformer]

        pipeline = TransformerPipeline(transformations=transformations)
        # before we fit the data, we actually want to 'peek' at what the expected columns will be with
        # ALL the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not all
        # of the categories are included in the training set (i.e. maybe only a small number of
        # observations have the categorical value), then we can still ensure that we will be giving the
        # same expected columns/encodings to the `predict` method with the holdout set.

        # peek at all the data (except for the target variable, of course)
        # noinspection PyTypeChecker
        pipeline.peak(data_x=data.drop(columns=target_variable))

        # fit on only the train data-set (and also transform)
        transformed_training_x = pipeline.fit_transform(training_x)

        if holdout_indexes:
            transformed_holdout_x = pipeline.transform(holdout_x)
        else:
            transformed_holdout_x = holdout_x

        return transformed_training_x, training_y, transformed_holdout_x, holdout_y, pipeline
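
# The 'peek at ALL the data' comments above describe a dummy-encoding pitfall: if a categorical level never
# appears in the training split, the holdout set would produce columns the model has never seen (or miss
# columns it expects). A minimal, standalone pandas sketch of the same fix (compute the expected columns from
# the full dataset, then reindex each split); the toy data is an illustrative assumption.
import pandas as pd

data = pd.DataFrame({'color': ['red', 'red', 'blue', 'green'], 'value': [1, 2, 3, 4]})
train, holdout = data.iloc[:2], data.iloc[2:]           # 'blue'/'green' never appear in the training split

expected_columns = pd.get_dummies(data).columns         # "peek" at the full dataset's encoded columns
train_x = pd.get_dummies(train).reindex(columns=expected_columns, fill_value=0)
holdout_x = pd.get_dummies(holdout).reindex(columns=expected_columns, fill_value=0)

assert list(train_x.columns) == list(holdout_x.columns)
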
    def __init__(self,
                 base_models: List[ModelInfo],
                 scores: List[ScoreActualPredictedBase],
                 stacking_model: ModelWrapperBase,
                 stacking_transformations: List[TransformerBase] = None,
                 include_original_dataset: bool = False,
                 converter: ContinuousToClassConverterBase = None,
                 train_callback: Callable[
                     [pd.DataFrame, np.ndarray, Union[HyperParamsBase,
                                                      None]], None] = None,
                 predict_callback: Callable[[pd.DataFrame], None] = None):
        """
        :param base_models: a list of ModelInfo objects, each describing a base model (its model wrapper,
            model-specific transformations, hyper-params, and description)
        :param stacking_transformations: The transformations that are specific to the 'stacker'.
            (The transformations that are specific to the base models will be included in the corresponding
            ModelInfo objects. Transformations applied to ALL models (base and stacker) could be passed in
            via (for example) a ModelTrainer.)
        :param include_original_dataset: When training the 'stacker' (using the predictions of the base
            models as features), this parameter indicates whether or not the original dataset should also
            be included as features, along with the base-model predictions.
        :param scores: since we are cross-validating, we can get a score from each base-model
        :param converter: A Converter object specifying how the predictions (e.g. DataFrame of probabilities
            for a classification problem) should be converted to classes. If the base_models `predict()`
            returns a DataFrame, then a `converter` will need to be supplied to describe how to extract
            the predictions for the positive class,
                e.g. `converter=ExtractPredictionsColumnConverter(column=positive_class)`
        """
        super().__init__()
        # ensure unique model descriptions
        assert len(set([x.description
                        for x in base_models])) == len(base_models)

        self._base_models = base_models
        self._scores = scores
        self._stacking_model = stacking_model
        self._stacking_model_pipeline = None if stacking_transformations is None \
            else TransformerPipeline(transformations=stacking_transformations)
        self._include_original_dataset = include_original_dataset
        self._converter = converter
        self._resampler_results = list()
        self._base_model_pipelines = list()
        self._train_callback = train_callback
        self._predict_callback = predict_callback
        self._train_meta_correlations = None
        self._stackerobject_persistence_manager = None
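
# A standalone sketch of the stacking idea the docstring above describes: cross-validated predictions from
# each base model become the features ('train_meta') for the stacking model, optionally joined with the
# original dataset. This uses scikit-learn directly (not this library's ModelStacker); the base/stacking
# models and dataset are illustrative assumptions.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
base_models = {'random_forest': RandomForestClassifier(n_estimators=100, random_state=42),
               'logistic_regression': LogisticRegression(max_iter=5000)}

# one column of positive-class probabilities per base model (out-of-fold, to avoid leakage)
train_meta = pd.DataFrame({name: cross_val_predict(model, X, y, cv=5, method='predict_proba')[:, 1]
                           for name, model in base_models.items()}, index=X.index)

include_original_dataset = False
if include_original_dataset:            # analogous to `include_original_dataset=True`
    train_meta = train_meta.join(X)

stacker = LogisticRegression(max_iter=5000).fit(train_meta, y)

# at predict time, each base model must also be re-fit on ALL the training data (as in `_train` below)
# so the same meta-features can be rebuilt for new observations
for model in base_models.values():
    model.fit(X, y)
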
class ModelTrainer:
    """
    ModelTrainer encapsulates the (mundane and repetitive) logic of the general process of training a model,
        including:

        - splitting the data into training and holdout sets
        - data transformations & pre-processing
        - training a model
        - predicting on a holdout data-set, or on future data (applying the same transformations)
        - evaluate the performance of the model on a holdout set
    """
    def __init__(self,
                 model: ModelWrapperBase,
                 model_transformations: Union[List[TransformerBase],
                                              None] = None,
                 splitter: DataSplitterBase = None,
                 evaluator: EvaluatorBase = None,
                 scores: List[ScoreBase] = None,
                 persistence_manager: PersistenceManagerBase = None,
                 train_callback: Callable[
                     [pd.DataFrame, np.ndarray, Union[HyperParamsBase,
                                                      None]], None] = None):
        """

        :param model: a class representing the model to train_predict_eval
        :param model_transformations: a list of transformations to apply before training (and predicting)
        :param splitter: a class encapsulating the logic of splitting the data into training and holdout sets;
            if None, then no split occurs, and the model is trained on all the data (and so no holdout
            evaluator or scores are available).
        :param evaluator: a class encapsulating the logic of evaluating a holdout set
        :param scores: a list of Score objects
        :param persistence_manager: a PersistenceManager defining how the underlying models should be cached,
            optional.
        :param train_callback: a callback that is called before the model is trained, and is passed the
           data_x, data_y, and hyper_params that are passed into the underlying model's `train()` method.
           The primary intent is for unit tests to have the ability to ensure that the data (data_x) is
           being transformed as expected, but it is imaginable that users will also benefit from this
           capability to peek at the data that is being trained on.
        """
        assert isinstance(model, ModelWrapperBase)
        self._model = model
        self._splitter = splitter
        self._training_evaluator = evaluator
        # copy so that we can use 'same' evaluator type in the holdout evaluator
        self._holdout_evaluator = copy.deepcopy(evaluator)
        self._training_scores = scores
        self._holdout_scores = None if scores is None else [
            x.clone() for x in scores
        ]
        self._has_fitted = False
        self._persistence_manager = persistence_manager
        self._train_callback = train_callback

        if model_transformations is not None:
            assert isinstance(model_transformations, list)
            assert all([
                isinstance(x, TransformerBase) for x in model_transformations
            ])

        self._model_transformations = model_transformations
        self._pipeline = None

    def __str__(self):
        val = str(self.model)

        # either show evaluator info or scores
        if self.training_evaluator is not None:
            val += "\n\nTraining Evaluator\n==================\n"
            val += "\n" + str(self.training_evaluator)

            if self.holdout_evaluator is not None:
                val += "\n\nHoldout Evaluator\n=================\n"
                val += "\n" + str(self.holdout_evaluator)

        else:
            if self.training_scores is not None:
                val += "\n\nTraining Scores\n===============\n"
                for score in self.training_scores:
                    val += "\n" + str(score)

                if self.holdout_scores is not None:
                    val += "\n\nHoldout Scores\n==============="
                    for score in self.holdout_scores:
                        val += "\n" + str(score)

        return val

    @property
    def model(self) -> ModelWrapperBase:
        """
        :return: underlying model object
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        return self._model

    def set_persistence_manager(self,
                                persistence_manager: PersistenceManagerBase):
        """
        Sets the persistence manager, defining how the underlying model should be cached
        :param persistence_manager:
        :return:
        """
        self._persistence_manager = persistence_manager

    @staticmethod
    def _build_cache_key(model: ModelWrapperBase,
                         hyper_params: HyperParamsBase) -> str:
        """
        helper function to build the cache key (e.g. file name)
        """
        model_name = model.name
        if hyper_params is None:
            key = model_name
        else:
            # if hyper-params, flatten out the list of param names and values and concatenate/join them together
            hyper_params_long = '_'.join(
                list(
                    sum([(str(x), str(y))
                         for x, y in hyper_params.params_dict.items()],
                        ())))  # noqa
            key = model_name + '_' + hyper_params_long

        return key
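
    # For illustration (hypothetical name/values): a model named 'RandomForest' with hyper-params
    # {'max_depth': 3, 'n_estimators': 500} produces the cache key
    # 'RandomForest_max_depth_3_n_estimators_500'; with no hyper-params the key is just the model name.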

    def train_predict_eval(self,
                           data: pd.DataFrame,
                           target_variable: Union[str, None] = None,
                           hyper_params: HyperParamsBase = None) -> np.ndarray:
        """
        The data is split into a training/holdout set if a Splitter is provided. If not provided, no split
            occurs and the model is trained on all the `data`). Before training, the data is transformed
            by the specified Transformation objects. If a Splitter is provided, the transformations are
            'fit/transformed' on the training and only transformed on the holdout.

        Trains the data on the model, predicts, and evaluates the predictions if an Evaluator or Scores are
            passed in.
            If a Splitter is provide, the predictions that are returned are of the holdout set. Otherwise,
            the predictions form the training set are returned.

        :param data: data to split (if Splitter is provided) and train_predict_eval the model on
        :param target_variable: the name of the target variable/column
        :param hyper_params: a corresponding HyperParams object
        """
        if self._has_fitted:
            raise ModelAlreadyFittedError()

        if self._splitter:
            assert target_variable is not None
            training_indexes, holdout_indexes = self._splitter.split(
                target_values=data[target_variable])
        else:  # we are fitting the entire data-set, no such thing as a holdout data-set/evaluator/scores
            training_indexes, holdout_indexes = range(len(data)), []
            self._holdout_evaluator = None
            self._holdout_scores = None

        # for unsupervised problems, there might not be a target variable;
        # in that case, there will also not be a training_y/holdout_y
        training_y = data.iloc[training_indexes][
            target_variable] if target_variable is not None else None
        training_x = data.iloc[training_indexes]

        holdout_y = data.iloc[holdout_indexes][
            target_variable] if target_variable is not None else None
        holdout_x = data.iloc[holdout_indexes]

        if target_variable is not None:
            training_x = training_x.drop(columns=target_variable)
            holdout_x = holdout_x.drop(columns=target_variable)

        # transform/train on training data
        if self._model_transformations is not None:
            # before we train on the data, we actually want to 'snoop' at what the expected columns
            # will be with ALL the data. The reason is that if we do some sort of dummy encoding, but not all
            # the categories are included in the training set (i.e. maybe only a small number of observations
            # have the categorical value), then we can still ensure that we will be giving the same expected
            # columns/encodings to the predict method with the holdout set.
            expected_columns = TransformerPipeline.\
                get_expected_columns(data=data if target_variable is None else data.drop(columns=target_variable),  # noqa
                                     transformations=self._model_transformations)
            transformer = StatelessTransformer(
                custom_function=lambda x_df: x_df.reindex(
                    columns=expected_columns,  # noqa
                    fill_value=0))
            self._model_transformations = self._model_transformations + [
                transformer
            ]

        self._pipeline = TransformerPipeline(
            transformations=self._model_transformations)
        # before we fit the data, we actually want to 'peek' at what the expected columns will be with
        # ALL the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not all
        # of the categories are included in the training set (i.e. maybe only a small number of
        # observations have the categorical value), then we can still ensure that we will be giving the
        # same expected columns/encodings to the `predict` method with the holdout set.

        # peek at all the data (except for the target variable, of course)
        # noinspection PyTypeChecker
        self._pipeline.peak(
            data_x=data if target_variable is None else data.drop(
                columns=target_variable))
        # fit on only the training data-set (and also transform)
        transformed_training_data = self._pipeline.fit_transform(training_x)

        # set up persistence if applicable
        if self._persistence_manager is not None:  # then build the key
            cache_key = ModelTrainer._build_cache_key(
                model=self._model, hyper_params=hyper_params)
            self._persistence_manager.set_key(key=cache_key)
            self._model.set_persistence_manager(
                persistence_manager=self._persistence_manager)

        if self._train_callback is not None:
            self._train_callback(transformed_training_data, training_y,
                                 hyper_params)

        # train the model with the transformed training data
        self._model.train(data_x=transformed_training_data,
                          data_y=training_y,
                          hyper_params=hyper_params)

        self._has_fitted = True

        training_predictions = self.predict(data_x=training_x)
        holdout_predictions = None
        if self._splitter is not None:
            holdout_predictions = self.predict(data_x=holdout_x)

        # if evaluators, evaluate on both the training and holdout set
        if self._training_evaluator is not None:
            # predict will apply the transformations (which are fitted on the training data)
            self._training_evaluator.evaluate(
                actual_values=training_y,
                predicted_values=training_predictions)
            if self._holdout_evaluator:
                self._holdout_evaluator.evaluate(
                    actual_values=holdout_y,
                    predicted_values=holdout_predictions)

        # if scores, score on both the training and holdout set
        if self._training_scores is not None:
            # predict will apply the transformations (which are fitted on the training data)
            for score in self._training_scores:
                ScoreMediator.calculate(score=score,
                                        data_x=transformed_training_data,
                                        actual_target_variables=training_y,
                                        predicted_values=training_predictions)

            if self._holdout_scores:
                for score in self._holdout_scores:
                    ScoreMediator.calculate(
                        score=score,
                        # TODO: may have to manually apply the transformations to holdout_x
                        data_x=holdout_x,
                        actual_target_variables=holdout_y,
                        predicted_values=holdout_predictions)

        return training_predictions if self._splitter is None else holdout_predictions

    def predict(self, data_x: pd.DataFrame) -> np.ndarray:
        """
        `predict` handles the logic of applying the transformations (same transformations that were applied to
            the training data, as well as predicted data
        :param data_x: unprocessed DataFrame (unprocessed in terms of the model specific transformation
            pipeline, i.e. exactly the same transformations should be applied to this data as was used on the
            training data
        :return: predicted values
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        prepared_prediction_set = self._pipeline.transform(data_x)

        predictions = self._model.predict(data_x=prepared_prediction_set)
        if isinstance(predictions, pd.DataFrame):
            # noinspection PyTypeChecker
            assert all(predictions.index.values == data_x.index.values)

        return predictions

    @property
    def training_evaluator(self) -> Union[EvaluatorBase, None]:
        """
        :return: if an Evaluator was provided via class constructor, returns the object evaluated on the
            training data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        return self._training_evaluator

    @property
    def holdout_evaluator(self) -> Union[EvaluatorBase, None]:
        """
        :return: if an Evaluator *and* a Splitter (thus creating a holdout set before training) were provided
            via class constructor, returns the object evaluated on the holdout data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        return self._holdout_evaluator

    @property
    def training_scores(self) -> Union[List[ScoreBase], None]:
        """
        :return: if a list of Scores was provided via class constructor, returns the list of Scores calculated
            on the training data.
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        return self._training_scores

    @property
    def holdout_scores(self) -> Union[List[ScoreBase], None]:
        """
        :return: if list of Scores *and* a Splitter (thus creating a holdout set before training) were
            provided via class constructor, returns the list of Scores evaluated on the holdout data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        return self._holdout_scores
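
# A standalone sketch of the flow that `ModelTrainer.train_predict_eval` documents -- split into
# training/holdout sets, fit the transformations on the training data only, train, then evaluate on both
# sets -- written against scikit-learn rather than this library's wrappers (the dataset, transformation,
# and model are illustrative assumptions).
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
train_x, holdout_x, train_y, holdout_y = train_test_split(X, y, test_size=0.2, stratify=y,
                                                          random_state=42)

scaler = StandardScaler()
transformed_train_x = scaler.fit_transform(train_x)      # fit/transform on training data only
transformed_holdout_x = scaler.transform(holdout_x)      # transform (don't re-fit) the holdout

model = LogisticRegression(max_iter=5000).fit(transformed_train_x, train_y)

# analogous to `training_scores` vs `holdout_scores`: the holdout score is the honest estimate
training_score = accuracy_score(train_y, model.predict(transformed_train_x))
holdout_score = accuracy_score(holdout_y, model.predict(transformed_holdout_x))
print(training_score, holdout_score)
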
    def _train(self,
               data_x: pd.DataFrame,
               data_y: np.ndarray,
               hyper_params: HyperParamsBase = None) -> object:

        # cache the `train_meta` dataset we've built up
        if self._stackerobject_persistence_manager is not None:
            cache = self._stackerobject_persistence_manager.clone()
            # utilizes the cache's _key_prefix if set (e.g. the index of repeat/fold for a resampler).
            # In the case of a Resampler, for example, this ensures a different `train_meta` is cached for
            # each fold
            cache.set_key('train_meta')
            # NOTE: building the train meta dataset is exactly the same regardless of the type of model
            # stacker or the model stacker's hyper-parameters. As such, the file name of the cache will be the
            # same, which will cause problems for multi-threading, because the same file will be read/written
            # simultaneously by multiple threads. Therefore, we will lock this action.
            self.StackerMetaLock().acquire()
            train_meta, self._resampler_results = cache.\
                get_object(fetch_function=lambda: self.build_train_meta(data_x,
                                                                        data_y,
                                                                        self._base_models,
                                                                        self._scores,
                                                                        self._converter))
            self.StackerMetaLock().release()
        else:
            train_meta, self._resampler_results = self.build_train_meta(
                data_x, data_y, self._base_models, self._scores,
                self._converter)

        if self._include_original_dataset:
            # need to make sure that the columns aren't overlapping
            assert set(train_meta.columns.values).isdisjoint(
                set(data_x.columns.values))
            train_meta = train_meta.join(data_x)

        assert all(train_meta.index.values == data_x.index.values)

        # need to fit each base model on ALL of the training data, because when we predict, we first have to
        # build the corresponding train_meta features that we will feed into the trained stacker.
        for model_info in self._base_models:
            transformed_data_x = data_x
            if model_info.transformations:
                # ensure none of the Transformers have been used.
                assert all(
                    [x.state is None for x in model_info.transformations])
                # We will fit_transform the training data then in `predict()`,
                # we will transform future data using the same transformations per model
                pipeline = TransformerPipeline(
                    transformations=model_info.transformations)
                # fit on only the train data-set (and also transform)
                transformed_data_x = pipeline.fit_transform(
                    data_x=transformed_data_x)
                # we will reuse the pipelines (which were fitted on the training data) when we predict
                self._base_model_pipelines.append(pipeline)
            else:
                self._base_model_pipelines.append(None)

            if self._stackerobject_persistence_manager is not None:
                cache = self._stackerobject_persistence_manager.clone()
                hyper_params_string = '_'.join([
                    '{}_{}'.format(key, value) for key, value in
                    model_info.hyper_params.params_dict.items()
                ])  # noqa
                # utilizes the same _key_prefix (e.g. the index of repeat/fold for a resampler). In the case
                # of a Resampler, for example, this ensures a different `train_meta` is cached for each fold
                cache.set_key(key='{}_{}_{}'.format(
                    'base', model_info.description, hyper_params_string))
                model_info.model.set_persistence_manager(cache)

            # NOTE: training the base stackers is exactly the same regardless of the type of model
            # stacker or the model stacker's hyper-parameters. As such, the file name of the cache will be the
            # same, which will cause problems for multi-threading, because the same file will be read/written
            # simultaneously by multiple threads
            # Therefore, we will lock this action. This has the effect of making base-model training perform
            # as if it were single-threaded, although once the cached models are created, subsequent runs
            # should get the benefits of multi-threading. A better version of this would have a separate
            # lock for each base model, so we could still train them simultaneously.
            self.StackerBaseLock().acquire()
            model_info.model.train(data_x=transformed_data_x,
                                   data_y=data_y,
                                   hyper_params=model_info.hyper_params)
            self.StackerBaseLock().release()

        # get the correlations before any transformations on `train_meta` for the stacking model.
        self._train_meta_correlations = train_meta.corr()

        # do stacker-specific transformations
        transformed_train_meta = train_meta.drop(columns='actual_y')
        if self._stacking_model_pipeline is not None:
            transformed_train_meta = self._stacking_model_pipeline.fit_transform(
                data_x=transformed_train_meta)  # noqa

        if self._train_callback:
            self._train_callback(transformed_train_meta, data_y, hyper_params)

        self._stacking_model.set_persistence_manager(
            self._stackerobject_persistence_manager)
        self._stacking_model.train(data_x=transformed_train_meta,
                                   data_y=data_y,
                                   hyper_params=hyper_params)

        return ModelStackerTrainingObject(
            base_models=self._base_models,
            base_model_pipelines=self._base_model_pipelines,
            stacking_model=self._stacking_model,
            stacking_model_pipeline=self._stacking_model_pipeline)
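
# The NOTE above suggests "a separate lock for each base model", so different base models could still be
# trained in parallel while cache reads/writes for the *same* model stay serialized. A minimal, standalone
# sketch of that idea using `threading.Lock` (the descriptions and the train function are illustrative
# assumptions, not this library's API):
import threading

base_model_descriptions = ['random_forest', 'logistic_regression']          # hypothetical descriptions
base_model_locks = {description: threading.Lock() for description in base_model_descriptions}

def train_base_model(description, train_function):
    # only training of the same base model is serialized; other base models are unaffected
    with base_model_locks[description]:
        return train_function()
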
def resample_repeat(args):
    """
    NOTE: parallelization is per "repeat", not per "fold". This is because decorators can be used (and
        retained/cached) across folds, which would break if we split up and parallelized the logic
    """
    folds = args['folds']
    repeat_index = args['repeat_index']
    data_x = args['data_x']
    data_y = args['data_y']
    transformer_factory = args['transformer_factory']
    train_callback = args['train_callback']
    hyper_params = args['hyper_params']
    model_factory = args['model_factory']
    persistence_manager = args['persistence_manager']
    score_factory = args['score_factory']
    decorators = args['decorators']

    # consistent folds per repeat index, but different folds for different repeats
    np.random.seed(repeat_index)
    # generate random fold #s that correspond to each index of the data
    random_folds = np.random.randint(low=0, high=folds, size=len(data_y))

    result_scores = list()  # list of all the `evaluated` holdout scores

    for fold_index in range(folds):
        holdout_indexes = random_folds == fold_index  # indexes that match the fold belong to holdout
        training_indexes = ~holdout_indexes  # all other indexes belong to the training set

        # odd naming serves as distinction between when we use transformed/non-transformed data
        train_x_not_transformed, holdout_x_not_transformed = data_x[training_indexes], \
                                                             data_x[holdout_indexes]
        train_y, holdout_y = data_y[training_indexes], data_y[holdout_indexes]

        # NOTE: we are fitting the transformations on the k-1 folds (i.e. local training data)
        # for each of the k times we train/predict. This is so we don't have any contamination/
        # leakage into the local holdout/fold we are predicting on (just like we wouldn't fit
        # the transformations on the entire dataset; we fit/transform on the training set and then
        # simply transform the holdout).
        pipeline = TransformerPipeline(
            transformations=transformer_factory.get())
        # before we fit the data, we actually want to 'peek' at what the expected columns will be with
        # ALL the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not all
        # of the categories are included in the training set (i.e. maybe only a small number of
        # observations have the categorical value), then we can still ensure that we will be giving the
        # same expected columns/encodings to the `predict` method with the holdout set.
        # peek at all the data
        pipeline.peak(data_x=data_x)
        # fit on only the train dataset (and also transform)
        train_x_transformed = pipeline.fit_transform(
            data_x=train_x_not_transformed)
        # transform (but don't fit) on holdout
        holdout_x_transformed = pipeline.transform(
            data_x=holdout_x_not_transformed)

        # the callback allows callers to see/verify the data that is being trained, at each fold
        if train_callback is not None:
            train_callback(train_x_transformed, data_y, hyper_params)

        model = model_factory.get_model()  # need to reuse this object type for each fold/repeat

        # set up persistence if applicable
        if persistence_manager is not None:  # then build the key
            # first set the key_prefix; separating the repeat/fold information from the rest of the key
            # lets models (e.g. ModelStacker) utilize the key_prefix, while modifying the key
            persistence_manager.set_key_prefix(
                prefix='repeat{}_fold{}_'.format(str(repeat_index),
                                                 str(fold_index)))
            cache_key = model_build_cache_key(model=model,
                                              hyper_params=hyper_params)
            persistence_manager.set_key(key=cache_key)
            model.set_persistence_manager(
                persistence_manager=persistence_manager)

        model.train(data_x=train_x_transformed,
                    data_y=train_y,
                    hyper_params=hyper_params)
        predicted_values = model.predict(data_x=holdout_x_transformed)

        fold_scores = list()
        for score in score_factory.get():  # cycle through scores and store the results of each fold
            score.calculate(actual_values=holdout_y,
                            predicted_values=predicted_values)
            fold_scores.append(score)
        result_scores.append(fold_scores)

        # execute any functionality that is dynamically attached via decorators
        if decorators:
            for decorator in decorators:
                decorator.decorate(
                    repeat_index=repeat_index,
                    fold_index=fold_index,
                    scores=score_factory.get(),
                    holdout_actual_values=holdout_y,
                    holdout_predicted_values=predicted_values,
                    holdout_indexes=holdout_x_transformed.index.values,
                    model=model,
                    transformer_pipeline=pipeline)
    return result_scores, decorators
    def _resample(self,
                  data_x: pd.DataFrame,
                  data_y: np.ndarray,
                  hyper_params: HyperParamsBase = None) -> ResamplerResults:

        # transform/fit on training data
        if self._transformer_factory.has_transformations():
            # before we fit the data, we actually want to 'snoop' at what the expected columns will be with
            # ALL the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not all of
            # the categories are included in the training set (i.e. maybe only a small number of observations
            # have the categorical value), then we can still ensure that we will be giving the same expected
            # columns/encodings to the `predict` method with the holdout set.
            expected_columns = TransformerPipeline.get_expected_columns(
                data=data_x,
                transformations=self._transformer_factory.get())  # noqa
            # create a transformer that ensures the expected columns exist (e.g. dummy columns), and add it
            # as the last transformation
            temp = StatelessParallelizationHelper(
                expected_columns=expected_columns)
            transformer = StatelessTransformer(custom_function=temp.helper)
            self._transformer_factory.append_transformations([transformer])

        # use a map function rather than a for loop so we can easily switch between parallel and
        # non-parallel execution
        resample_args = [
            dict(
                folds=self._folds,
                repeat_index=x,
                data_x=data_x,
                data_y=data_y,
                transformer_factory=self._transformer_factory,
                train_callback=self._train_callback,
                hyper_params=hyper_params,
                model_factory=self._model_factory,
                persistence_manager=self._model_persistence_manager,
                # need to reuse this object type for each fold/repeat
                score_factory=self._score_factory,
                decorators=self._decorators) for x in range(self._repeats)
        ]

        # if self._parallelization_cores == 0 or self._parallelization_cores == 1:
        #     results = list(map(resample_repeat, resample_args))
        # else:
        #     cores = cpu_count() if self._parallelization_cores == -1 else self._parallelization_cores
        #     # with ThreadPool(cores) as pool:
        #     # https://codewithoutrules.com/2018/09/04/python-multiprocessing/
        #     with get_context("spawn").Pool(cores) as pool:
        #         results = list(pool.map(resample_repeat, resample_args))
        results = list(map(resample_repeat, resample_args))

        result_scores = [x[0] for x in results]
        # flatten out so there are folds*repeats number of list items
        flattened_scores = [
            result_scores[x][y] for x in range(self._repeats)
            for y in range(self._folds)
        ]

        # if we have decorators and are doing parallelization, then we will have one decorator list per
        # repeat and we have to flatten out the list; otherwise, there will only be one set (list) of
        # decorators and it will already be set to self._decorators
        if self._decorators is not None and self._parallelization_cores != 0:
            decorators = [x[1] for x in results]
            flattened_decorators = [
                decorators[x][y] if decorators[x] else None
                for x in range(self._repeats)
                for y in range(len(self._decorators))
            ]
            self._decorators = flattened_decorators

        # flattened_scores is a list of lists of holdout scores:
        # each outer element represents one resampling fold/repeat,
        # and each element of the inner list represents a specific score.
        return ResamplerResults(scores=flattened_scores,
                                decorators=self._decorators,
                                hyper_params=hyper_params)