Exemple #1
0
    def __init__(self,
                 models_path=None,
                 time_left_for_this_task=3600,
                 n_ensemble=10,
                 n_best_model=5,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 keep_models=True,
                 model_dir=None,
                 precision=32,
                 delete_models=True):
        """
        Initialise the AutoML entry class: everything that configures a run
        (which algorithms to use, how many models to keep, where to store
        them, ...) is set up here.

        :param models_path: folder handed to `Backend` as its output folder;
            falls back to `OUTPUT_FOLDER` when None.
        :param time_left_for_this_task: time budget for training (presumably
            seconds -- TODO confirm); None falls back to 3600.
        :param n_ensemble: how many models are selected for the ensemble.
        :param n_best_model: how many models are kept during training.
        :param include_estimators: algorithms to include.
        :param exclude_estimators: algorithms to exclude.
        :param include_preprocessors: preprocessing steps to include.
        :param exclude_preprocessors: preprocessing steps to exclude.
        :param keep_models: whether or not to keep trained models.
        :param model_dir: folder to keep models in.
        :param precision: precision of the data, used to save memory.
        :param delete_models: stored as `self.delete_models`.
        """
        super(AutoML, self).__init__()
        # This class is the entry point, so the Backend is created here.
        # Per the original author's note Backend is a singleton, so the
        # output folder chosen here is the one used everywhere else.
        models_path = OUTPUT_FOLDER if models_path is None else models_path
        # The framework's only output is model files, so the caller may pick
        # the folder they are written to.
        self.models_path = models_path

        self.backend = Backend(output_folder=models_path)

        # Store the budget under the same name as the constructor parameter.
        self.time_left_for_this_task = (3600 if time_left_for_this_task is None
                                        else time_left_for_this_task)
        # Backward-compatible alias: the value used to be stored only under
        # this misspelled attribute name.
        self.time_left_for_this_taks = self.time_left_for_this_task

        self.n_ensemble = n_ensemble
        self.n_best_model = n_best_model
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.keep_models = keep_models
        self.model_dir = model_dir
        self.precision = precision

        self.estimator = None

        # Models are combined with an ensemble, so in the end there is a
        # single best model.
        self.best_model = None

        # Type of the problem being solved; unknown until set elsewhere.
        self.type_of_problem = None

        self.delete_models = delete_models

        # Start with an empty list of trained models so the attribute always
        # exists, even before any training has happened.
        self.models_list = []
Exemple #2
0
        # Then should combined the prediction and original data
        x_new = np.concatenate([x, pred_array], axis=1)

        return x_new


if __name__ == '__main__':
    # Manual smoke test: fit a stacking ModelEnsemble on sample data.
    from sklearn.datasets import load_iris

    from auto_ml.test.get_test_data import get_training_data
    from auto_ml.utils.backend_obj import Backend

    # NOTE(review): the iris data loaded on the first line is immediately
    # replaced by the result of get_training_data(), so it is never used.
    x, y = load_iris(return_X_y=True)
    x, y = get_training_data()

    # NOTE(review): hard-coded, machine-specific output folder.
    backend = Backend(
        output_folder=r"C:\Users\guangqiiang.lu\Downloads\test_automl")

    print("Get backend output folder:", backend.output_folder)
    model_ensemble = ModelEnsemble(
        backend=backend,
        ensemble_alg='stacking',
        voting_logic='soft',
    )

    model_ensemble.fit(x, y)
    # Debugging helpers kept by the author for inspecting the fitted models:
    # print([x[1].__class__ for x in model_ensemble.model_list])
    # print(model_ensemble.model_list)
    # for models in model_ensemble.model_list:
    #     model = models[1]
    #     print(model)
    #     print(getattr(model, "_estimator_type", None))
Exemple #3
0
 def __init__(self):
     """Create the object with a default-constructed Backend and no model."""
     # Backend is built with its default arguments.
     self.backend = Backend()
     # No model is attached yet.
     self.model = None
Exemple #4
0
# -*- coding:utf-8 -*-
"""
Helpers that load and save the test data used across the whole project.

@author: Guangqiang.lu
"""
import os
import pandas as pd
from auto_ml.utils.paths import get_root_path
from auto_ml.utils.backend_obj import Backend

# Module-level setup, executed once when this module is imported.
backend = Backend()
root_path = get_root_path()
# Folder the test CSV files are read from: <root_path>/test.
cur_path = os.path.join(root_path, 'test')


def get_training_data(return_df=False, file_name='train.csv', label_col='Survived'):
    """
    Load a test CSV file from `cur_path` and split it into features and label.

    :param return_df: when true, return the raw DataFrame without splitting.
    :param file_name: name of the CSV file inside `cur_path`.
    :param label_col: name of the label column; defaults to 'Survived' so
        existing callers keep their behaviour, but other files with a
        different label column can now be loaded too.
    :return: the DataFrame when `return_df` is true, otherwise a tuple
        `(x, y)` of arrays where `x` holds every column except `label_col`
        and `y` holds the `label_col` values.
    """
    df = pd.read_csv(os.path.join(cur_path, file_name))

    if return_df:
        return df

    y = df[label_col].values
    x = df.drop([label_col], axis=1).values
    # Return the features as a fresh copy in C (row-major) memory order.
    x = x.copy(order='C')

    return x, y


def save_processing_data(data, dataset_name='process_tmp'):
    """
    Save `data` through the module-level `backend` under `dataset_name`.

    :param data: object handed to `backend.save_dataset`.
    :param dataset_name: name to save the data under.
    """
    # NOTE(review): `model_file_path=False` presumably keeps the dataset out
    # of the models folder -- confirm against Backend.save_dataset.
    backend.save_dataset(data, dataset_name, model_file_path=False)
        # self.dataset_name = hash_dataset_name(x) if dataset_name is None else dataset_name
        # we should ensure data could be trained like training data should be 2D
        x, y = check_data_and_label(x, y)

        return x, y


if __name__ == '__main__':
    # Manual smoke test: build a ClassificationPipeline on top of a Backend.
    from sklearn.datasets import load_iris

    x, y = load_iris(return_X_y=True)

    from auto_ml.utils.backend_obj import Backend

    # NOTE(review): hard-coded, machine-specific output folder.
    models_path = r"C:\Users\guangqiiang.lu\Downloads\test_automl"
    backend = Backend(output_folder=models_path)
    print("Backend output folder ", backend.output_folder)

    classifier_pipeline = ClassificationPipeline(backend=backend)
    # print(classifier_pipeline)

    # Experiments kept by the author for the training pipeline:
    # grid_models = classifier_pipeline.build_training_pipeline()
    # print(grid_models.list_estimators())
    # grid_models.fit(x, y)
    # print(grid_models.load_best_model_list())

    # Experiments kept by the author for the preprocessing pipeline:
    # process_pipeline = classifier_pipeline.build_preprocessing_pipeline()
    # print(process_pipeline)
    # classifier_pipeline._fit_processing_pipeline(x, y)

    from auto_ml.test.get_test_data import get_training_data
Exemple #6
0
class AutoML(BaseEstimator):
    def __init__(self,
                 models_path=None,
                 time_left_for_this_task=3600,
                 n_ensemble=10,
                 n_best_model=5,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 keep_models=True,
                 model_dir=None,
                 precision=32,
                 delete_models=True):
        """
        Initialise the AutoML entry class: everything that configures a run
        (which algorithms to use, how many models to keep, where to store
        them, ...) is set up here.

        :param models_path: folder handed to `Backend` as its output folder;
            falls back to `OUTPUT_FOLDER` when None.
        :param time_left_for_this_task: time budget for training (presumably
            seconds -- TODO confirm); None falls back to 3600.
        :param n_ensemble: how many models are selected for the ensemble.
        :param n_best_model: how many models are kept during training.
        :param include_estimators: algorithms to include.
        :param exclude_estimators: algorithms to exclude.
        :param include_preprocessors: preprocessing steps to include.
        :param exclude_preprocessors: preprocessing steps to exclude.
        :param keep_models: whether or not to keep trained models.
        :param model_dir: folder to keep models in.
        :param precision: precision of the data, used to save memory.
        :param delete_models: when true, `fit` cleans the backend folder
            before training.
        """
        super(AutoML, self).__init__()
        # This class is the entry point, so the Backend is created here.
        # Per the original author's note Backend is a singleton, so the
        # output folder chosen here is the one used everywhere else.
        models_path = OUTPUT_FOLDER if models_path is None else models_path
        # The framework's only output is model files, so the caller may pick
        # the folder they are written to.
        self.models_path = models_path

        self.backend = Backend(output_folder=models_path)

        # Store the budget under the same name as the constructor parameter.
        self.time_left_for_this_task = (3600 if time_left_for_this_task is None
                                        else time_left_for_this_task)
        # Backward-compatible alias: the value used to be stored only under
        # this misspelled attribute name.
        self.time_left_for_this_taks = self.time_left_for_this_task

        self.n_ensemble = n_ensemble
        self.n_best_model = n_best_model
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.keep_models = keep_models
        self.model_dir = model_dir
        self.precision = precision

        self.estimator = None

        # Models are combined with an ensemble, so in the end there is a
        # single best model.
        self.best_model = None

        # Type of the problem being solved; unknown until set elsewhere.
        self.type_of_problem = None

        self.delete_models = delete_models

        # Filled by `fit`; starting empty lets `_check_fitted` report an
        # unfitted instance instead of failing on a missing attribute.
        self.models_list = []

    def fit(self, x=None, y=None, file_load=None,
            xval=None, yval=None, val_split=None, n_jobs=None, use_neural_network=True, *args, **kwargs):
        """
        Train the underlying estimator and load the trained models back,
        ordered by their training score.

        :param x: training data; required together with `y` unless
            `file_load` is given.
        :param y: training label.
        :param file_load: container exposing `.data` and `.label`; when given
            it takes precedence over `x` and `y`.
        :param xval: optional validation data supplied by the caller.
        :param yval: optional validation label supplied by the caller.
        :param val_split: when given, the training data is split with this
            value as `test_size` and the held-out part becomes the validation
            set (replacing `xval`/`yval`).
        :param n_jobs: forwarded to `self.estimator.fit`.
        :param use_neural_network: forwarded to `self.estimator.fit`.
        :return: self
        :raises ValueError: when neither `file_load` nor both `x` and `y`
            are provided.
        """
        start_time = time.time()

        # Start every run from a clean models folder when requested.
        if self.backend and self.delete_models:
            self.backend.clean_folder()

        x, y = self._get_data_and_label(file_load, x, y)

        if val_split is not None:
            # An explicit split was requested: carve the validation set out
            # of the training data.
            x, xval, y, yval = train_test_split(x, y, test_size=val_split)
        elif (xval is None or yval is None) and len(x) > VALIDATION_THRESHOLD:
            # No usable validation set was supplied and there is enough data:
            # hold out 20% automatically. A validation set supplied by the
            # caller is kept as is instead of being overwritten.
            x, xval, y, yval = train_test_split(x, y, test_size=.2)

        # NOTE(review): `self.estimator` is None after `__init__`; presumably
        # a subclass assigns the real pipeline before `fit` is called.
        self.estimator.fit(x,
                           y,
                           n_jobs=n_jobs,
                           use_neural_network=use_neural_network)

        # Load every trained model from disk, best training score first, so
        # they can be used for prediction and scoring.
        # TODO: This should be changed for regression! As regression may have
        # negative values, so here should change on different type.
        # After change regression metrics to r2, this is workable
        self.models_list = self._load_trained_models_ordered_by_score(
            higher_best=True)

        self._validation_models(xval, yval)

        logger.info("Whole training pipeline takes: {} seconds!".format(
            round(time.time() - start_time, 2)))

        return self

    def predict(self, x=None, file_load=None, **kwargs):
        """
        Return the estimator's prediction for the given data.

        :param x: data to predict on; used as is (a DataFrame is converted
            to its values).
        :param file_load: container exposing `.data`, used when `x` is None.
        :param kwargs: accepted but not used.
        :return: whatever `self.estimator.predict` returns.
        :raises ValueError: when both `x` and `file_load` are None.
        """
        features = self._get_training_data(x, file_load)

        logger.info("Start to get prediction based on best trained model.")
        result = self.estimator.predict(features)
        logger.info("Prediction step finishes.")

        return result

    def predict_proba(self, x=None, file_load=None, **kwargs):
        """
        Should support with probability if supported.
        :param x:
        :param kwargs:
        :return:
        """
        # should check the estimator should have function: `predict_proba`!
        if not hasattr(self.estimator, 'predict_proba'):
            raise NotImplementedError(
                "Best fitted model:{} doesn't support `predict_proba`".format(
                    self.best_model))

        x = self._get_training_data(x, file_load)

        logger.info("Start to get probability based on best trained model.")
        prob = self.estimator.predict_proba(x)
        logger.info("Prediction step finishes.")

        return prob

    def score(self, x=None, y=None, file_load=None, **kwargs):
        """
        Score the fitted estimator on the given data and label.

        :param x: data to score on.
        :param y: label to score against.
        :param file_load: container exposing `.data` and `.label`; when given
            it replaces `x` and `y`.
        :param kwargs: accepted but not used.
        :return: whatever `self.estimator.score` returns.
        :raises ValueError: when `x`, `y` and `file_load` are all None.
        """

        def as_array(obj):
            # pandas objects become arrays; 1-D results are turned into a
            # single column. Anything else passes through untouched.
            if not isinstance(obj, (pd.DataFrame, pd.Series)):
                return obj
            values = obj.values
            if len(values.shape) == 1:
                values = values.reshape(-1, 1)
            return values

        self._check_fitted()

        logger.info("Start to get prediction based on best trained model!")

        self._check_param(file_load, x, y)

        if file_load is not None:
            x, y = self.__get_file_load_data_label(file_load,
                                                   use_for_pred=False)

        x = as_array(x)
        y = as_array(y)

        # The estimator implements the metric for its own type of problem.
        result = self.estimator.score(x, y)

        logger.info("Get score: {} based on best model!".format(result))

        return result

    def _load_trained_models_ordered_by_score(self, higher_best=True):
        """
        To load whole trained model from disk and sorted them based on `higher_best`.
        :param higher_best:
        :return:
            models_list: a list of trained models sorted with score. [("lr-0.9.pkl", lr-0.9.pkl), ...]
        """
        models_list = self.backend.load_models_combined_with_model_name()

        # let's sort these models based on score, this is a tuple:(model_name, model_instance)
        if higher_best:
            reverse = True
        else:
            reverse = False

        # sort models by `training` score with diff type of problem
        models_list = sorted(models_list,
                             key=lambda model: float(model[0].split("_")[
                                 1].replace(".pkl", '').replace(".h5", '')),
                             reverse=reverse)

        return models_list

    def get_sorted_models_scores(self, x, y, **kwargs):
        """
        Placeholder meant to return a test score for each trained model, so
        that the result of every model can be compared after training.

        The original author's note says this should be implemented by the
        pipeline.

        NOTE(review): this class body defines `get_sorted_models_scores` a
        second time further down; that later definition replaces this one, so
        this placeholder is not reachable through the class.

        :param x: data to score the models on (unused here).
        :param y: label to score the models against (unused here).
        :param kwargs: accepted but not used.
        :return:
            a list of tuple: [(model_name, model_score), ...] (intended
            contract; this placeholder never returns).
        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    def _check_fitted(self):
        if not self.models_list:
            logger.error(
                "When try to get prediction with `automl`, Please fit the model first"
            )
            raise NotFittedError(
                "When try to get prediction with `automl`, Please fit the model first"
            )

        return True

    @staticmethod
    def _check_param(*args):
        all_None = all([arg is None for arg in args])

        if all_None:
            raise ValueError("Please provide at least one parameter!")

    @staticmethod
    def __get_file_load_data_label(file_load, use_for_pred=True):
        """
        Get data and label from original obj.
        """
        data, label = file_load.data, file_load.label

        return data, label

    @classmethod
    def reconstruct(cls, models_path=None, *args, **kwargs):
        """
        Used for Restful API to create
        """
        return cls(models_path, *args, **kwargs)

    @staticmethod
    def _get_data_and_label(file_load, x, y):
        """
        Ensure could get data and label.
        """
        if file_load is None and x is None and y is None:
            raise ValueError(
                "When do real training, please provide at least a " +
                "`file_load` or train data with `xtrain, ytrain`!")

        if file_load is not None:
            # with container, then just query the attribute then we could keep other as same.
            x, y = file_load.data, file_load.label
        else:
            if x is None or y is None:
                raise ValueError(
                    "When to do training, please provide both `x` and `y`!")

        # Let's try to make DF into array, so later will be easier!
        if isinstance(x, pd.DataFrame):
            x = x.values
        if isinstance(y, pd.DataFrame):
            y = y.values

        return x, y

    def _validation_models(self, xval, yval):
        """
        Score the trained models on the validation set and print the result.

        Nothing is scored unless both `xval` and `yval` are provided; in that
        case only a warning is logged.

        :param xval: validation data, or None.
        :param yval: validation label, or None.
        """
        if xval is None or yval is None:
            logger.warning("No need to validation!")
            return

        score_dict = self.estimator.get_sorted_models_scores(xval, yval)
        print(score_dict)

        # Formatting also logs each row as a side effect.
        print(self.__format_trained_model_scores(score_dict))

    @staticmethod
    def __format_trained_model_scores(score_dict, n_space=35):
        """
        Render the model scores as a "|"-separated table and log every row.

        The train score is taken from the model name: the second
        "_"-separated piece, cut at its last "." (e.g. "lr_0.9.pkl" -> "0.9").
        Names that cannot be parsed are logged as a warning and skipped.

        :param score_dict: mapping of model name to validation score.
        :param n_space: width of every column.
        :return: the table as one string, rows joined with newlines.
        """
        row_template = '{{0:{0}}}|{{1:{0}}}|{{2:{0}}}'.format(n_space)

        header = row_template.format("Model name", "Train score",
                                     "Validation score")
        logger.info(header)

        rows = [header]
        for model_name, test_score in score_dict.items():
            try:
                name_part = model_name.split('_')[1]
                train_score = name_part[:name_part.rindex(".")]
                # The score must be a str, otherwise the column formatting
                # gives an unwanted result.
                row = row_template.format(model_name, train_score,
                                          str(test_score))
            except Exception:
                logger.warning(
                    "Get invalidate model name: {}".format(model_name))
                continue

            rows.append(row)
            logger.info(row)

        return '\n'.join(rows)

    def get_sorted_models_scores(self,
                                 xtest=None,
                                 ytest=None,
                                 file_load=None,
                                 reverse=True,
                                 **kwargs):
        """
        To get some best trained model's score for `test` data with ordered.

        So that we could get the list of the best scores for later front end show case.
        :param x:
        :param y:
        :param kwargs:
        :return:
        """
        self._check_param(file_load, xtest, ytest)

        if file_load is not None:
            xtest, ytest = file_load.data, file_load.label

        score_dict = self.estimator.get_sorted_models_scores(xtest,
                                                             ytest,
                                                             reverse=reverse)

        return score_dict

    def _get_training_data(self, x, file_load):
        """Get training data based on `file_load` or `x`,
        if x is provided, then just return x, otherwise should get data 
        from file_load.

        Either of thems should be provided.

        Args:
            x ([type]): [description]
            file_load ([type]): [description]

        Returns:
            [type]: [description]
        """
        self._check_param(file_load, x)

        if x is not None:
            if isinstance(x, pd.DataFrame):
                x = x.values
            return x
        elif file_load is not None:
            x, _ = self.__get_file_load_data_label(file_load)
            return x