        # Then combine the predictions with the original data
        x_new = np.concatenate([x, pred_array], axis=1)

        return x_new


if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from auto_ml.test.get_test_data import get_training_data
    from auto_ml.utils.backend_obj import Backend

    x, y = load_iris(return_X_y=True)
    x, y = get_training_data()

    backend = Backend(output_folder=r"C:\Users\guangqiiang.lu\Downloads\test_automl")
    print("Get backend output folder:", backend.output_folder)

    model_ensemble = ModelEnsemble(backend=backend,
                                   ensemble_alg='stacking',
                                   voting_logic='soft')

    model_ensemble.fit(x, y)

    # print([x[1].__class__ for x in model_ensemble.model_list])
    # print(model_ensemble.model_list)
    # for models in model_ensemble.model_list:
    #     model = models[1]
    #     print(model)
    #     print(getattr(model, "_estimator_type", None))
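    # A minimal sketch of the stacking feature augmentation performed in this
    # module: base-model predictions are appended column-wise to the original
    # features before the meta-learner sees them. Shapes here are illustrative only.
    import numpy as np

    demo_x = np.random.rand(10, 4)                      # original features
    demo_pred = np.random.randint(0, 2, size=(10, 3))   # predictions from 3 base models
    demo_new = np.concatenate([demo_x, demo_pred], axis=1)
    assert demo_new.shape == (10, 7)                    # 4 original + 3 prediction columns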
    def __init__(self):
        self.backend = Backend()
        self.model = None
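# Note: the AutoML constructor below observes that the Backend class supports
# the singleton pattern, so constructing `Backend()` with no arguments here is
# expected to reuse the shared instance and its configured output folder. A
# sketch of that assumption (folder path is illustrative):
#
#     b1 = Backend(output_folder="/tmp/models")
#     b2 = Backend()   # same underlying instance under the singleton design
#     assert b1.output_folder == b2.output_folder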
# -*- coding:utf-8 -*-
"""
This is used to make test data for the whole project.

@author: Guangqiang.lu
"""
import os

import pandas as pd

from auto_ml.utils.paths import get_root_path
from auto_ml.utils.backend_obj import Backend

backend = Backend()

root_path = get_root_path()
cur_path = os.path.join(root_path, 'test')


def get_training_data(return_df=False, file_name='train.csv'):
    df = pd.read_csv(os.path.join(cur_path, file_name))

    if return_df:
        return df

    x = df.drop(['Survived'], axis=1).values
    y = df['Survived'].values

    # ensure a C-contiguous array for downstream estimators
    x = x.copy(order='C')

    return x, y


def save_processing_data(data, dataset_name='process_tmp'):
    backend.save_dataset(data, dataset_name, model_file_path=False)
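# Usage sketch for the helpers above (assumes `train.csv` with a `Survived`
# label column exists under the test folder, as read by `get_training_data`):
#
#     x, y = get_training_data()               # arrays ready for model fitting
#     df = get_training_data(return_df=True)   # full DataFrame, label included
#     save_processing_data(df, 'process_tmp')  # persist via the Backend singleton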
        # self.dataset_name = hash_dataset_name(x) if dataset_name is None else dataset_name

        # we should ensure the data can be trained on, e.g. training data should be 2D
        x, y = check_data_and_label(x, y)

        return x, y


if __name__ == '__main__':
    from sklearn.datasets import load_iris

    x, y = load_iris(return_X_y=True)

    from auto_ml.utils.backend_obj import Backend

    models_path = r"C:\Users\guangqiiang.lu\Downloads\test_automl"
    backend = Backend(output_folder=models_path)
    print("Backend output folder ", backend.output_folder)

    classifier_pipeline = ClassificationPipeline(backend=backend)

    # print(classifier_pipeline)
    # grid_models = classifier_pipeline.build_training_pipeline()
    # print(grid_models.list_estimators())
    # grid_models.fit(x, y)
    # print(grid_models.load_best_model_list())

    # process_pipeline = classifier_pipeline.build_preprocessing_pipeline()
    # print(process_pipeline)

    # classifier_pipeline._fit_processing_pipeline(x, y)

    from auto_ml.test.get_test_data import get_training_data
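    # A hedged sketch mirroring the commented experiments above; whether
    # `ClassificationPipeline` exposes `fit` directly is an assumption here:
    # x, y = get_training_data()
    # classifier_pipeline.fit(x, y)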
class AutoML(BaseEstimator):
    def __init__(self,
                 models_path=None,
                 time_left_for_this_task=3600,
                 n_ensemble=10,
                 n_best_model=5,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 keep_models=True,
                 model_dir=None,
                 precision=32,
                 delete_models=True):
        """
        Initialize the AutoML class. Everything should be instantiated in this
        class: which algorithms to use, how many models to select, etc.

        :param models_path: folder to store model files; if None, a default output folder is used.
        :param time_left_for_this_task: time budget in seconds for training.
        :param n_ensemble: how many models to select for the ensemble.
        :param n_best_model: how many models to keep during training.
        :param include_estimators: which algorithms to include.
        :param exclude_estimators: which algorithms to exclude.
        :param include_preprocessors: which preprocessing steps to include.
        :param exclude_preprocessors: which preprocessing steps to exclude.
        :param keep_models: whether or not to keep trained models.
        :param model_dir: folder to keep models in; if None, the backend creates one.
        :param precision: precision of the data, to save memory.
        :param delete_models: whether to clean the models folder before each run.
        """
        super(AutoML, self).__init__()

        # This class is the entry point, and the Backend class supports the
        # singleton pattern, so the `output_folder` set here will take effect.
        models_path = OUTPUT_FOLDER if models_path is None else models_path

        # The framework's only persisted output is the model files, so letting the
        # caller choose the storage folder through `models_path` is convenient!
        self.models_path = models_path
        self.backend = Backend(output_folder=models_path)

        self.time_left_for_this_task = 3600 if time_left_for_this_task is None else time_left_for_this_task
        self.n_ensemble = n_ensemble
        self.n_best_model = n_best_model
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.keep_models = keep_models
        self.model_dir = model_dir
        self.precision = precision

        # `ensemble` combines the models, so in the end there is just one estimator
        self.estimator = None
        self.best_model = None

        # the type of the problem, set during fit
        self.type_of_problem = None
        self.delete_models = delete_models

    def fit(self,
            x=None,
            y=None,
            file_load=None,
            xval=None,
            yval=None,
            val_split=None,
            n_jobs=None,
            use_neural_network=True,
            *args,
            **kwargs):
        """
        The type of the problem is stored as an attribute, so that `score`
        can pick metrics based on the problem type.

        :param x: training data
        :param y: training label
        :param file_load: optional file-based data container
        :param xval: validation data
        :param yval: validation label
        :param val_split: fraction of training data to hold out for validation
        :param n_jobs: number of parallel jobs
        :param use_neural_network: whether to include neural network models
        :return: self
        """
        start_time = time.time()

        # Clean the models folder first, so every run starts from a clean folder!
        if self.backend and self.delete_models:
            self.backend.clean_folder()

        x, y = self._get_data_and_label(file_load, x, y)

        if val_split is not None:
            # validation is based on the current train data, so split it here
            x, xval, y, yval = train_test_split(x, y, test_size=val_split)
        else:
            # Only if the data length is over a threshold do we do validation,
            # even when the user hasn't provided validation data.
            if len(x) > VALIDATION_THRESHOLD:
                val_split = .2
                x, xval, y, yval = train_test_split(x, y, test_size=val_split)

        # after the checking process, we need to create the Pipeline for the whole process.
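        # A worked sketch of the split decision above (VALIDATION_THRESHOLD's
        # actual value is configuration; assume 10000 for illustration):
        #   len(x) = 50000, val_split=None -> auto split: 40000 train / 10000 val
        #   len(x) = 500,   val_split=None -> no split, validation is skipped
        #   val_split = 0.3, any size      -> explicit 70/30 split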
        # Here a Pipeline object does the real training, together with `ensemble`
        self.estimator.fit(x, y, n_jobs=n_jobs, use_neural_network=use_neural_network)

        # Load trained models for prediction and scoring on testing data.
        # After the models are fitted, the next step is to load all of them
        # from disk and sort them by score, then get scores for `test data`
        # and `test label`. We get them from the parent function, so the same
        # logic also works for `regression`.
        # TODO: This should be changed for regression! Regression may have
        # negative values, so this should vary by problem type.
        # After changing the regression metric to r2, this works.
        self.models_list = self._load_trained_models_ordered_by_score(higher_best=True)

        self._validation_models(xval, yval)

        logger.info("Whole training pipeline takes: {} seconds!".format(round(time.time() - start_time, 2)))

        return self

    def predict(self, x=None, file_load=None, **kwargs):
        """
        Most of the prediction logic happens here, as `score` is based on the
        prediction result.

        :param x: data to predict on
        :param file_load: optional file-based data container
        :return: prediction array
        """
        x = self._get_training_data(x, file_load)

        logger.info("Start to get prediction based on best trained model.")
        prediction = self.estimator.predict(x)
        logger.info("Prediction step finishes.")

        return prediction

    def predict_proba(self, x=None, file_load=None, **kwargs):
        """
        Return class probabilities if the best model supports them.

        :param x: data to predict on
        :param file_load: optional file-based data container
        :return: probability array
        """
        # the estimator must provide `predict_proba`!
        if not hasattr(self.estimator, 'predict_proba'):
            raise NotImplementedError("Best fitted model: {} doesn't support `predict_proba`".format(self.best_model))

        x = self._get_training_data(x, file_load)

        logger.info("Start to get probability based on best trained model.")
        prob = self.estimator.predict_proba(x)
        logger.info("Prediction step finishes.")

        return prob

    def score(self, x=None, y=None, file_load=None, **kwargs):
        self._check_fitted()

        logger.info("Start to get prediction based on best trained model!")

        self._check_param(file_load, x, y)
        if file_load is not None:
            x, y = self.__get_file_load_data_label(file_load, use_for_pred=False)

        # ensure we have array types
        if isinstance(x, (pd.DataFrame, pd.Series)):
            x = x.values
            if len(x.shape) == 1:
                x = x.reshape(-1, 1)

        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.values
            if len(y.shape) == 1:
                y = y.reshape(-1, 1)

        # Use the child's func; the child implements `score` based on the type of problem.
        score = self.estimator.score(x, y)

        logger.info("Get score: {} based on best model!".format(score))

        return score

    def _load_trained_models_ordered_by_score(self, higher_best=True):
        """
        Load all trained models from disk, sorted according to `higher_best`.

        :param higher_best: whether higher scores should come first
        :return: models_list, a list of (model_name, model_instance) tuples
            sorted by score, e.g. [("lr_0.9.pkl", <model>), ...]
        """
        models_list = self.backend.load_models_combined_with_model_name()

        # sort these models based on score; each item is a tuple: (model_name, model_instance)
        reverse = bool(higher_best)

        # sort models by `training` score across the different problem types;
        # the score is parsed out of the file name, e.g. "lr_0.9876.pkl" -> 0.9876
        models_list = sorted(models_list,
                             key=lambda model: float(model[0].split("_")[1].replace(".pkl", '').replace(".h5", '')),
                             reverse=reverse)

        return models_list

    def get_sorted_models_scores(self, x, y, **kwargs):
        """
        Get a score for each trained model, so that we can see, for the time
        taken and for each model, what the testing result looks like.
        But this should be implemented by the pipeline!

        :param x:
        :param y:
        :param kwargs:
        :return: a list of tuples: [(model_name, model_score), ...]
        """
        raise NotImplementedError

    def _check_fitted(self):
        # `models_list` only exists after `fit`, so guard the attribute lookup too
        if not getattr(self, 'models_list', None):
            logger.error("When trying to get a prediction with `automl`, please fit the model first")
            raise NotFittedError("When trying to get a prediction with `automl`, please fit the model first")
        return True

    @staticmethod
    def _check_param(*args):
        all_none = all([arg is None for arg in args])
        if all_none:
            raise ValueError("Please provide at least one parameter!")

    @staticmethod
    def __get_file_load_data_label(file_load, use_for_pred=True):
        """
        Get data and label from the original file-load object.
        """
        data, label = file_load.data, file_load.label
        return data, label

    @classmethod
    def reconstruct(cls, models_path=None, *args, **kwargs):
        """
        Used by the Restful API to create an instance.
        """
        return cls(models_path, *args, **kwargs)

    @staticmethod
    def _get_data_and_label(file_load, x, y):
        """
        Ensure we can get data and label.
        """
        if file_load is None and x is None and y is None:
            raise ValueError("When doing real training, please provide at least a " +
                             "`file_load` or train data with `x, y`!")

        if file_load is not None:
            # with a container, just query the attributes and keep the rest the same.
            x, y = file_load.data, file_load.label
        else:
            if x is None or y is None:
                raise ValueError("When doing training, please provide both `x` and `y`!")

        # Convert DataFrames into arrays, so later steps are easier!
        if isinstance(x, pd.DataFrame):
            x = x.values
        if isinstance(y, pd.DataFrame):
            y = y.values

        return x, y

    def _validation_models(self, xval, yval):
        if xval is not None and yval is not None:
            score_dict = self.estimator.get_sorted_models_scores(xval, yval)

            score_log_str = self.__format_trained_model_scores(score_dict)
            logger.info(score_log_str)
        else:
            logger.warning("No validation data provided, skip validation!")

    @staticmethod
    def __format_trained_model_scores(score_dict, n_space=35):
        # fixed-width columns, e.g. with n_space=35:
        # Model name                         |Train score                        |Validation score
        out_str_format = '{{0:{0}}}|{{1:{0}}}|{{2:{0}}}'.format(n_space)

        score_log_str = out_str_format.format("Model name", "Train score", "Validation score")

        for model_name, test_score in score_dict.items():
            try:
                model_name_split = model_name.split('_')
                train_score = model_name_split[1]
                train_score = train_score[:train_score.rindex(".")]

                # must convert to str, otherwise formatting gives an unwanted result.
                test_score = str(test_score)

                log_str = out_str_format.format(model_name, train_score, test_score)
                score_log_str += '\n' + log_str
            except Exception:
                logger.warning("Got an invalid model name: {}".format(model_name))
                continue

        return score_log_str

    def get_sorted_models_scores(self, xtest=None, ytest=None, file_load=None, reverse=True, **kwargs):
        """
        Get the best trained models' scores on `test` data, ordered, so that we
        have the list of best scores for the later front-end showcase.

        :param xtest: test data
        :param ytest: test label
        :param file_load: optional file-based data container
        :param reverse: whether to sort descending
        :return: score_dict
        """
        self._check_param(file_load, xtest, ytest)

        if file_load is not None:
            xtest, ytest = file_load.data, file_load.label

        score_dict = self.estimator.get_sorted_models_scores(xtest, ytest, reverse=reverse)

        return score_dict

    def _get_training_data(self, x, file_load):
        """Get training data based on `file_load` or `x`: if x is provided,
        just return x; otherwise get the data from file_load. Either of them
        should be provided.

        Args:
            x: training data as an array or DataFrame.
            file_load: file-based data container with `data` and `label` attributes.

        Returns:
            The training data as an array.
        """
        self._check_param(file_load, x)

        if x is not None:
            if isinstance(x, pd.DataFrame):
                x = x.values
            return x
        elif file_load is not None:
            x, _ = self.__get_file_load_data_label(file_load)
            return x
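
if __name__ == '__main__':
    # A hedged usage sketch of the class above. `AutoML` leaves `self.estimator`
    # as None, so in practice a concrete subclass that sets it is required; the
    # commented calls only illustrate the intended API surface, and the folder
    # path is illustrative.
    from auto_ml.test.get_test_data import get_training_data

    x, y = get_training_data()
    automl = AutoML(models_path=r"/tmp/test_automl")
    # automl.fit(x, y, val_split=0.2)   # needs a concrete `estimator`
    # print(automl.score(x, y))
    # prediction = automl.predict(x)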