def load(results_path, model_subpath, lazy_load=True):
    model_path = os.path.join(results_path, model_subpath)
    logger.info(f"Loading model framework from {model_path}")

    json_desc = json.load(open(os.path.join(model_path, "framework.json")))
    mf = ModelFramework(json_desc["params"])
    mf.uid = json_desc.get("uid", mf.uid)
    mf._name = json_desc.get("name", mf._name)
    mf._threshold = json_desc.get("threshold")
    mf.train_time = json_desc.get("train_time", mf.train_time)
    mf.final_loss = json_desc.get("final_loss", mf.final_loss)
    mf.metric_name = json_desc.get("metric_name", mf.metric_name)
    mf._is_stacked = json_desc.get("is_stacked", mf._is_stacked)
    predictions_fname = json_desc.get("predictions_fname")
    if predictions_fname is not None:
        mf._oof_predictions_fname = os.path.join(results_path, predictions_fname)

    mf.learners = []
    for learner_desc, learner_subpath in zip(
        json_desc.get("learners"), json_desc.get("saved")
    ):
        learner_path = os.path.join(results_path, learner_subpath)
        l = AlgorithmFactory.load(learner_desc, learner_path, lazy_load)
        mf.learners += [l]

    mf.preprocessings = []
    for p in json_desc.get("preprocessing"):
        ps = Preprocessing()
        ps.from_json(p, results_path)
        mf.preprocessings += [ps]

    return mf
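# --- usage sketch (not part of the original module) ---
# A minimal, hypothetical call of the load() defined above. The directory and
# subfolder names are placeholders; they assume results_path/model_subpath
# contains the "framework.json" (plus saved learners and preprocessing files)
# written by a previous training run.
def _example_load_framework():
    results_path = "AutoML_results"   # hypothetical results directory
    model_subpath = "1_DecisionTree"  # hypothetical model subfolder
    mf = load(results_path, model_subpath, lazy_load=True)
    # attributes restored from framework.json
    print(mf.uid, mf.metric_name, mf.final_loss, mf.train_time)
    return mf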
def test_run_on_y_only_validation(self):
    d = {"y": ["a", "b", "a", "b"]}
    df = pd.DataFrame(data=d)
    y_train = df.loc[:, "y"]

    d_test = {"y": [np.nan, "a", np.nan, "b"]}
    df_test = pd.DataFrame(data=d_test)
    y_test = df_test.loc[:, "y"]

    preprocessing_params = {
        "target_preprocessing": [
            PreprocessingMissingValues.FILL_NA_MEDIAN,
            PreprocessingCategorical.CONVERT_INTEGER,
        ]
    }

    ps = Preprocessing(preprocessing_params)
    _, y_train = ps.fit_and_transform(None, y_train)
    _, y_test = ps.transform(None, y_test)

    self.assertEqual(4, y_train.shape[0])
    self.assertEqual(2, y_test.shape[0])
    self.assertEqual(0, y_train[0])
    self.assertEqual(1, y_train[1])
    self.assertEqual(0, y_test[0])
    self.assertEqual(1, y_test[1])
def test_empty_column(self):
    # training data
    d = {
        "col1": [np.nan, np.nan, np.nan, np.nan],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    preprocessing_params = {"columns_preprocessing": {"col1": ["remove_column"]}}

    ps = Preprocessing(preprocessing_params)
    X_train1, _ = ps.fit_and_transform(X_train, y_train)
    self.assertTrue("col1" not in X_train1.columns)
    self.assertEqual(3, len(X_train1.columns))

    X_train2, _ = ps.transform(X_train, y_train)
    self.assertTrue("col1" not in X_train2.columns)
    self.assertEqual(3, len(X_train2.columns))
    for col in ["col2", "col3", "col4"]:
        self.assertTrue(col in X_train2.columns)

    params_json = ps.to_json()
    ps2 = Preprocessing()
    ps2.from_json(params_json)

    X_train3, _ = ps2.transform(X_train, y_train)
    self.assertTrue("col1" not in X_train3.columns)
    self.assertEqual(3, len(X_train3.columns))
    for col in ["col2", "col3", "col4"]:
        self.assertTrue(col in X_train3.columns)
def test_run_fill_median_convert_integer(self):
    # training data
    d = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    preprocessing_params = {
        "columns_preprocessing": {
            "col1": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col2": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col3": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col4": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
        }
    }

    ps = Preprocessing(preprocessing_params)
    X_train, y_train = ps.fit_and_transform(X_train, y_train)

    for col in ["col1", "col2", "col3", "col4"]:
        self.assertTrue(col in X_train.columns)
    self.assertEqual(X_train["col1"][2], 1)
    self.assertEqual(X_train["col2"][2], 0)
    self.assertEqual(X_train["col4"][0], 0)
    self.assertEqual(X_train["col4"][1], 0)
    self.assertEqual(X_train["col4"][2], 1)
    self.assertEqual(X_train["col4"][3], 2)

    params_json = ps.to_json()
    self.assertTrue("missing_values" in params_json)
    self.assertTrue("categorical" in params_json)
    self.assertTrue("categorical_y" not in params_json)

    self.assertTrue("fill_params" in params_json["missing_values"][0])
    self.assertEqual(
        "na_fill_median", params_json["missing_values"][0]["fill_method"]
    )
    self.assertTrue("convert_params" in params_json["categorical"][0])
    self.assertEqual(
        "categorical_to_int", params_json["categorical"][0]["convert_method"]
    )
def test_run_fill_median_convert_integer_validation_dataset(self):
    # training data
    d = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [0, 1, 1, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    d_test = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [np.nan, 1, np.nan, 1],
    }
    df_test = pd.DataFrame(data=d_test)
    X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
    y_test = df_test.loc[:, "y"]

    preprocessing_params = {
        "columns_preprocessing": {
            "col1": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col2": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col3": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col4": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
        }
    }

    ps = Preprocessing(preprocessing_params)
    X_train, y_train = ps.fit_and_transform(X_train, y_train)
    X_test, y_test = ps.transform(X_test, y_test)

    for col in ["col1", "col2", "col3", "col4"]:
        self.assertTrue(col in X_train.columns)
        self.assertTrue(col in X_test.columns)

    self.assertEqual(4, X_train.shape[0])
    self.assertEqual(4, y_train.shape[0])
    self.assertEqual(2, X_test.shape[0])
    self.assertEqual(2, y_test.shape[0])
def test_constructor_preprocessing_step(self):
    preprocessing_params = {}
    ps = Preprocessing(preprocessing_params)

    self.assertTrue(len(ps._missing_values) == 0)
    self.assertTrue(len(ps._categorical) == 0)
    self.assertTrue(ps._categorical_y is None)
def test_to_and_from_json_run_fill_median_convert_integer(self):
    # training data
    d = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    preprocessing_params = {
        "columns_preprocessing": {
            "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
            "col2": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col4": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
        },
        "target_preprocessing": [],
    }

    ps = Preprocessing(preprocessing_params)
    _, _ = ps.fit_and_transform(X_train, y_train)

    ps2 = Preprocessing()
    ps2.from_json(ps.to_json())
    del ps

    d_test = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [np.nan, np.nan, 1, 1],
    }
    df_test = pd.DataFrame(data=d_test)
    X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
    y_test = df_test.loc[:, "y"]

    X_test, y_test = ps2.transform(X_test, y_test)

    self.assertEqual(2, y_test.shape[0])
    self.assertEqual(2, np.sum(y_test))
    self.assertEqual(1, X_test["col1"].iloc[0])
    self.assertEqual(0, X_test["col2"].iloc[0])
def test_run_exclude_missing_targets(self):
    # training data
    d = {
        "col1": [1, 1, 1, 3],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, np.nan, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    ps = Preprocessing()
    X_train, y_train = ps.fit_and_transform(X_train, y_train)
    self.assertEqual(3, X_train.shape[0])
    self.assertEqual(3, y_train.shape[0])
def test_exclude_missing_targets_all_good(self):
    # training data
    d = {
        "col1": [1, 1, 1, 3],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    ps = Preprocessing()
    X_train, y_train = ps._exclude_missing_targets(X_train, y_train)
    self.assertEqual(4, X_train.shape[0])
    self.assertEqual(4, y_train.shape[0])
def test_run_all_good(self):
    # training data
    d = {
        "col1": [1, 1, 1, 3],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    preprocessing_params = {
        "columns_preprocessing": {
            "col1": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col2": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col3": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
            "col4": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
        }
    }

    ps = Preprocessing(preprocessing_params)
    X_train, y_train = ps.fit_and_transform(X_train, y_train)

    for col in ["col1", "col2", "col3", "col4"]:
        self.assertTrue(col in X_train.columns)

    params_json = ps.to_json()
    self.assertEqual(len(params_json), 1)  # should store params only
    self.assertTrue("params" in params_json)
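# --- usage sketch (not part of the original test suite) ---
# Illustrates, outside unittest, the fit/serialize/reuse cycle that the tests
# above verify: fit a Preprocessing object, dump it with to_json(), rebuild it
# with from_json(), and apply the restored object to new data. The column
# names and values are made up for the example; the preprocessing steps mirror
# those used in test_to_and_from_json_run_fill_median_convert_integer.
def _example_preprocessing_roundtrip():
    df = pd.DataFrame(
        {"col1": [1, 1, np.nan, 3], "col4": ["a", "a", "b", "c"], "y": [0, 1, 0, 1]}
    )
    params = {
        "columns_preprocessing": {
            "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
            "col4": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ],
        },
        "target_preprocessing": [],
    }
    ps = Preprocessing(params)
    X, y = ps.fit_and_transform(df[["col1", "col4"]], df["y"])

    restored = Preprocessing()
    restored.from_json(ps.to_json())  # rebuild from the serialized description
    X_new, _ = restored.transform(df[["col1", "col4"]], df["y"])
    return X_new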
def load(model_path):
    logger.info(f"Loading model framework from {model_path}")

    json_desc = json.load(open(os.path.join(model_path, "framework.json")))
    mf = ModelFramework(json_desc["params"])
    mf.uid = json_desc.get("uid", mf.uid)
    mf._name = json_desc.get("name", mf._name)
    mf._threshold = json_desc.get("threshold")

    mf.learners = []
    for learner_desc, learner_path in zip(
        json_desc.get("learners"), json_desc.get("saved")
    ):
        l = AlgorithmFactory.load(learner_desc, learner_path)
        mf.learners += [l]

    mf.preprocessings = []
    for p in json_desc.get("preprocessing"):
        ps = Preprocessing()
        ps.from_json(p)
        mf.preprocessings += [ps]

    return mf
def train(self, results_path, model_subpath):
    logger.debug(f"ModelFramework.train {self.learner_params.get('model_type')}")

    start_time = time.time()
    np.random.seed(self.learner_params["seed"])

    # optionally tune hyperparameters with Optuna before fitting the learners
    optuna_tuner = None
    if self._optuna_time_budget is not None and OptunaTuner.is_optimizable(
        self.learner_params.get("model_type", "")
    ):
        optuna_tuner = OptunaTuner(
            results_path,
            ml_task=self._ml_task,
            eval_metric=self.get_metric(),
            time_budget=self._optuna_time_budget,
            init_params=self._optuna_init_params,
            verbose=self._optuna_verbose,
            n_jobs=self.learner_params.get("n_jobs", -1),
            random_state=self._automl_random_state,
        )

    self.validation = ValidationStep(self.validation_params)

    # train one learner per validation split (folds x repeats)
    repeats = self.validation.get_repeats()
    for repeat in range(repeats):
        for k_fold in range(self.validation.get_n_splits()):
            train_data, validation_data = self.validation.get_split(k_fold, repeat)
            logger.debug(
                "Data split, train X:{} y:{}, validation X:{}, y:{}".format(
                    train_data["X"].shape,
                    train_data["y"].shape,
                    validation_data["X"].shape,
                    validation_data["y"].shape,
                )
            )
            if "sample_weight" in train_data:
                logger.debug("Sample weight available during the training.")

            # the preprocessing is done at every validation step
            self.preprocessings += [
                Preprocessing(
                    self.preprocessing_params, self.get_name(), k_fold, repeat
                )
            ]
            X_train, y_train, sample_weight = self.preprocessings[
                -1
            ].fit_and_transform(
                train_data["X"], train_data["y"], train_data.get("sample_weight")
            )
            (
                X_validation,
                y_validation,
                sample_weight_validation,
            ) = self.preprocessings[-1].transform(
                validation_data["X"],
                validation_data["y"],
                validation_data.get("sample_weight"),
            )

            if optuna_tuner is not None:
                optuna_start_time = time.time()
                self.learner_params = optuna_tuner.optimize(
                    self.learner_params.get("model_type", ""),
                    self.params.get("data_type", ""),
                    X_train,
                    y_train,
                    sample_weight,
                    X_validation,
                    y_validation,
                    sample_weight_validation,
                    self.learner_params,
                )
                # exclude optuna optimize time from model training
                start_time += time.time() - optuna_start_time

            self.learner_params["explain_level"] = self._explain_level
            self.learners += [
                AlgorithmFactory.get_algorithm(copy.deepcopy(self.learner_params))
            ]
            learner = self.learners[-1]
            learner.set_learner_name(k_fold, repeat, repeats)

            self.callbacks.add_and_set_learner(learner)
            self.callbacks.on_learner_train_start()

            log_to_file = os.path.join(
                results_path, model_subpath, f"{learner.name}_training.log"
            )

            for i in range(learner.max_iters):
                self.callbacks.on_iteration_start()

                learner.fit(
                    X_train,
                    y_train,
                    sample_weight,
                    X_validation,
                    y_validation,
                    sample_weight_validation,
                    log_to_file,
                    self._max_time_for_learner,
                )

                if self.params.get("injected_sample_weight", False):
                    # print("Dont use sample weight in model evaluation")
                    sample_weight = None
                    sample_weight_validation = None

                self.callbacks.on_iteration_end(
                    {"iter_cnt": i},
                    self.predictions(
                        learner,
                        self.preprocessings[-1],
                        X_train,
                        y_train,
                        sample_weight,
                        X_validation,
                        y_validation,
                        sample_weight_validation,
                    ),
                )

                if learner.stop_training:
                    break
                learner.update({"step": i})

            # end of learner iters loop
            self.callbacks.on_learner_train_end()

            model_path = os.path.join(results_path, model_subpath)
            learner.interpret(
                X_train,
                y_train,
                X_validation,
                y_validation,
                model_file_path=model_path,
                learner_name=learner.name,
                class_names=self.preprocessings[-1].get_target_class_names(),
                metric_name=self.get_metric_name(),
                ml_task=self._ml_task,
                explain_level=self._explain_level,
            )

            # save learner and free the memory
            p = os.path.join(model_path, learner.get_fname())
            learner.save(p)
            del learner.model
            learner.model = None
            # end of learner training
    # end of validation loop
    self.callbacks.on_framework_train_end()
    # self.get_additional_metrics()
    self._additional_metrics = self.get_additional_metrics()
    self.train_time = time.time() - start_time
    logger.debug("ModelFramework end of training")
def train(self):  # , data):
    logger.debug(f"ModelFramework.train {self.learner_params.get('model_type')}")

    start_time = time.time()
    np.random.seed(self.learner_params["seed"])

    self.validation = ValidationStep(self.validation_params)

    for k_fold in range(self.validation.get_n_splits()):
        train_data, validation_data = self.validation.get_split(k_fold)
        logger.debug(
            "Data split, train X:{} y:{}, validation X:{}, y:{}".format(
                train_data["X"].shape,
                train_data["y"].shape,
                validation_data["X"].shape,
                validation_data["y"].shape,
            )
        )

        # the preprocessing is done at every validation step
        self.preprocessings += [Preprocessing(self.preprocessing_params)]
        X_train, y_train = self.preprocessings[-1].fit_and_transform(
            train_data["X"], train_data["y"]
        )
        X_validation, y_validation = self.preprocessings[-1].transform(
            validation_data["X"], validation_data["y"]
        )

        self.learners += [AlgorithmFactory.get_algorithm(self.learner_params)]
        learner = self.learners[-1]

        self.callbacks.add_and_set_learner(learner)
        self.callbacks.on_learner_train_start()

        for i in range(learner.max_iters):
            self.callbacks.on_iteration_start()

            learner.fit(X_train, y_train)

            self.callbacks.on_iteration_end(
                {"iter_cnt": i},
                self.predictions(
                    learner,
                    self.preprocessings[-1],
                    X_train,
                    y_train,
                    X_validation,
                    y_validation,
                ),
            )
            if learner.stop_training:
                break
            learner.update({"step": i})

        # end of learner iters loop
        self.callbacks.on_learner_train_end()
    # end of validation loop
    self.callbacks.on_framework_train_end()
    self.train_time = time.time() - start_time
    self.get_additional_metrics()
    logger.debug("ModelFramework end of training")
def train(self, model_path):
    logger.debug(f"ModelFramework.train {self.learner_params.get('model_type')}")

    start_time = time.time()
    np.random.seed(self.learner_params["seed"])

    self.validation = ValidationStep(self.validation_params)

    for k_fold in range(self.validation.get_n_splits()):
        train_data, validation_data = self.validation.get_split(k_fold)
        logger.debug(
            "Data split, train X:{} y:{}, validation X:{}, y:{}".format(
                train_data["X"].shape,
                train_data["y"].shape,
                validation_data["X"].shape,
                validation_data["y"].shape,
            )
        )

        # the preprocessing is done at every validation step
        self.preprocessings += [Preprocessing(self.preprocessing_params)]
        X_train, y_train = self.preprocessings[-1].fit_and_transform(
            train_data["X"], train_data["y"]
        )
        X_validation, y_validation = self.preprocessings[-1].transform(
            validation_data["X"], validation_data["y"]
        )

        self.learner_params["explain_level"] = self._explain_level
        self.learners += [
            AlgorithmFactory.get_algorithm(copy.deepcopy(self.learner_params))
        ]
        learner = self.learners[-1]

        self.callbacks.add_and_set_learner(learner)
        self.callbacks.on_learner_train_start()

        log_to_file = os.path.join(model_path, f"learner_{k_fold+1}_training.log")

        for i in range(learner.max_iters):
            self.callbacks.on_iteration_start()

            learner.fit(X_train, y_train, X_validation, y_validation, log_to_file)

            self.callbacks.on_iteration_end(
                {"iter_cnt": i},
                self.predictions(
                    learner,
                    self.preprocessings[-1],
                    X_train,
                    y_train,
                    X_validation,
                    y_validation,
                ),
            )
            if learner.stop_training:
                break
            learner.update({"step": i})

        # end of learner iters loop
        self.callbacks.on_learner_train_end()

        learner.interpret(
            X_train,
            y_train,
            X_validation,
            y_validation,
            model_file_path=model_path,
            learner_name=f"learner_{k_fold+1}",
            class_names=self.preprocessings[-1].get_target_class_names(),
            metric_name=self.get_metric_name(),
            ml_task=self._ml_task,
            explain_level=self._explain_level,
        )

        # save learner and free the memory
        p = os.path.join(
            model_path, f"learner_{k_fold+1}.{learner.file_extension()}"
        )
        learner.save(p)
        del learner.model
        learner.model = None
        # end of learner training
    # end of validation loop
    self.callbacks.on_framework_train_end()
    self.get_additional_metrics()
    self.train_time = time.time() - start_time
    logger.debug("ModelFramework end of training")