def test_save_and_load(self):
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)
    y_predicted = il.predict(self.data["train"]["X"])
    metric = Metric({"name": "logloss"})
    loss_1 = metric(self.data["train"]["y"], y_predicted)

    # serialize the learner and restore it into a fresh instance
    json_desc = il.to_json()
    il2 = IterativeLearner(self.train_params, callbacks=[])
    self.assertTrue(il.uid != il2.uid)
    il2.from_json(json_desc)
    self.assertTrue(il.uid == il2.uid)

    # the restored learner must reproduce the original predictions
    y_predicted_2 = il2.predict(self.data["train"]["X"])
    loss_2 = metric(self.data["train"]["y"], y_predicted_2)
    assert_almost_equal(loss_1, loss_2)

    uids = [i.uid for i in il.learners]
    uids2 = [i.uid for i in il2.learners]
    for u in uids:
        self.assertTrue(u in uids2)
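# A minimal sketch of the fixture these tests assume (not taken from the original
# sources): `self.data` wraps the training frame as {"train": {"X": ..., "y": ...}}
# and `self.train_params` uses the same {"preprocessing", "validation", "learner"}
# layout that run() builds further below. The file path, column names, and learner
# settings here are assumptions chosen only to make the sketch self-contained.
import pandas as pd


def setUp(self):
    df = pd.read_csv("adult.csv")  # hypothetical path to the UCI Adult/income data
    x_cols = [c for c in df.columns if c != "income"]
    self.data = {"train": {"X": df[x_cols], "y": df["income"]}}
    self.train_params = {
        "preprocessing": {},
        "validation": {"validation_type": "split", "train_ratio": 0.75},
        "learner": {"learner_type": "Xgboost", "max_iters": 3, "max_depth": 1},
    }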
def test_fit_and_predict_kfold(self):
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

    params = copy.deepcopy(self.train_params)
    params["validation"] = {
        "validation_type": "kfold",
        "k_folds": 5,
        "shuffle": True,
    }

    il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
    il.train(self.data)

    # out-of-fold predictions should cover every training row exactly once
    oof = il.get_out_of_folds()
    self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
    self.assertTrue(np.array_equal(oof.index, self.data["train"]["X"].index))
    self.assertEqual(oof.shape[0], self.data["train"]["X"].shape[0])

    # training and prediction must not mutate the input data
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    y_predicted = il.predict(self.data["train"]["X"])
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

    metric = Metric({"name": "logloss"})
    loss = metric(self.data["train"]["y"], y_predicted)
    self.assertTrue(loss < 0.6)
def train_model(self, params):
    model_path = os.path.join(self._results_path, params["name"])

    # prepare callbacks: early stopping on the optimized metric and a hard time limit
    early_stop = EarlyStopping(
        {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
    )
    time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
    mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

    if self._enough_time_to_train(mf.get_type()):
        logger.info(
            f"Train model #{len(self._models)+1} / Model name: {params['name']}"
        )
        try:
            os.mkdir(model_path)
        except Exception as e:
            raise AutoMLException(f"Cannot create directory {model_path}")
        mf.train()  # {"train": {"X": X, "y": y}}
        mf.save(model_path)
        self._model_paths += [model_path]
        self.keep_model(mf)
    else:
        logger.info(
            f"Cannot check more models of {mf.get_type()} because of time constraint"
        )
def run(self):
    # update status
    mlmodel = MLModel.objects.get(pk=self.job_params.get("db_id"))
    mlmodel.status = "started"
    mlmodel.save()

    mlexperiment = MLExperiment.objects.get(pk=mlmodel.parent_experiment_id)
    print("mlexperiment", mlexperiment.id)
    print(mlexperiment.parent_columns_usage)

    # prepare data
    columns_usage = mlexperiment.parent_columns_usage.columns_usage
    print("cols", columns_usage)
    training_dataframe = mlexperiment.parent_training_dataframe
    print("training", training_dataframe.absolute_path)

    metric_params = mlexperiment.params.get("metric")
    validation_params = mlexperiment.params.get("validation")
    preprocessing_params = mlexperiment.params.get("preprocessing")

    df_train = DataServe.get(training_dataframe.absolute_path)
    training_data = {
        "train": {
            "X": df_train[columns_usage.get("input")],
            "y": df_train[columns_usage.get("target")],
        }
    }

    # prepare model hyperparameters
    learner_params = {
        "learner_type": mlmodel.model_type,
        "max_iters": 3,
        "max_depth": 1,
    }
    for k, v in mlmodel.params.items():
        learner_params[k] = v

    train_params = {
        "preprocessing": preprocessing_params,
        "validation": validation_params,
        "learner": learner_params,
    }
    print(train_params)

    # prepare needed callbacks
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

    # run the training
    il = IterativeLearner(train_params, callbacks=[early_stop, metric_logger])
    il.train(training_data)

    # save the model
    save_details = il.save()
    logger.info(save_details)

    # store model details in the platform database
    mlmodel.status = "done"
    mlmodel.save_details = save_details
    mlmodel.all_params = train_params  # all parameters are needed for model loading
    mlmodel.save()
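# For illustration only (not from the original sources): run() above expects
# `columns_usage` to map column roles to lists of column names. The names below
# are assumptions chosen to match the Adult/income data used in the tests.
example_columns_usage = {
    "input": ["age", "workclass", "education", "hours-per-week"],
    "target": ["income"],
}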
def train_model(self, params):
    # do we have enough time to train?
    # if not, skip
    if not self._time_ctrl.enough_time(
        params["learner"]["model_type"], self._fit_level
    ):
        logger.info(f"Cannot train {params['name']} because of the time constraint")
        return False

    # let's create directory to log all training artifacts
    model_path = os.path.join(self._results_path, params["name"])
    self.create_dir(model_path)

    # prepare callbacks
    early_stop = EarlyStopping(
        {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
    )
    learner_time_constraint = LearnerTimeConstraint(
        {
            "learner_time_limit": self._time_ctrl.learner_time_limit(
                params["learner"]["model_type"],
                self._fit_level,
                self._validation.get("k_folds", 1.0),
            ),
            "min_steps": params["additional"].get("min_steps"),
        }
    )
    total_time_constraint = TotalTimeConstraint(
        {
            "total_time_limit": self._total_time_limit
            if self._model_time_limit is None
            else None,
            "total_time_start": self._start_time,
        }
    )

    # create model framework
    mf = ModelFramework(
        params,
        callbacks=[early_stop, learner_time_constraint, total_time_constraint],
    )

    # start training
    logger.info(
        f"Train model #{len(self._models)+1} / Model name: {params['name']}"
    )
    mf.train(model_path)

    # save the model
    mf.save(model_path)

    # and keep info about the model
    self.keep_model(mf, model_path)
    return True
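# Illustrative stand-in (not the real implementation) for the `self._time_ctrl`
# object used in train_model() above: it only needs to answer "is there time left
# for this model type at this fit level?" and "how long may a single learner run?".
# The budgeting rule below (an equal share of the remaining budget, divided across
# folds) is an assumption for the sketch, not the project's actual policy.
import time


class SimpleTimeController:
    def __init__(self, total_time_limit, models_to_train):
        self._start = time.time()
        self._total_time_limit = total_time_limit
        self._models_to_train = models_to_train

    def _time_left(self):
        return self._total_time_limit - (time.time() - self._start)

    def enough_time(self, model_type, fit_level):
        # train another model only if some budget remains
        return self._time_left() > 0

    def learner_time_limit(self, model_type, fit_level, k_folds):
        # split the remaining budget evenly across models and folds
        share = self._time_left() / max(self._models_to_train, 1) / max(k_folds, 1.0)
        return max(share, 0.0)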
def train_model(self, params):
    model_path = os.path.join(self._results_path, params["name"])

    early_stop = EarlyStopping(
        {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
    )
    learner_time_constraint = LearnerTimeConstraint(
        {
            "learner_time_limit": self._get_learner_time_limit(
                params["learner"]["model_type"]
            ),  # self._time_limit
            "min_steps": params["additional"].get("min_steps"),
        }
    )
    total_time_constraint = TotalTimeConstraint(
        {
            "total_time_limit": self._total_time_limit
            if self._model_time_limit is None
            else None,
            "total_time_start": self._start_time,
        }
    )
    mf = ModelFramework(
        params,
        callbacks=[early_stop, learner_time_constraint, total_time_constraint],
    )

    if self._enough_time_to_train(mf.get_type()):
        # self.verbose_print(params["name"] + " training start ...")
        logger.info(
            f"Train model #{len(self._models)+1} / Model name: {params['name']}"
        )
        try:
            os.mkdir(model_path)
        except Exception as e:
            raise AutoMLException(f"Cannot create directory {model_path}")
        mf.train(model_path)
        mf.save(model_path)
        self._model_paths += [model_path]
        self.keep_model(mf)
        # save the best one in the case the training will be interrupted
        self.select_and_save_best()
    else:
        logger.info(f"Cannot train {mf.get_type()} because of time constraint")
def train_model(self, params, X, y):
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
    il = IterativeLearner(params, callbacks=[early_stop, time_constraint])

    # skip parameter sets that were already trained
    il_key = il.get_params_key()
    if il_key in self._models_params_keys:
        return None
    self._models_params_keys += [il_key]

    if self.should_train_next(il.get_name()):
        il.train({"train": {"X": X, "y": y}})
        return il
    return None
def test_fit_and_predict(self):
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)

    y_predicted = il.predict(self.X)
    metric = Metric({"name": "logloss"})
    loss = metric(self.y, y_predicted)
    self.assertTrue(loss < 0.4)
def test_fit_and_predict_split(self):
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)

    # training and prediction must not mutate the categorical input column
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    y_predicted = il.predict(self.data["train"]["X"])
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

    metric = Metric({"name": "logloss"})
    loss = metric(self.data["train"]["y"], y_predicted)
    self.assertTrue(loss < 0.6)
def train_model(self, params, X, y):
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    early_stop = EarlyStopping({"metric": {"name": self._optimize_metric}})
    time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
    il = IterativeLearner(
        params, callbacks=[early_stop, time_constraint, metric_logger]
    )

    # skip parameter sets that were already trained
    il_key = il.get_params_key()
    if il_key in self._models_params_keys:
        self._progress_bar.update(1)
        return None
    self._models_params_keys += [il_key]

    if self.should_train_next(il.get_name()):
        il.train({"train": {"X": X, "y": y}})
        self._progress_bar.update(1)
        return il

    self._progress_bar.update(1)
    return None
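# Usage sketch (assumption, not original code): a tuner produces candidate parameter
# sets and train_model() above silently skips duplicates, so a caller can simply
# collect the non-None results. `generate_params` and `self._models` are hypothetical
# names used only to illustrate the calling pattern.
def fit_candidates(self, X, y):
    for params in self.generate_params():  # hypothetical parameter generator
        model = self.train_model(params, X, y)
        if model is not None:  # None means duplicate params or training was skipped
            self._models += [model]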