def __init__(self, optimize_metric="logloss"):
    """Create an empty ensemble that will be optimized for *optimize_metric*.

    Parameters
    ----------
    optimize_metric : str
        Name of the metric passed to ``Metric`` (default ``"logloss"``).
    """
    self.library_version = "0.1"
    self.uid = str(uuid.uuid4())
    self.model_file = self.uid + ".ensemble.model"
    self.model_file_path = os.path.join(storage_path, self.model_file)
    self.metric = Metric({"name": optimize_metric})
    # Seed with the metric's worst achievable value so any real model improves it.
    self.best_loss = self.metric.get_maximum()  # the best loss obtained by ensemble
    self.models = None
    self.selected_models = []
    self.train_time = None
    self.total_best_sum = None  # total sum of predictions, the oof of ensemble
    self.target = None
def test_copy(self):
    """A copied CatBoost learner predicts like its source and is frozen afterwards."""
    metric = Metric({"name": "logloss"})
    # Train model #1.
    cat = CatBoostLearner(self.params)
    cat.fit(self.X, self.y)
    loss = metric(self.y, cat.predict(self.X))
    # Create model #2; CatBoost builds its model object already in the constructor.
    cat2 = CatBoostLearner(self.params)
    self.assertTrue(cat2.model is not None)
    # Replace model #2 with a copy of model #1 and compare predictions.
    cat2 = cat.copy()
    self.assertEqual(type(cat), type(cat2))
    loss2 = metric(self.y, cat2.predict(self.X))
    self.assertEqual(loss, loss2)
    # Keep training model #1 - its loss should improve...
    cat.fit(self.X, self.y)
    loss3 = metric(self.y, cat.predict(self.X))
    self.assertTrue(loss3 < loss)
    # ...while the copy's predictions must stay unchanged.
    loss4 = metric(self.y, cat2.predict(self.X))
    assert_almost_equal(loss2, loss4)
def test_copy(self):
    """A copied Xgboost learner predicts like its source and is frozen afterwards."""
    metric = Metric({"name": "logloss"})
    params = {"objective": "binary:logistic", "eval_metric": "logloss"}
    # Train model #1.
    xgb = XgbLearner(params)
    xgb.fit(self.X, self.y)
    loss = metric(self.y, xgb.predict(self.X))
    # Model #2 starts with no underlying booster (model is None until fit).
    xgb2 = XgbLearner(params)
    self.assertTrue(xgb2.model is None)
    # Replace it with a copy of model #1 and compare predictions.
    xgb2 = xgb.copy()
    self.assertEqual(type(xgb), type(xgb2))
    loss2 = metric(self.y, xgb2.predict(self.X))
    self.assertEqual(loss, loss2)
    # Keep training model #1 - its loss should improve...
    xgb.fit(self.X, self.y)
    loss3 = metric(self.y, xgb.predict(self.X))
    self.assertTrue(loss3 < loss)
    # ...while the copy's predictions must stay unchanged.
    loss4 = metric(self.y, xgb2.predict(self.X))
    assert_almost_equal(loss2, loss4)
def __init__(self, params):
    """Create a callback that logs the metrics named in ``params["metric_names"]``.

    Parameters
    ----------
    params : dict
        Callback configuration. Recognized keys: ``"name"`` (optional label,
        defaults to ``"metric_logger"``) and ``"metric_names"`` (list of metric
        identifiers; optional).
    """
    super(MetricLogger, self).__init__(params)
    self.name = params.get("name", "metric_logger")
    self.loss_values = {}
    # Robustness fix: without the [] default, a missing "metric_names" key
    # returns None and the iteration raises TypeError.
    self.metrics = [
        Metric({"name": metric_name})
        for metric_name in params.get("metric_names", [])
    ]
def test_fit_and_predict(self):
    """AutoML trains, predicts, and survives a JSON round-trip with the same quality."""
    metric = Metric({"name": "logloss"})
    automl = AutoML(
        total_time_limit=5,
        algorithms=["Xgboost"],
        start_random_models=5,
        hill_climbing_steps=0,
        seed=13,
    )
    automl.fit(self.X, self.y)
    y_predicted = automl.predict(self.X)["p_1"]
    self.assertTrue(y_predicted is not None)
    loss = metric(self.y, y_predicted)
    self.assertTrue(loss < 0.7)
    # Serialize to JSON and restore into a fresh instance.
    params = automl.to_json()
    automl2 = AutoML()
    automl2.from_json(params)
    y_predicted2 = automl2.predict(self.X)["p_1"]
    self.assertTrue(y_predicted2 is not None)
    loss2 = metric(self.y, y_predicted2)
    self.assertTrue(loss2 < 0.7)
    # The classification threshold must survive the round-trip as well.
    assert_almost_equal(automl._threshold, automl2._threshold)
def test_copy(self):
    """A copied NN learner predicts like its source and is frozen afterwards."""
    metric = Metric({"name": "logloss"})
    # Train model #1.
    nn = NeuralNetworkLearner(self.params)
    nn.fit(self.X, self.y)
    loss = metric(self.y, nn.predict(self.X))
    # Model #2 is not initialized in the constructor (model stays None).
    nn2 = NeuralNetworkLearner(self.params)
    self.assertTrue(nn2.model is None)
    # Replace it with a copy of model #1 and compare predictions.
    nn2 = nn.copy()
    self.assertEqual(type(nn), type(nn2))
    loss2 = metric(self.y, nn2.predict(self.X))
    self.assertEqual(loss, loss2)
    # Keep training model #1 - its loss should improve...
    nn.fit(self.X, self.y)
    loss3 = metric(self.y, nn.predict(self.X))
    self.assertTrue(loss3 < loss)
    # ...while the copy's predictions must stay unchanged.
    loss4 = metric(self.y, nn2.predict(self.X))
    assert_almost_equal(loss2, loss4)
def test_save_and_load(self):
    """An IterativeLearner restored from JSON reproduces uid, learners, and loss."""
    # Sanity check: training data must not have been mutated by earlier tests.
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)
    y_predicted = il.predict(self.data["train"]["X"])
    metric = Metric({"name": "logloss"})
    loss_1 = metric(self.data["train"]["y"], y_predicted)
    # Round-trip through JSON into a fresh learner.
    json_desc = il.to_json()
    il2 = IterativeLearner(self.train_params, callbacks=[])
    self.assertTrue(il.uid != il2.uid)
    il2.from_json(json_desc)
    self.assertTrue(il.uid == il2.uid)
    y_predicted_2 = il2.predict(self.data["train"]["X"])
    loss_2 = metric(self.data["train"]["y"], y_predicted_2)
    assert_almost_equal(loss_1, loss_2)
    # Every inner learner uid must be preserved by the round-trip.
    uids = [i.uid for i in il.learners]
    uids2 = [i.uid for i in il2.learners]
    for u in uids:
        self.assertTrue(u in uids2)
def test_copy(self):
    """A copied LightGBM learner predicts like its source and is frozen afterwards."""
    metric = Metric({"name": "logloss"})
    # Train model #1.
    lgb = LightgbmLearner(self.params)
    lgb.fit(self.X, self.y)
    loss = metric(self.y, lgb.predict(self.X))
    # Model #2 starts with model set to None.
    lgb2 = LightgbmLearner(self.params)
    self.assertTrue(lgb2.model is None)
    # Replace it with a copy of model #1 and compare predictions.
    lgb2 = lgb.copy()
    self.assertEqual(type(lgb), type(lgb2))
    loss2 = metric(self.y, lgb2.predict(self.X))
    self.assertEqual(loss, loss2)
    # Keep training model #1 - its loss should improve...
    lgb.fit(self.X, self.y)
    loss3 = metric(self.y, lgb.predict(self.X))
    self.assertTrue(loss3 < loss)
    # ...while the copy's predictions must stay unchanged.
    loss4 = metric(self.y, lgb2.predict(self.X))
    assert_almost_equal(loss2, loss4)
def test_fit_and_predict_kfold(self):
    """Train with 5-fold CV; OOF predictions must align with the training index
    and the input data must not be mutated by training or prediction."""
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    params = copy.deepcopy(self.train_params)
    params["validation"] = {
        "validation_type": "kfold",
        "k_folds": 5,
        "shuffle": True,
    }
    il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
    il.train(self.data)
    oof = il.get_out_of_folds()
    # Each training row gets exactly one out-of-fold prediction.
    self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
    self.assertTrue(np.array_equal(oof.index, self.data["train"]["X"].index))
    # BUG FIX: original used assertTrue(a, b) - the second argument is the
    # failure message, so the row counts were never actually compared.
    self.assertEqual(oof.shape[0], self.data["train"]["X"].shape[0])
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    y_predicted = il.predict(self.data["train"]["X"])
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    metric = Metric({"name": "logloss"})
    loss = metric(self.data["train"]["y"], y_predicted)
    self.assertTrue(loss < 0.6)
def test_fit_and_predict(self):
    """Training without callbacks still reaches an acceptable logloss."""
    il = IterativeLearner(self.train_params, callbacks=[])
    il.train(self.data)
    y_predicted = il.predict(self.X)
    metric = Metric({"name": "logloss"})
    self.assertTrue(metric(self.y, y_predicted) < 0.4)
def test_fit_predict(self):
    """A single fit of the Random Forest learner reaches an acceptable logloss."""
    metric = Metric({"name": "logloss"})
    rf = RandomForestLearner({"trees_in_step": 50})
    rf.fit(self.X, self.y)
    y_predicted = rf.predict(self.X)
    self.assertTrue(metric(self.y, y_predicted) < 0.6)
def __init__(self, params):
    """Create an early-stopping callback.

    Parameters
    ----------
    params : dict
        Recognized keys: ``"name"``, ``"metric"`` (config passed to ``Metric``),
        ``"max_no_improvement_cnt"`` (default 5), ``"keep_best_model"``
        (default True).
    """
    super(EarlyStopping, self).__init__(params)
    self.name = params.get("name", "early_stopping")
    self.metric = Metric(params.get("metric"))
    self.max_no_improvement_cnt = params.get("max_no_improvement_cnt", 5)
    self.keep_best_model = params.get("keep_best_model", True)
    # Per-learner bookkeeping, keyed by learner.
    self.best_loss = {}
    self.loss_values = {}
    self.best_models = {}
    self.best_y_predicted = {}
    self.best_y_oof = None  # predictions computed on out of folds or on validation set
    self.final_loss = None  # final score computed on combined predictions from all learners
    # Path to best model local copy, only used if cannot deep copy.
    self.best_model_paths = {}
def __init__(self):
    """Create an empty ensemble; currently only logloss can be optimized."""
    self.library_version = "0.1"
    self.uid = str(uuid.uuid4())
    self.model_file = self.uid + ".ensemble.model"
    # NOTE(review): "/tmp/" is hardcoded here while a sibling implementation
    # uses os.path.join(storage_path, ...) - consider unifying; confirm intent.
    self.model_file_path = "/tmp/" + self.model_file
    # Right now only logloss can be optimized by ensemble.
    self.metric = Metric({"name": "logloss"})
    self.best_loss = 10e12  # the best loss obtained by ensemble
    self.models = None
    self.selected_models = []
    self.train_time = None
def test_fit_predict(self):
    """Each additional fit of the NN learner must strictly improve logloss."""
    metric = Metric({"name": "logloss"})
    nn = NeuralNetworkLearner(self.params)
    loss_prev = None
    for _ in range(5):
        nn.fit(self.X, self.y)
        loss = metric(self.y, nn.predict(self.X))
        if loss_prev is not None:
            # Require improvement beyond a tiny numerical margin.
            self.assertTrue(loss + 0.000001 < loss_prev)
        loss_prev = loss
def test_fit_predict(self):
    """Each additional fit of the CatBoost learner must strictly improve logloss."""
    metric = Metric({"name": "logloss"})
    cat = CatBoostLearner(self.params)
    loss_prev = None
    for _ in range(5):
        cat.fit(self.X, self.y)
        loss = metric(self.y, cat.predict(self.X))
        if loss_prev is not None:
            # Require improvement beyond a small numerical margin.
            self.assertTrue(loss + 0.001 < loss_prev)
        loss_prev = loss
def test_fit_and_predict(self):
    """Training with early-stopping and metric-logging callbacks reaches an acceptable logloss."""
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)
    y_predicted = il.predict(self.X)
    metric = Metric({"name": "logloss"})
    self.assertTrue(metric(self.y, y_predicted) < 0.4)
def test_reproduce_fit(self):
    """With a fixed seed, repeated Random Forest training must give identical loss."""
    metric = Metric({"name": "logloss"})
    params = {"trees_in_step": 1, "seed": 1}
    prev_loss = None
    for _ in range(3):
        model = RandomForestLearner(params)
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if prev_loss is not None:
            assert_almost_equal(prev_loss, loss)
        prev_loss = loss
def test_fit_predict(self):
    """Each additional fit of the Random Forest learner must strictly improve logloss."""
    metric = Metric({"name": "logloss"})
    rf = RandomForestLearner({"trees_in_step": 1})
    loss_prev = None
    for _ in range(2):
        rf.fit(self.X, self.y)
        loss = metric(self.y, rf.predict(self.X))
        if loss_prev is not None:
            # Require improvement beyond a tiny numerical margin.
            self.assertTrue(loss + 0.00001 < loss_prev)
        loss_prev = loss
def __init__(self):
    """Create an empty ensemble; currently only logloss can be optimized."""
    self.library_version = "0.1"
    self.uid = str(uuid.uuid4())
    self.model_file = self.uid + ".ensemble.model"
    self.model_file_path = os.path.join(storage_path, self.model_file)
    # Right now only logloss can be optimized by ensemble.
    self.metric = Metric({"name": "logloss"})
    self.best_loss = 10e12  # the best loss obtained by ensemble
    self.models = None
    self.selected_models = []
    self.train_time = None
    self.total_best_sum = None  # total sum of predictions, the oof of ensemble
    self.target = None
def test_fit_predict(self):
    """Each additional fit of the Xgboost learner must strictly improve logloss."""
    metric = Metric({"name": "logloss"})
    params = {"objective": "binary:logistic", "eval_metric": "logloss"}
    xgb = XgbLearner(params)
    loss_prev = None
    for _ in range(5):
        xgb.fit(self.X, self.y)
        loss = metric(self.y, xgb.predict(self.X))
        if loss_prev is not None:
            # Require improvement beyond a small numerical margin.
            self.assertTrue(loss + 0.001 < loss_prev)
        loss_prev = loss
def test_fit_predict(self):
    """Each additional fit of the LightGBM learner must strictly improve logloss."""
    metric = Metric({"name": "logloss"})
    lgb = LightgbmLearner(self.params)
    loss_prev = None
    for _ in range(5):
        lgb.fit(self.X, self.y)
        loss = metric(self.y, lgb.predict(self.X))
        if loss_prev is not None:
            # Require improvement beyond a small numerical margin.
            self.assertTrue(loss + 0.001 < loss_prev)
        loss_prev = loss
def test_fit_and_predict_split(self):
    """Train with a split validation; input data must stay unmutated and loss acceptable."""
    # Sanity check before, during, and after training/prediction that the
    # categorical column has not been transformed in place.
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    y_predicted = il.predict(self.data["train"]["X"])
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    metric = Metric({"name": "logloss"})
    loss = metric(self.data["train"]["y"], y_predicted)
    self.assertTrue(loss < 0.6)
def test_reproduce_fit(self):
    """With a fixed seed, repeated Xgboost training must give identical loss."""
    metric = Metric({"name": "logloss"})
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "seed": 1,
    }
    prev_loss = None
    for _ in range(3):
        xgb = XgbLearner(params)
        xgb.fit(self.X, self.y)
        loss = metric(self.y, xgb.predict(self.X))
        if prev_loss is not None:
            assert_almost_equal(prev_loss, loss)
        prev_loss = loss
def test_save_and_load(self):
    """A Random Forest learner restored from its saved form reproduces uid and predictions."""
    metric = Metric({"name": "logloss"})
    rf = RandomForestLearner({})
    rf.fit(self.X, self.y)
    loss = metric(self.y, rf.predict(self.X))
    # Round-trip through the serialized description.
    json_desc = rf.save()
    rf2 = RandomForestLearner({})
    self.assertTrue(rf.uid != rf2.uid)
    rf2.load(json_desc)
    self.assertTrue(rf.uid == rf2.uid)
    loss2 = metric(self.y, rf2.predict(self.X))
    assert_almost_equal(loss, loss2)
def test_save_and_load(self):
    """A LightGBM learner restored from its saved form reproduces uid and predictions."""
    metric = Metric({"name": "logloss"})
    lgb = LightgbmLearner(self.params)
    lgb.fit(self.X, self.y)
    loss = metric(self.y, lgb.predict(self.X))
    # Round-trip through the serialized description.
    json_desc = lgb.save()
    lgb2 = LightgbmLearner({})
    self.assertTrue(lgb.uid != lgb2.uid)
    self.assertTrue(lgb2.model is None)
    lgb2.load(json_desc)
    self.assertTrue(lgb.uid == lgb2.uid)
    loss2 = metric(self.y, lgb2.predict(self.X))
    assert_almost_equal(loss, loss2)
def test_save_and_load(self):
    """A CatBoost learner restored from its saved form reproduces uid and predictions."""
    metric = Metric({"name": "logloss"})
    cat = CatBoostLearner(self.params)
    cat.fit(self.X, self.y)
    loss = metric(self.y, cat.predict(self.X))
    # Round-trip through the serialized description; CatBoost creates a
    # model object already in the constructor, hence the not-None check.
    json_desc = cat.save()
    cat2 = CatBoostLearner({})
    self.assertTrue(cat.uid != cat2.uid)
    self.assertTrue(cat2.model is not None)
    cat2.load(json_desc)
    self.assertTrue(cat.uid == cat2.uid)
    loss2 = metric(self.y, cat2.predict(self.X))
    assert_almost_equal(loss, loss2)
def test_save_and_load(self):
    """A NN learner restored from its saved form reproduces uid and predictions."""
    metric = Metric({"name": "logloss"})
    nn = NeuralNetworkLearner(self.params)
    nn.fit(self.X, self.y)
    loss = metric(self.y, nn.predict(self.X))
    # Round-trip through the serialized description.
    json_desc = nn.save()
    nn2 = NeuralNetworkLearner({})
    self.assertTrue(nn.uid != nn2.uid)
    self.assertTrue(nn2.model is None)
    nn2.load(json_desc)
    self.assertTrue(nn.uid == nn2.uid)
    loss2 = metric(self.y, nn2.predict(self.X))
    assert_almost_equal(loss, loss2)
def test_save_and_load(self):
    """An Xgboost learner restored from its saved form reproduces uid and predictions."""
    metric = Metric({"name": "logloss"})
    params = {"objective": "binary:logistic", "eval_metric": "logloss"}
    xgb = XgbLearner(params)
    xgb.fit(self.X, self.y)
    loss = metric(self.y, xgb.predict(self.X))
    # Round-trip through the serialized description.
    json_desc = xgb.save()
    xgb2 = XgbLearner(params)
    self.assertTrue(xgb.uid != xgb2.uid)
    self.assertTrue(xgb2.model is None)
    xgb2.load(json_desc)
    self.assertTrue(xgb.uid == xgb2.uid)
    loss2 = metric(self.y, xgb2.predict(self.X))
    assert_almost_equal(loss, loss2)
def test_reproduce_fit(self):
    """With a fixed seed, two full AutoML runs must give (nearly) identical loss."""
    metric = Metric({"name": "logloss"})
    losses = []
    for _ in range(2):
        automl = AutoML(
            # Time limit large enough that training is never interrupted.
            total_time_limit=10000,
            algorithms=["Xgboost"],
            start_random_models=2,
            hill_climbing_steps=1,
            train_ensemble=True,
            verbose=True,
            seed=12,
        )
        automl.fit(self.X, self.y)
        y_predicted = automl.predict(self.X)["p_1"]
        losses.append(metric(self.y, y_predicted))
    assert_almost_equal(losses[0], losses[1], decimal=4)
def test_save_and_load(self):
    """An IterativeLearner restored from to_json reproduces uid, learners, and loss."""
    il = IterativeLearner(self.train_params, callbacks=[])
    il.train(self.data)
    metric = Metric({"name": "logloss"})
    loss = metric(self.y, il.predict(self.X))
    # Round-trip through the JSON description into a fresh learner.
    json_desc = il.to_json()
    il2 = IterativeLearner(json_desc.get("params"), callbacks=[])
    self.assertTrue(il.uid != il2.uid)
    il2.from_json(json_desc)
    self.assertTrue(il.uid == il2.uid)
    loss2 = metric(self.y, il2.predict(self.X))
    assert_almost_equal(loss, loss2)
    # Every inner learner uid must be preserved by the round-trip.
    uids = [i.uid for i in il.learners]
    uids2 = [i.uid for i in il2.learners]
    for u in uids:
        self.assertTrue(u in uids2)