def test_save_and_load(self):
    """Round-trip a trained learner through JSON and check it predicts identically.

    Also checks the raw categorical values are still present in the input
    frame (training must not mutate the caller's data).
    """
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    callbacks = [
        EarlyStopping({"metric": {"name": "logloss"}}),
        MetricLogger({"metric_names": ["logloss", "auc"]}),
    ]
    learner = IterativeLearner(self.train_params, callbacks=callbacks)
    learner.train(self.data)

    logloss = Metric({"name": "logloss"})
    preds_before = learner.predict(self.data["train"]["X"])
    loss_before = logloss(self.data["train"]["y"], preds_before)

    # serialize, then restore into a fresh learner
    json_desc = learner.to_json()
    restored = IterativeLearner(self.train_params, callbacks=[])
    self.assertTrue(learner.uid != restored.uid)
    restored.from_json(json_desc)
    self.assertTrue(learner.uid == restored.uid)

    preds_after = restored.predict(self.data["train"]["X"])
    loss_after = logloss(self.data["train"]["y"], preds_after)
    assert_almost_equal(loss_before, loss_after)

    # every underlying learner must survive the round trip
    restored_uids = [lrn.uid for lrn in restored.learners]
    for uid in (lrn.uid for lrn in learner.learners):
        self.assertTrue(uid in restored_uids)
def test_fit_and_predict_kfold(self):
    """Train with 5-fold cross-validation and validate out-of-fold output.

    Checks that the OOF predictions cover every training row exactly once,
    that the input frame is not mutated (raw 'workclass' values survive),
    and that training-set logloss is below 0.6.
    """
    self.assertTrue(
        "Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    # deep-copy so the kfold validation setting does not leak into other tests
    params = copy.deepcopy(self.train_params)
    params["validation"] = {
        "validation_type": "kfold",
        "k_folds": 5,
        "shuffle": True,
    }
    il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
    il.train(self.data)
    oof = il.get_out_of_folds()
    # each training row appears exactly once in the OOF predictions
    self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
    self.assertTrue(
        np.array_equal(oof.index, self.data["train"]["X"].index))
    # BUG FIX: was assertTrue(a, b) which treats b as the failure message
    # and never compares the two values; assertEqual does the intended check.
    self.assertEqual(oof.shape[0], self.data["train"]["X"].shape[0])
    self.assertTrue(
        "Private" in list(self.data["train"]["X"]["workclass"]))
    y_predicted = il.predict(self.data["train"]["X"])
    self.assertTrue(
        "Private" in list(self.data["train"]["X"]["workclass"]))
    metric = Metric({"name": "logloss"})
    loss = metric(self.data["train"]["y"], y_predicted)
    self.assertTrue(loss < 0.6)
def test_fit_and_predict(self):
    """Train without callbacks; training-set logloss must be below 0.4."""
    learner = IterativeLearner(self.train_params, callbacks=[])
    learner.train(self.data)
    logloss = Metric({"name": "logloss"})
    score = logloss(self.y, learner.predict(self.X))
    self.assertTrue(score < 0.4)
def test_save_and_load(self):
    """Serialize a trained learner to JSON; the restored copy must predict identically."""
    learner = IterativeLearner(self.train_params, callbacks=[])
    learner.train(self.data)
    logloss = Metric({"name": "logloss"})
    score_original = logloss(self.y, learner.predict(self.X))

    # rebuild a second learner from the serialized description
    description = learner.to_json()
    clone = IterativeLearner(description.get("params"), callbacks=[])
    self.assertTrue(learner.uid != clone.uid)
    clone.from_json(description)
    self.assertTrue(learner.uid == clone.uid)

    score_clone = logloss(self.y, clone.predict(self.X))
    assert_almost_equal(score_original, score_clone)

    # all underlying learner uids must carry over
    clone_uids = [lrn.uid for lrn in clone.learners]
    for uid in (lrn.uid for lrn in learner.learners):
        self.assertTrue(uid in clone_uids)
def test_fit_and_predict(self):
    """Train with early stopping and metric logging; logloss must beat 0.4."""
    callbacks = [
        EarlyStopping({"metric": {"name": "logloss"}}),
        MetricLogger({"metric_names": ["logloss", "auc"]}),
    ]
    learner = IterativeLearner(self.train_params, callbacks=callbacks)
    learner.train(self.data)
    predictions = learner.predict(self.X)
    score = Metric({"name": "logloss"})(self.y, predictions)
    self.assertTrue(score < 0.4)
def test_fit_and_predict_split(self):
    """Train with a validation split; the input frame must stay unmodified.

    The raw 'Private' category is checked before training, after training,
    and after prediction to catch in-place preprocessing of caller data.
    """
    train_x = self.data["train"]["X"]
    self.assertTrue("Private" in list(train_x["workclass"]))
    callbacks = [
        EarlyStopping({"metric": {"name": "logloss"}}),
        MetricLogger({"metric_names": ["logloss", "auc"]}),
    ]
    learner = IterativeLearner(self.train_params, callbacks=callbacks)
    learner.train(self.data)
    self.assertTrue("Private" in list(train_x["workclass"]))
    predictions = learner.predict(train_x)
    self.assertTrue("Private" in list(train_x["workclass"]))
    score = Metric({"name": "logloss"})(self.data["train"]["y"], predictions)
    self.assertTrue(score < 0.6)
def run(self):
    """Compute predictions for a batch job and store them as a new DataFrame.

    Loads the model and input frame referenced by ``self.job_params``,
    writes the predictions to a CSV in the organization/project storage
    area, registers the file as a DataFrame record and links it to the
    batch before marking the batch as done.
    """
    logger.info("ComputeBatchPrediction::run")
    batch = MLBatchPrediction.objects.get(pk=self.job_params.get("db_id"))
    # BUG FIX: logger.info("batch", batch) passed `batch` as a %-format
    # argument with no placeholder in the message, which raises a string
    # formatting error when the record is emitted. Use lazy %s instead.
    logger.info("batch %s", batch)
    # job_params example: {'parent_mlmodel': 9, 'parent_dataframe': 1, 'db_id': 1,
    #   'created_by_id': 1, 'parent_organization_id': 1, 'parent_project_id': 1}
    mlmodel = MLModel.objects.get(pk=self.job_params.get("parent_mlmodel"))
    logger.info(mlmodel.save_details)
    logger.info(mlmodel.all_params)

    # restore the trained model from its saved artifacts
    il = IterativeLearner(mlmodel.all_params)
    il.load(mlmodel.save_details)

    logger.info(batch.parent_dataframe.absolute_path)
    input_df = DataServe.get(batch.parent_dataframe.absolute_path)
    predictions = il.predict(input_df)
    logger.info(predictions)

    # write predictions to a uniquely named CSV in org/project storage
    filename = "predictions-{0}.csv".format(str(uuid.uuid4())[:8])
    organization_slug = batch.parent_organization.slug
    project_id = batch.parent_project.id
    relative_dir = "org_{0}_proj_{1}".format(organization_slug, project_id)
    result_absolute_path = Storage().get_path(relative_dir, filename)
    logger.info(result_absolute_path)
    df = pd.DataFrame({"prediction": predictions})
    df.to_csv(result_absolute_path, index=False)

    # register the result file as a mljar DataFrame record
    result_df = DataFrame(
        source_id=self.job_params.get("parent_dataframe"),  # TODO: fix this — should reference the result, not the input
        absolute_path=result_absolute_path,
        file_size=1,  # TODO: store the real file size
        columns_details="",  # we can describe any data frame (always :-))
        preview_absolute_path="",
        created_by_id=self.job_params["created_by_id"],
        parent_organization_id=self.job_params["parent_organization_id"],
        parent_project_id=self.job_params["parent_project_id"],
    )
    result_df.save()
    batch.result_dataframe = result_df
    batch.status = "done"
    batch.save()