def load(model_path):
    """Rebuild a ModelFramework from the ``framework.json`` in *model_path*.

    Args:
        model_path: Directory that contains ``framework.json``.

    Returns:
        A ``ModelFramework`` constructed from the stored ``"params"`` with
        its metadata, out-of-fold predictions (when a predictions file name
        is recorded), learners and preprocessings restored.

    Raises:
        OSError: If ``framework.json`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the description has no ``"params"`` entry.
    """
    logger.info(f"Loading model framework from {model_path}")

    # Read the description inside a context manager so the file handle is
    # closed as soon as parsing finishes instead of being left to the GC.
    with open(os.path.join(model_path, "framework.json")) as fin:
        json_desc = json.load(fin)

    mf = ModelFramework(json_desc["params"])

    # Optional metadata: a missing key keeps the value set by the constructor.
    mf.uid = json_desc.get("uid", mf.uid)
    mf._name = json_desc.get("name", mf._name)
    # Unlike the other fields, a missing threshold resets the value to None.
    mf._threshold = json_desc.get("threshold")
    mf.train_time = json_desc.get("train_time", mf.train_time)
    mf.final_loss = json_desc.get("final_loss", mf.final_loss)
    mf.metric_name = json_desc.get("metric_name", mf.metric_name)
    mf._is_stacked = json_desc.get("is_stacked", mf._is_stacked)

    predictions_fname = json_desc.get("predictions_fname")
    if predictions_fname is not None:
        # NOTE(review): the path is used exactly as stored in the JSON and is
        # not joined with model_path -- confirm this is what the writer expects.
        mf.oof_predictions = pd.read_csv(predictions_fname)

    # Each learner description is paired positionally with its saved path.
    mf.learners = [
        AlgorithmFactory.load(learner_desc, learner_path)
        for learner_desc, learner_path in zip(
            json_desc.get("learners"), json_desc.get("saved")
        )
    ]

    mf.preprocessings = []
    for preprocessing_desc in json_desc.get("preprocessing"):
        ps = Preprocessing()
        ps.from_json(preprocessing_desc)
        mf.preprocessings.append(ps)

    return mf
def test_empty_column(self):
    """Check that a column configured with ``remove_column`` is dropped.

    The all-NaN ``col1`` must be absent after ``fit_and_transform``, after a
    later ``transform``, and after transforming with a second
    ``Preprocessing`` restored through ``to_json``/``from_json``.
    """
    # training data
    d = {
        "col1": [np.nan, np.nan, np.nan, np.nan],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    preprocessing_params = {"columns_preprocessing": {"col1": ["remove_column"]}}
    ps = Preprocessing(preprocessing_params)

    # assertIn/assertNotIn are used instead of assertTrue(... in ...) so that a
    # failure reports the offending member and the container.
    X_train1, _ = ps.fit_and_transform(X_train, y_train)
    self.assertNotIn("col1", X_train1.columns)
    self.assertEqual(3, len(X_train1.columns))

    X_train2, _ = ps.transform(X_train, y_train)
    self.assertNotIn("col1", X_train2.columns)
    self.assertEqual(3, len(X_train2.columns))
    for col in ["col2", "col3", "col4"]:
        self.assertIn(col, X_train2.columns)

    # Round-trip the fitted state through JSON and transform again.
    params_json = ps.to_json()
    ps2 = Preprocessing()
    ps2.from_json(params_json)

    X_train3, _ = ps2.transform(X_train, y_train)
    self.assertNotIn("col1", X_train3.columns)
    self.assertEqual(3, len(X_train3.columns))
    for col in ["col2", "col3", "col4"]:
        self.assertIn(col, X_train3.columns)
def test_to_and_from_json_run_fill_median_convert_integer(self):
    """Fit a Preprocessing, restore it from JSON, and transform test data.

    The restored object is applied to a frame whose target has two NaN
    entries; the test then checks the size and sum of the returned target
    and the first values of ``col1`` and ``col2``.
    """
    feature_cols = ["col1", "col2", "col3", "col4"]
    fill_median = PreprocessingMissingValues.FILL_NA_MEDIAN
    to_integer = PreprocessingCategorical.CONVERT_INTEGER

    # training data
    train_frame = pd.DataFrame(
        data={
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 0, 1],
        }
    )
    X_train = train_frame.loc[:, feature_cols]
    y_train = train_frame.loc[:, "y"]

    fitted = Preprocessing(
        {
            "columns_preprocessing": {
                "col1": [fill_median],
                "col2": [fill_median, to_integer],
                "col4": [fill_median, to_integer],
            },
            "target_preprocessing": [],
        }
    )
    fitted.fit_and_transform(X_train, y_train)

    # Rebuild from the serialized state, then drop the fitted original so
    # that only the restored object can be used below.
    restored = Preprocessing()
    state = fitted.to_json()
    restored.from_json(state)
    del fitted

    # test data: same features as training, target with two missing entries
    test_frame = pd.DataFrame(
        data={
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, np.nan, 1, 1],
        }
    )
    X_out, y_out = restored.transform(
        test_frame.loc[:, feature_cols], test_frame.loc[:, "y"]
    )

    self.assertEqual(2, y_out.shape[0])
    self.assertEqual(2, np.sum(y_out))
    self.assertEqual(1, X_out["col1"].iloc[0])
    self.assertEqual(0, X_out["col2"].iloc[0])
def load(model_path):
    """Rebuild a ModelFramework from the ``framework.json`` in *model_path*.

    Args:
        model_path: Directory that contains ``framework.json``.

    Returns:
        A ``ModelFramework`` constructed from the stored ``"params"`` with
        its uid, name, threshold, learners and preprocessings restored.

    Raises:
        OSError: If ``framework.json`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the description has no ``"params"`` entry.
    """
    logger.info(f"Loading model framework from {model_path}")

    # Read the description inside a context manager so the file handle is
    # closed as soon as parsing finishes instead of being left to the GC.
    with open(os.path.join(model_path, "framework.json")) as fin:
        json_desc = json.load(fin)

    mf = ModelFramework(json_desc["params"])

    # A missing uid/name keeps the value set by the constructor.
    mf.uid = json_desc.get("uid", mf.uid)
    mf._name = json_desc.get("name", mf._name)
    # A missing threshold resets the value to None.
    mf._threshold = json_desc.get("threshold")

    # Each learner description is paired positionally with its saved path.
    mf.learners = [
        AlgorithmFactory.load(learner_desc, learner_path)
        for learner_desc, learner_path in zip(
            json_desc.get("learners"), json_desc.get("saved")
        )
    ]

    mf.preprocessings = []
    for preprocessing_desc in json_desc.get("preprocessing"):
        ps = Preprocessing()
        ps.from_json(preprocessing_desc)
        mf.preprocessings.append(ps)

    return mf