def test_run_on_y_only_validation(self):
        """Run preprocessing on target-only train and validation data.

        Asserts that the string targets come back as integer codes and that
        the validation rows with a missing target are no longer present.
        """
        train_frame = pd.DataFrame(data={"y": ["a", "b", "a", "b"]})
        valid_frame = pd.DataFrame(data={"y": [np.nan, "a", np.nan, "b"]})

        step = PreprocessingStep(
            {
                "target_preprocessing": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ]
            }
        )

        processed_train, processed_valid = step.run(
            train_data={"y": train_frame.loc[:, "y"]},
            validation_data={"y": valid_frame.loc[:, "y"]},
        )
        y_train = processed_train.get("y")
        y_test = processed_valid.get("y")

        # All four training rows are kept; two of the four validation rows
        # had a NaN target and only two rows remain.
        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(2, y_test.shape[0])

        # In both datasets the first two remaining targets are "a" then "b",
        # expected as codes 0 and 1.
        for encoded in (y_train, y_test):
            self.assertEqual(0, encoded[0])
            self.assertEqual(1, encoded[1])
    def test_run_fill_median_convert_integer(self):
        """Fill missing values with the median and integer-encode the columns.

        Checks selected values of the transformed training features and the
        structure of the parameters exported by ``to_json``.
        """
        feature_names = ["col1", "col2", "col3", "col4"]

        # training data
        frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "y": [0, 1, 0, 1],
            }
        )

        # Every feature column is configured with the same two steps; each
        # column gets its own list object.
        step = PreprocessingStep(
            {
                "columns_preprocessing": {
                    name: [
                        PreprocessingMissingValues.FILL_NA_MEDIAN,
                        PreprocessingCategorical.CONVERT_INTEGER,
                    ]
                    for name in feature_names
                }
            }
        )

        processed, _ = step.run(
            train_data={
                "X": frame.loc[:, feature_names],
                "y": frame.loc[:, "y"],
            }
        )
        X_train = processed.get("X")

        for name in feature_names:
            self.assertIn(name, X_train.columns)

        # Row 2 was NaN in col1 and col2 before preprocessing.
        self.assertEqual(X_train["col1"][2], 1)
        self.assertEqual(X_train["col2"][2], 0)
        # col4 held "a", "a", "b", "c".
        for row, expected_code in enumerate([0, 0, 1, 2]):
            self.assertEqual(X_train["col4"][row], expected_code)

        exported = step.to_json()

        self.assertIn("missing_values", exported)
        self.assertIn("categorical", exported)
        self.assertNotIn("categorical_y", exported)

        first_fill = exported["missing_values"][0]
        self.assertIn("fill_params", first_fill)
        self.assertEqual("na_fill_median", first_fill["fill_method"])

        first_convert = exported["categorical"][0]
        self.assertIn("convert_params", first_convert)
        self.assertEqual("categorical_to_int", first_convert["convert_method"])
    def test_to_and_from_json_run_fill_median_convert_integer(self):
        """Round-trip a fitted step through JSON and transform new data with it.

        A step is fitted on training data, exported with ``to_json``, loaded
        into a fresh ``PreprocessingStep`` and that copy is used to transform
        a validation set containing missing targets.
        """
        feature_names = ["col1", "col2", "col3", "col4"]

        # training data
        train_frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "y": [0, 1, 0, 1],
            }
        )

        fitted = PreprocessingStep(
            {
                "columns_preprocessing": {
                    "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
                    "col2": [
                        PreprocessingMissingValues.FILL_NA_MEDIAN,
                        PreprocessingCategorical.CONVERT_INTEGER,
                    ],
                    "col4": [
                        PreprocessingMissingValues.FILL_NA_MEDIAN,
                        PreprocessingCategorical.CONVERT_INTEGER,
                    ],
                },
                "target_preprocessing": [],
            }
        )
        _fitted_train, _ = fitted.run(
            train_data={
                "X": train_frame.loc[:, feature_names],
                "y": train_frame.loc[:, "y"],
            }
        )

        restored = PreprocessingStep()
        restored.from_json(fitted.to_json())
        # Drop the original so only the restored copy can be used below.
        del fitted

        test_frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "y": [np.nan, np.nan, 1, 1],
            }
        )

        transformed = restored.transform(
            validation_data={
                "X": test_frame.loc[:, feature_names],
                "y": test_frame.loc[:, "y"],
            }
        )
        X_test = transformed.get("X")
        y_test = transformed.get("y")

        # Two of the four validation targets are NaN; the two left are both 1.
        self.assertEqual(2, y_test.shape[0])
        self.assertEqual(2, np.sum(y_test))
        self.assertEqual(1, X_test["col1"][0])
        self.assertEqual(0, X_test["col2"][0])
    def test_constructor_preprocessing_step(self):
        """A step built from empty parameters has empty internal containers.

        Checks that ``_missing_values`` and ``_categorical`` are empty and
        that ``_categorical_y`` is ``None``.
        """
        step = PreprocessingStep({})

        self.assertEqual(0, len(step._missing_values))
        self.assertEqual(0, len(step._categorical))
        self.assertIsNone(step._categorical_y)
    def test_empty_column(self):
        """A column marked ``remove_column`` is dropped by run and transform.

        The same check is repeated for ``run``, for ``transform`` on the
        fitted step, and for ``transform`` on a step restored from JSON.
        """
        feature_names = ["col1", "col2", "col3", "col4"]
        kept_names = ["col2", "col3", "col4"]

        # training data
        frame = pd.DataFrame(
            data={
                "col1": [np.nan, np.nan, np.nan, np.nan],
                "col2": [5, 6, 7, 0],
                "col3": [1, 1, 1, 3],
                "col4": [2, 2, 4, 3],
                "y": [0, 1, 0, 1],
            }
        )
        # The very same feature/target objects are fed to every call below.
        features = frame.loc[:, feature_names]
        target = frame.loc[:, "y"]

        def assert_col1_dropped(result):
            self.assertNotIn("col1", result.columns)
            self.assertEqual(3, len(result.columns))

        def assert_other_columns_kept(result):
            for name in kept_names:
                self.assertIn(name, result.columns)

        step = PreprocessingStep(
            {"columns_preprocessing": {"col1": ["remove_column"]}}
        )

        # 1) fitting run
        fitted_output, _ = step.run(train_data={"X": features, "y": target})
        assert_col1_dropped(fitted_output.get("X"))

        # 2) transform with the fitted step
        transformed = step.transform(
            validation_data={"X": features, "y": target}
        )
        from_fitted = transformed.get("X")
        assert_col1_dropped(from_fitted)
        assert_other_columns_kept(from_fitted)

        # 3) transform with a step rebuilt from the exported JSON
        exported = step.to_json()
        restored = PreprocessingStep()
        restored.from_json(exported)

        transformed_again = restored.transform(
            validation_data={"X": features, "y": target}
        )
        from_restored = transformed_again.get("X")
        assert_col1_dropped(from_restored)
        assert_other_columns_kept(from_restored)
    def test_run_exclude_missing_targets(self):
        """``run`` with default parameters drops the row whose target is NaN."""
        feature_names = ["col1", "col2", "col3", "col4"]

        # training data
        frame = pd.DataFrame(
            data={
                "col1": [1, 1, 1, 3],
                "col2": [5, 6, 7, 0],
                "col3": [1, 1, 1, 3],
                "col4": [2, 2, 4, 3],
                "y": [0, np.nan, 0, 1],
            }
        )

        step = PreprocessingStep()
        processed, _ = step.run(
            train_data={
                "X": frame.loc[:, feature_names],
                "y": frame.loc[:, "y"],
            }
        )

        # One of the four rows has a missing target, so three are expected.
        for part in (processed.get("X"), processed.get("y")):
            self.assertEqual(3, part.shape[0])
    def test_exclude_missing_targets(self):
        """``_exclude_missing_targets`` drops the row whose target is NaN."""
        feature_names = ["col1", "col2", "col3", "col4"]

        # training data
        frame = pd.DataFrame(
            data={
                "col1": [1, 1, 1, 3],
                "col2": [5, 6, 7, 0],
                "col3": [1, 1, 1, 3],
                "col4": [2, 2, 4, 3],
                "y": [0, np.nan, 0, 1],
            }
        )
        features = frame.loc[:, feature_names]
        target = frame.loc[:, "y"]

        step = PreprocessingStep()
        kept_features, kept_target = step._exclude_missing_targets(
            features, target
        )

        # One of the four rows has a missing target, so three are expected.
        self.assertEqual(3, kept_features.shape[0])
        self.assertEqual(3, kept_target.shape[0])
    def from_json(self, json_desc):
        """Restore this object's state from a JSON-like description.

        Reads ``uid``, ``framework_file`` and ``framework_file_path`` from
        ``json_desc`` (each falls back to the current attribute value when the
        key is absent), extracts the framework archive into ``storage_path``
        (a name defined outside this method), and rebuilds ``self.learners``
        and ``self.preprocessings`` from the description.

        Args:
            json_desc: Mapping with the keys above plus ``learners`` (an
                iterable of learner descriptions, required) and
                ``preprocessing`` (an iterable of preprocessing descriptions,
                optional; defaults to empty).
        """
        self.uid = json_desc.get("uid", self.uid)
        self.framework_file = json_desc.get("framework_file", self.framework_file)
        self.framework_file_path = json_desc.get(
            "framework_file_path", self.framework_file_path
        )

        # Open the resolved attribute rather than the raw key: when the
        # description omits "framework_file_path" the raw lookup yields None,
        # while the attribute keeps the previously known path.
        with zipfile.ZipFile(self.framework_file_path, "r") as zip_ref:
            zip_ref.extractall(storage_path)

        self.learners = []
        for learner_desc in json_desc.get("learners"):
            self.learners.append(LearnerFactory.load(learner_desc))

        # Reset before loading, exactly as done for the learners, so calling
        # from_json on an already populated object does not accumulate
        # duplicate preprocessing steps.
        self.preprocessings = []
        for step_desc in json_desc.get("preprocessing", []):
            preproc = PreprocessingStep()
            preproc.from_json(step_desc)
            self.preprocessings.append(preproc)
    def test_run_all_good(self):
        """Clean numeric data yields an empty parameter export.

        The data has no missing values and no string columns; the test
        expects all columns to survive ``run`` and ``to_json`` to be falsy.
        """
        feature_names = ["col1", "col2", "col3", "col4"]

        # training data
        frame = pd.DataFrame(
            data={
                "col1": [1, 1, 1, 3],
                "col2": [5, 6, 7, 0],
                "col3": [1, 1, 1, 3],
                "col4": [2, 2, 4, 3],
                "y": [0, 1, 0, 1],
            }
        )

        # Every feature column is configured with the same two steps; each
        # column gets its own list object.
        step = PreprocessingStep(
            {
                "columns_preprocessing": {
                    name: [
                        PreprocessingMissingValues.FILL_NA_MEDIAN,
                        PreprocessingCategorical.CONVERT_INTEGER,
                    ]
                    for name in feature_names
                }
            }
        )

        processed, _ = step.run(
            train_data={
                "X": frame.loc[:, feature_names],
                "y": frame.loc[:, "y"],
            }
        )
        X_train = processed.get("X")

        for name in feature_names:
            self.assertIn(name, X_train.columns)

        exported = step.to_json()
        self.assertFalse(exported)  # should be empty
    def train(self, data):
        """Train one learner per validation split and record the elapsed time.

        Seeds NumPy's global RNG from ``self.learner_params["seed"]``, removes
        rows without a target via
        ``PreprocessingExcludeMissingValues.remove_rows_without_target``, and
        builds ``self.validation``. For every split a new ``PreprocessingStep``
        and a new learner are appended to ``self.preprocessings`` and
        ``self.learners``; the learner is fitted for up to
        ``learner.max_iters`` iterations with callbacks fired around each
        iteration. The elapsed time is stored in ``self.train_time``.

        Args:
            data: Training data passed to ``remove_rows_without_target`` and
                then to ``ValidationStep``.
        """
        started_at = time.time()
        log.debug("IterativeLearner.train")
        np.random.seed(self.learner_params["seed"])
        data = PreprocessingExcludeMissingValues.remove_rows_without_target(data)
        self.validation = ValidationStep(self.validation_params, data)

        for train_data, validation_data in self.validation.split():
            # Preprocessing is fitted separately for every validation split.
            preprocessing = PreprocessingStep(self.preprocessing_params)
            self.preprocessings.append(preprocessing)
            train_data, _ = preprocessing.run(train_data)
            validation_data = preprocessing.transform(validation_data)

            learner = LearnerFactory.get_learner(self.learner_params)
            self.learners.append(learner)

            self.callbacks.add_and_set_learner(learner)
            self.callbacks.on_learner_train_start()

            for iteration in range(learner.max_iters):
                self.callbacks.on_iteration_start()
                learner.fit(train_data.get("X"), train_data.get("y"))
                # NOTE: target postprocessing is intended to happen here.
                current_predictions = self.predictions(
                    learner, train_data, validation_data
                )
                self.callbacks.on_iteration_end(
                    {"iter_cnt": iteration}, current_predictions
                )
                if learner.stop_training:
                    break
                learner.update({"step": iteration})

            # All iterations for this split's learner are done.
            self.callbacks.on_learner_train_end()

        # All validation splits are done.
        self.callbacks.on_framework_train_end()
        self.train_time = time.time() - started_at
    def test_run_fill_median_convert_integer_validation_dataset(self):
        """Run preprocessing on training and validation data together.

        Expects all feature columns to be present in both outputs, all four
        training rows to be kept, and the two validation rows with a NaN
        target to be removed.
        """
        feature_names = ["col1", "col2", "col3", "col4"]

        # training data
        train_frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "y": [0, 1, 1, 1],
            }
        )
        # validation data: same features, two missing targets
        test_frame = pd.DataFrame(
            data={
                "col1": [1, 1, np.nan, 3],
                "col2": ["a", "a", np.nan, "a"],
                "col3": [1, 1, 1, 3],
                "col4": ["a", "a", "b", "c"],
                "y": [np.nan, 1, np.nan, 1],
            }
        )

        # Every feature column is configured with the same two steps; each
        # column gets its own list object.
        step = PreprocessingStep(
            {
                "columns_preprocessing": {
                    name: [
                        PreprocessingMissingValues.FILL_NA_MEDIAN,
                        PreprocessingCategorical.CONVERT_INTEGER,
                    ]
                    for name in feature_names
                }
            }
        )

        processed_train, processed_valid = step.run(
            train_data={
                "X": train_frame.loc[:, feature_names],
                "y": train_frame.loc[:, "y"],
            },
            validation_data={
                "X": test_frame.loc[:, feature_names],
                "y": test_frame.loc[:, "y"],
            },
        )
        X_train = processed_train.get("X")
        y_train = processed_train.get("y")
        X_test = processed_valid.get("X")
        y_test = processed_valid.get("y")

        for name in feature_names:
            self.assertIn(name, X_train.columns)
            self.assertIn(name, X_test.columns)

        # Row counts: training unchanged, validation reduced from 4 to 2.
        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(2, X_test.shape[0])
        self.assertEqual(2, y_test.shape[0])