Esempio n. 1
0
    def test_split(self):
        """Check KFold fold sizes and that a fixed random seed is reproducible.

        Two things are verified:

        1. Every (train, test) pair yielded by ``KFold.split`` has counts
           within 10% of the sizes expected for ``n_splits`` folds over
           ``self.data_num`` records.
        2. A second, independently created ``KFold`` configured with the same
           ``n_splits`` and ``random_seed`` yields train sets whose first key
           matches the first run, fold by fold.
        """
        n_splits = 10
        random_seed = 32

        kfold_obj = KFold()
        kfold_obj.n_splits = n_splits
        kfold_obj.random_seed = random_seed

        data_generator = kfold_obj.split(self.table)
        # Derive the expectations from n_splits so they cannot drift apart
        # from the splitter configuration.
        expect_test_data_num = self.data_num / n_splits
        expect_train_data_num = self.data_num - expect_test_data_num

        key_list = []
        for train_data, test_data in data_generator:
            train_num = train_data.count()
            test_num = test_data.count()
            self.assertTrue(0.9 * expect_train_data_num < train_num < 1.1 *
                            expect_train_data_num)
            self.assertTrue(0.9 * expect_test_data_num < test_num < 1.1 *
                            expect_test_data_num)
            # Remember the first key of each train fold as a cheap fingerprint
            # of the split, used by the reproducibility check below.
            key_list.append(train_data.first()[0])

        # Test random seed work: a fresh splitter with the same seed must
        # reproduce the same folds.
        kfold_obj2 = KFold()
        kfold_obj2.n_splits = n_splits
        kfold_obj2.random_seed = random_seed

        # Split with the *second* object; reusing the first one would leave
        # kfold_obj2 untested.
        second_key_list = [
            train_data.first()[0]
            for train_data, _ in kfold_obj2.split(self.table)
        ]
        # Comparing whole lists also catches a differing number of folds.
        self.assertEqual(key_list, second_key_list)
Esempio n. 2
0
    def homo_cross_validation(self, data_instance):
        """Run k-fold cross validation and return the per-fold evaluations.

        The number of folds is read from ``self.workflow_param.n_splits``.
        For each fold the train split is passed through ``self.sample``, the
        model is fitted on it, predictions on the test split are evaluated,
        and the model is re-initialised from ``self.config_path`` before the
        next fold.

        :param data_instance: data set handed to ``KFold.split``.
        :return: list with one ``self.evaluate`` result per fold, in order.
        """
        splitter = KFold(n_splits=self.workflow_param.n_splits)
        folds = splitter.split(data_instance)
        fold_results = []
        LOGGER.info("Doing H**o cross validation")

        for fold_idx, (train_part, test_part) in enumerate(folds):
            LOGGER.info("This is the {}th fold".format(fold_idx))

            LOGGER.info("Start sample before train")
            train_part = self.sample(train_part, "sample_" + str(fold_idx))
            LOGGER.info("End sample before_train")

            # The fold index doubles as the model's flow id.
            self.model.set_flowid(fold_idx)
            self.model.fit(train_part)

            predictions = self.model.predict(
                test_part, self.workflow_param.predict_param)
            fold_results.append(self.evaluate(predictions))

            # Start the next fold from a freshly initialised model.
            self._initialize_model(self.config_path)

        return fold_results
Esempio n. 3
0
    def hetero_cross_validation(self, data_instance):
        """Run k-fold cross validation for the role held in ``self.role``.

        The number of folds comes from ``self.workflow_param.n_splits`` and
        the fold index is used as the model's flow id in every role.

        * GUEST: splits ``data_instance`` with ``KFold``, passes each train
          and test split to ``self._synchronous_data``, fits, predicts and
          evaluates, re-initialising the model after every fold.
        * HOST: obtains the train and test data of each fold from
          ``self._synchronous_data``, then fits and predicts.
        * ARBITER: calls ``self.model.fit()`` once per fold.

        :param data_instance: local data set of this party.
        :return: for GUEST, a list with one evaluation result per fold; for
            every other role the method falls through and returns ``None``.
        """
        n_splits = self.workflow_param.n_splits

        if self.role == consts.GUEST:
            LOGGER.info("In hetero cross_validation Guest")
            k_fold_obj = KFold(n_splits=n_splits)
            kfold_data_generator = k_fold_obj.split(data_instance)
            flowid = 0
            cv_results = []
            for train_data, test_data in kfold_data_generator:
                LOGGER.info("flowid:{}".format(flowid))
                self._synchronous_data(train_data, flowid, consts.TRAIN_DATA)
                LOGGER.info("synchronous train data")
                self._synchronous_data(test_data, flowid, consts.TEST_DATA)
                LOGGER.info("synchronous test data")

                self.model.set_flowid(flowid)
                self.model.fit(train_data)
                pred_res = self.model.predict(
                    test_data, self.workflow_param.predict_param)
                evaluation_results = self.evaluate(pred_res)
                cv_results.append(evaluation_results)
                # Incremented before logging, so the "cv<N>" label below is
                # 1-based while the flow id used above is 0-based.
                flowid += 1
                LOGGER.info("cv" + str(flowid) + " evaluation:" +
                            str(evaluation_results))
                # Start the next fold from a freshly initialised model.
                self._initialize_model(self.config_path)

            LOGGER.info("total cv evaluation:{}".format(cv_results))
            return cv_results

        elif self.role == consts.HOST:
            LOGGER.info("In hetero cross_validation Host")
            # range() drives the flow id; no manual increment is needed.
            for flowid in range(n_splits):
                LOGGER.info("flowid:{}".format(flowid))
                train_data = self._synchronous_data(data_instance, flowid,
                                                    consts.TRAIN_DATA)
                LOGGER.info("synchronous train data")
                test_data = self._synchronous_data(data_instance, flowid,
                                                   consts.TEST_DATA)
                LOGGER.info("synchronous test data")

                self.model.set_flowid(flowid)
                self.model.fit(train_data)
                self.model.predict(test_data)
                self._initialize_model(self.config_path)

        elif self.role == consts.ARBITER:
            LOGGER.info("In hetero cross_validation Arbiter")
            # range() drives the flow id; no manual increment is needed.
            for flowid in range(n_splits):
                LOGGER.info("flowid:{}".format(flowid))
                self.model.set_flowid(flowid)
                self.model.fit()
                self._initialize_model(self.config_path)
Esempio n. 4
0
    def test_split(self):
        """Check that every KFold fold has roughly the expected size.

        With ``n_splits`` folds over ``self.data_num`` records, each test
        split should hold about ``data_num / n_splits`` records and each
        train split the remainder; both counts must fall within 10% of the
        expected value.
        """
        n_splits = 10
        kfold_obj = KFold(n_splits)

        print(self.table, self.table.count())
        data_generator = kfold_obj.split(self.table)
        # Use n_splits rather than a repeated literal so the expectation
        # follows the splitter configuration.
        expect_test_data_num = self.data_num / n_splits
        expect_train_data_num = self.data_num - expect_test_data_num
        print("expect_train_data_num: {}, expect_test_data_num: {}".format(
            expect_train_data_num, expect_test_data_num))
        for train_data, test_data in data_generator:
            train_num = train_data.count()
            test_num = test_data.count()
            print("train_num: {}, test_num: {}".format(train_num, test_num))
            self.assertTrue(0.9 * expect_train_data_num < train_num < 1.1 *
                            expect_train_data_num)
            self.assertTrue(0.9 * expect_test_data_num < test_num < 1.1 *
                            expect_test_data_num)
Esempio n. 5
0
    def hetero_cross_validation(self, data_instance):
        """Run k-fold cross validation, with preprocessing, for ``self.role``.

        ``data_instance`` is first passed through ``self.intersect``. The
        number of folds comes from ``self.workflow_param.n_splits`` and the
        fold index is used as the model's flow id in every role.

        * GUEST: splits the intersected data with ``KFold``; for each fold it
          passes both splits to ``self._synchronous_data``, samples the train
          split, fits feature selection and scaling on it, fits the model,
          applies the feature-selection transform and the train-derived scale
          values to the test split, then predicts and evaluates.
        * HOST: obtains the train and test data of each fold from
          ``self._synchronous_data``, samples, runs feature selection, fits
          and predicts (no scaling or evaluation in this branch).
        * ARBITER: calls ``self.model.fit()`` once per fold.

        :param data_instance: local data set of this party.
        :return: for GUEST, a list with one evaluation result per fold; for
            every other role the method falls through and returns ``None``.
        """
        LOGGER.debug("Enter train function")
        LOGGER.debug("Start intersection before train")
        intersect_flowid = "cross_validation_0"
        data_instance = self.intersect(data_instance, intersect_flowid)
        LOGGER.debug("End intersection before train")

        n_splits = self.workflow_param.n_splits
        if self.role == consts.GUEST:
            LOGGER.info("In hetero cross_validation Guest")
            k_fold_obj = KFold(n_splits=n_splits)
            kfold_data_generator = k_fold_obj.split(data_instance)
            flowid = 0
            cv_results = []
            for train_data, test_data in kfold_data_generator:
                self._init_pipeline()
                LOGGER.info("flowid:{}".format(flowid))
                self._synchronous_data(train_data, flowid, consts.TRAIN_DATA)
                LOGGER.info("synchronous train data")
                self._synchronous_data(test_data, flowid, consts.TEST_DATA)
                LOGGER.info("synchronous test data")

                LOGGER.info("Start sample before train")
                sample_flowid = "sample_" + str(flowid)
                train_data = self.sample(train_data, sample_flowid)
                LOGGER.info("End sample before_train")

                feature_selection_flowid = "feature_selection_fit_" + str(
                    flowid)
                train_data = self.feature_selection_fit(
                    train_data, feature_selection_flowid)
                LOGGER.info("End feature selection fit_transform")

                # Keep the scale values obtained from the train split so the
                # same ones can be passed in when scaling the test split.
                train_data, cols_scale_value = self.scale(train_data)

                self.model.set_flowid(flowid)
                self.model.fit(train_data)

                feature_selection_flowid = "feature_selection_transform_" + str(
                    flowid)
                test_data = self.feature_selection_transform(
                    test_data, feature_selection_flowid)
                LOGGER.info("End feature selection transform")

                test_data, cols_scale_value = self.scale(
                    test_data, cols_scale_value)
                pred_res = self.model.predict(
                    test_data, self.workflow_param.predict_param)
                evaluation_results = self.evaluate(pred_res)
                cv_results.append(evaluation_results)
                # Incremented before logging, so the "cv<N>" label below is
                # 1-based while the flow id used above is 0-based.
                flowid += 1
                LOGGER.info("cv" + str(flowid) + " evaluation:" +
                            str(evaluation_results))
                # Start the next fold from a freshly initialised model.
                self._initialize_model(self.config_path)

            LOGGER.info("total cv evaluation:{}".format(cv_results))
            return cv_results

        elif self.role == consts.HOST:
            LOGGER.info("In hetero cross_validation Host")
            # range() drives the flow id; no manual increment is needed.
            for flowid in range(n_splits):
                self._init_pipeline()
                LOGGER.info("flowid:{}".format(flowid))
                train_data = self._synchronous_data(data_instance, flowid,
                                                    consts.TRAIN_DATA)
                LOGGER.info("synchronous train data")
                test_data = self._synchronous_data(data_instance, flowid,
                                                   consts.TEST_DATA)
                LOGGER.info("synchronous test data")

                LOGGER.info("Start sample before train")
                sample_flowid = "sample_" + str(flowid)
                train_data = self.sample(train_data, sample_flowid)
                LOGGER.info("End sample before_train")

                feature_selection_flowid = "feature_selection_fit_" + str(
                    flowid)
                train_data = self.feature_selection_fit(
                    train_data, feature_selection_flowid)
                LOGGER.info("End feature selection fit_transform")

                self.model.set_flowid(flowid)
                self.model.fit(train_data)

                feature_selection_flowid = "feature_selection_transform_" + str(
                    flowid)
                test_data = self.feature_selection_transform(
                    test_data, feature_selection_flowid)
                LOGGER.info("End feature selection transform")

                self.model.predict(test_data)
                self._initialize_model(self.config_path)

        elif self.role == consts.ARBITER:
            LOGGER.info("In hetero cross_validation Arbiter")
            # range() drives the flow id; no manual increment is needed.
            for flowid in range(n_splits):
                LOGGER.info("flowid:{}".format(flowid))
                self.model.set_flowid(flowid)
                self.model.fit()
                self._initialize_model(self.config_path)