def test_split(self):
    """Check KFold fold sizes and that a fixed random seed is reproducible.

    Two independently constructed ``KFold`` objects are configured with the
    same ``n_splits`` and ``random_seed``. For every fold of the first
    object, the train/test sizes must lie within 10% of the expected sizes.
    The first key of each training fold is then compared against the
    corresponding fold of the second object.

    Relies on ``self.table`` and ``self.data_num`` being prepared by the
    test fixture (presumably ``setUp`` -- not visible here).
    """
    n_splits = 10
    random_seed = 32

    kfold_obj = KFold()
    kfold_obj.n_splits = n_splits
    kfold_obj.random_seed = random_seed

    expect_test_data_num = self.data_num / n_splits
    expect_train_data_num = self.data_num - expect_test_data_num

    key_list = []
    for train_data, test_data in kfold_obj.split(self.table):
        train_num = train_data.count()
        test_num = test_data.count()
        self.assertTrue(
            0.9 * expect_train_data_num < train_num < 1.1 * expect_train_data_num)
        self.assertTrue(
            0.9 * expect_test_data_num < test_num < 1.1 * expect_test_data_num)
        key_list.append(train_data.first()[0])

    # Test that the random seed works: a *second* object with the same seed
    # must reproduce the same folds. The split has to be taken from
    # kfold_obj2, otherwise the freshly built object is never exercised.
    kfold_obj2 = KFold()
    kfold_obj2.n_splits = n_splits
    kfold_obj2.random_seed = random_seed

    second_key_list = [
        train_data.first()[0]
        for train_data, _ in kfold_obj2.split(self.table)
    ]

    # Comparing whole lists also catches a differing number of folds.
    self.assertEqual(key_list, second_key_list)
def homo_cross_validation(self, data_instance):
    """Run k-fold cross validation and collect one evaluation per fold.

    The number of folds comes from ``self.workflow_param.n_splits``. For
    each (train, test) pair produced by ``KFold.split`` the training data
    is passed through ``self.sample``, the model is fitted and used to
    predict on the test data, the prediction is evaluated, and the model is
    re-initialized from ``self.config_path`` before the next fold.

    :param data_instance: data set handed to ``KFold.split``.
    :return: list with the result of ``self.evaluate`` for every fold.
    """
    splitter = KFold(n_splits=self.workflow_param.n_splits)
    folds = splitter.split(data_instance)
    cv_result = []

    LOGGER.info("Doing H**o cross validation")
    for fold_idx, (train_part, test_part) in enumerate(folds):
        LOGGER.info("This is the {}th fold".format(fold_idx))

        LOGGER.info("Start sample before train")
        train_part = self.sample(train_part, "sample_" + str(fold_idx))
        LOGGER.info("End sample before_train")

        self.model.set_flowid(fold_idx)
        self.model.fit(train_part)

        prediction = self.model.predict(
            test_part, self.workflow_param.predict_param)
        cv_result.append(self.evaluate(prediction))

        # Start the next fold from a freshly initialized model.
        self._initialize_model(self.config_path)

    return cv_result
def hetero_cross_validation(self, data_instance):
    """Run k-fold cross validation for the role held by this party.

    The number of folds is ``self.workflow_param.n_splits`` and the fold
    index is used as the flow id in every role.

    * Guest: splits ``data_instance`` with ``KFold``, passes each train and
      test fold to ``self._synchronous_data``, fits, predicts and evaluates.
    * Host: takes the train and test data of each fold from the return
      value of ``self._synchronous_data``, then fits and predicts.
    * Arbiter: only calls ``self.model.fit()`` once per fold.

    After every fold the model is re-initialized from ``self.config_path``.

    :param data_instance: local data set of this party.
    :return: list of per-fold evaluation results for the guest; ``None``
        for any other role.
    """
    n_splits = self.workflow_param.n_splits

    if self.role == consts.GUEST:
        LOGGER.info("In hetero cross_validation Guest")
        k_fold_obj = KFold(n_splits=n_splits)
        kfold_data_generator = k_fold_obj.split(data_instance)
        cv_results = []
        for flowid, (train_data, test_data) in enumerate(kfold_data_generator):
            LOGGER.info("flowid:{}".format(flowid))
            self._synchronous_data(train_data, flowid, consts.TRAIN_DATA)
            LOGGER.info("synchronous train data")
            self._synchronous_data(test_data, flowid, consts.TEST_DATA)
            LOGGER.info("synchronous test data")

            self.model.set_flowid(flowid)
            self.model.fit(train_data)
            pred_res = self.model.predict(
                test_data, self.workflow_param.predict_param)
            evaluation_results = self.evaluate(pred_res)
            cv_results.append(evaluation_results)

            # The per-fold summary line counts folds starting from 1.
            LOGGER.info("cv" + str(flowid + 1) + " evaluation:"
                        + str(evaluation_results))
            self._initialize_model(self.config_path)

        LOGGER.info("total cv evaluation:{}".format(cv_results))
        return cv_results

    elif self.role == consts.HOST:
        LOGGER.info("In hetero cross_validation Host")
        # range() drives the fold index; no manual increment is needed
        # because the for statement rebinds flowid on every iteration.
        for flowid in range(n_splits):
            LOGGER.info("flowid:{}".format(flowid))
            train_data = self._synchronous_data(
                data_instance, flowid, consts.TRAIN_DATA)
            LOGGER.info("synchronous train data")
            test_data = self._synchronous_data(
                data_instance, flowid, consts.TEST_DATA)
            LOGGER.info("synchronous test data")

            self.model.set_flowid(flowid)
            self.model.fit(train_data)
            self.model.predict(test_data)
            self._initialize_model(self.config_path)

    elif self.role == consts.ARBITER:
        LOGGER.info("In hetero cross_validation Arbiter")
        for flowid in range(n_splits):
            LOGGER.info("flowid:{}".format(flowid))
            self.model.set_flowid(flowid)
            self.model.fit()
            self._initialize_model(self.config_path)
def test_split(self):
    """Check that every KFold fold has roughly the expected size.

    With 10 splits, each test fold should hold about a tenth of the data
    and each train fold the remainder; both counts must fall strictly
    within 10% of those expectations. Intermediate numbers are printed.

    Relies on ``self.table`` and ``self.data_num`` being prepared by the
    test fixture (presumably ``setUp`` -- not visible here).
    """
    n_splits = 10
    kfold_obj = KFold(n_splits)
    print(self.table, self.table.count())
    folds = kfold_obj.split(self.table)

    expect_test_data_num = self.data_num / 10
    expect_train_data_num = self.data_num - expect_test_data_num
    print("expect_train_data_num: {}, expect_test_data_num: {}".format(
        expect_train_data_num, expect_test_data_num))

    # Tolerance bounds are the same for every fold, so compute them once.
    train_low = 0.9 * expect_train_data_num
    train_high = 1.1 * expect_train_data_num
    test_low = 0.9 * expect_test_data_num
    test_high = 1.1 * expect_test_data_num

    for train_fold, test_fold in folds:
        train_num = train_fold.count()
        test_num = test_fold.count()
        print("train_num: {}, test_num: {}".format(train_num, test_num))
        self.assertTrue(train_low < train_num < train_high)
        self.assertTrue(test_low < test_num < test_high)
def hetero_cross_validation(self, data_instance):
    """Run k-fold cross validation, including the preprocessing pipeline.

    ``data_instance`` is first passed through ``self.intersect``. The number
    of folds is ``self.workflow_param.n_splits`` and the fold index is used
    as the flow id in every role.

    * Guest: splits the intersected data with ``KFold``, passes each fold
      to ``self._synchronous_data``, then samples, fits feature selection,
      scales, trains, transforms and scales the test data, predicts and
      evaluates.
    * Host: takes the train and test data of each fold from the return
      value of ``self._synchronous_data``, then samples, fits feature
      selection, trains, transforms the test data and predicts. It does
      not scale.
    * Arbiter: only calls ``self.model.fit()`` once per fold.

    After every fold the model is re-initialized from ``self.config_path``.

    :param data_instance: local data set of this party.
    :return: list of per-fold evaluation results for the guest; ``None``
        for any other role.
    """
    LOGGER.debug("Enter train function")
    LOGGER.debug("Start intersection before train")
    intersect_flowid = "cross_validation_0"
    data_instance = self.intersect(data_instance, intersect_flowid)
    LOGGER.debug("End intersection before train")

    n_splits = self.workflow_param.n_splits

    if self.role == consts.GUEST:
        LOGGER.info("In hetero cross_validation Guest")
        k_fold_obj = KFold(n_splits=n_splits)
        kfold_data_generator = k_fold_obj.split(data_instance)
        cv_results = []
        for flowid, (train_data, test_data) in enumerate(kfold_data_generator):
            self._init_pipeline()
            LOGGER.info("flowid:{}".format(flowid))
            self._synchronous_data(train_data, flowid, consts.TRAIN_DATA)
            LOGGER.info("synchronous train data")
            self._synchronous_data(test_data, flowid, consts.TEST_DATA)
            LOGGER.info("synchronous test data")

            LOGGER.info("Start sample before train")
            sample_flowid = "sample_" + str(flowid)
            train_data = self.sample(train_data, sample_flowid)
            LOGGER.info("End sample before_train")

            feature_selection_flowid = "feature_selection_fit_" + str(flowid)
            train_data = self.feature_selection_fit(
                train_data, feature_selection_flowid)
            LOGGER.info("End feature selection fit_transform")

            train_data, cols_scale_value = self.scale(train_data)

            self.model.set_flowid(flowid)
            self.model.fit(train_data)

            feature_selection_flowid = (
                "feature_selection_transform_" + str(flowid))
            test_data = self.feature_selection_transform(
                test_data, feature_selection_flowid)
            LOGGER.info("End feature selection transform")

            # Pass the value returned when scaling the training fold back
            # in when scaling the test fold.
            test_data, cols_scale_value = self.scale(
                test_data, cols_scale_value)

            pred_res = self.model.predict(
                test_data, self.workflow_param.predict_param)
            evaluation_results = self.evaluate(pred_res)
            cv_results.append(evaluation_results)

            # The per-fold summary line counts folds starting from 1.
            LOGGER.info("cv" + str(flowid + 1) + " evaluation:"
                        + str(evaluation_results))
            self._initialize_model(self.config_path)

        LOGGER.info("total cv evaluation:{}".format(cv_results))
        return cv_results

    elif self.role == consts.HOST:
        LOGGER.info("In hetero cross_validation Host")
        # range() drives the fold index; no manual increment is needed
        # because the for statement rebinds flowid on every iteration.
        for flowid in range(n_splits):
            self._init_pipeline()
            LOGGER.info("flowid:{}".format(flowid))
            train_data = self._synchronous_data(
                data_instance, flowid, consts.TRAIN_DATA)
            LOGGER.info("synchronous train data")
            test_data = self._synchronous_data(
                data_instance, flowid, consts.TEST_DATA)
            LOGGER.info("synchronous test data")

            LOGGER.info("Start sample before train")
            sample_flowid = "sample_" + str(flowid)
            train_data = self.sample(train_data, sample_flowid)
            LOGGER.info("End sample before_train")

            feature_selection_flowid = "feature_selection_fit_" + str(flowid)
            train_data = self.feature_selection_fit(
                train_data, feature_selection_flowid)
            LOGGER.info("End feature selection fit_transform")

            self.model.set_flowid(flowid)
            self.model.fit(train_data)

            feature_selection_flowid = (
                "feature_selection_transform_" + str(flowid))
            test_data = self.feature_selection_transform(
                test_data, feature_selection_flowid)
            LOGGER.info("End feature selection transform")

            self.model.predict(test_data)
            self._initialize_model(self.config_path)

    elif self.role == consts.ARBITER:
        LOGGER.info("In hetero cross_validation Arbiter")
        for flowid in range(n_splits):
            LOGGER.info("flowid:{}".format(flowid))
            self.model.set_flowid(flowid)
            self.model.fit()
            self._initialize_model(self.config_path)