def __synchronize_encryption(self, mode='train'):
    """
    Communicate with hosts. Specify whether to use encryption or not
    and transfer the public keys.
    """
    # 2. Send pubkey to the guest and the hosts that use encryption
    encrypter = PaillierEncrypt()
    encrypter.generate_key(self.key_length)

    pub_key = encrypter.get_public_key()
    # LOGGER.debug("Start to remote pub_key: {}, transfer_id: {}".format(pub_key, pubkey_id))
    self.transfer_variable.paillier_pubkey.remote(obj=pub_key,
                                                  role=consts.GUEST,
                                                  idx=0,
                                                  suffix=(mode,))
    LOGGER.info("send pubkey to guest")

    pri_key = encrypter.get_privacy_key()
    self.transfer_variable.paillier_prikey.remote(obj=pri_key,
                                                  role=consts.GUEST,
                                                  idx=0,
                                                  suffix=(mode,))
    # LOGGER.debug("Start to remote pri_key: {}, transfer_id: {}".format(pri_key, prikey_id))
    LOGGER.info("send prikey to guest")

    self.transfer_variable.paillier_pubkey.remote(obj=pub_key,
                                                  role=consts.HOST,
                                                  idx=-1,
                                                  suffix=(mode,))
    LOGGER.info("send pubkey to host")

    self.transfer_variable.paillier_prikey.remote(obj=pri_key,
                                                  role=consts.HOST,
                                                  idx=-1,
                                                  suffix=(mode,))
    LOGGER.info("send prikey to host")
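As a sanity check on the key plumbing above, the snippet below exercises the same PaillierEncrypt calls locally; it is a minimal sketch, assuming the import path used by the other snippets in this file, and stands in for what a receiving party could do with the shared key pair.

from federatedml.secureprotol import PaillierEncrypt

encrypter = PaillierEncrypt()
encrypter.generate_key(1024)
pub_key = encrypter.get_public_key()   # what remote(...) ships to each party
pri_key = encrypter.get_privacy_key()

# Round trip: encrypt with the generated key pair, then decrypt.
cipher = encrypter.encrypt(42)
assert encrypter.decrypt(cipher) == 42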
def test_tensor_op(self):
    arr1 = np.ones((10, 1, 3))
    arr1[0] = np.array([[2, 3, 4]])
    arr2 = np.ones((10, 3, 3))
    arr3 = np.ones([1, 1, 3])
    arr4 = np.ones([50, 1])
    arr5 = np.ones([32])

    pt = PaillierTensor(arr1)
    pt2 = PaillierTensor(arr2)
    pt3 = PaillierTensor(arr3)
    pt4 = PaillierTensor(arr4)
    pt5 = PaillierTensor(arr5)

    encrypter = PaillierEncrypt()
    encrypter.generate_key(EncryptParam().key_length)
    encrypted_calculator = EncryptModeCalculator(
        encrypter,
        EncryptedModeCalculatorParam().mode,
        EncryptedModeCalculatorParam().re_encrypted_rate)

    rs1 = pt * arr2
    rs2 = pt * pt2
    rs3 = pt.matmul_3d(pt2)

    enpt = pt2.encrypt(encrypted_calculator)
    enrs = enpt.matmul_3d(arr1, multiply='right')

    rng_generator = random_number_generator.RandomNumberGenerator()
    enpt2 = pt4.encrypt(encrypted_calculator)
    random_num = rng_generator.generate_random_number(enpt2.shape)
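The 3-D shapes above line up if matmul_3d behaves like numpy's batched matmul; that reading is an assumption based on the name, and the plain-numpy sketch below only checks the expected shapes.

import numpy as np

arr1 = np.ones((10, 1, 3))
arr2 = np.ones((10, 3, 3))
# np.matmul batches over the leading axis: (10, 1, 3) @ (10, 3, 3) -> (10, 1, 3),
# which matches rs3 = pt.matmul_3d(pt2) above; multiply='right' presumably puts
# the passed operand on the left of the product, which is the only valid layout
# for enpt.matmul_3d(arr1, multiply='right') given these shapes.
assert np.matmul(arr1, arr2).shape == (10, 1, 3)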
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.wx = session.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = session.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = session.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
             for i in range(size)],
            partition=1)

        # test fore_gradient
        self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
        # test gradient
        self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125,
                         1.125, 1.125, 1.125, 1.125, 1.125]
        self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
                                       1.125, 1.125, 1.125, 1.125, 1.125]
        self.loss = 4.505647
def test_cipher_add_sub_mul(self):
    encrypter = PaillierEncrypt()
    encrypter.generate_key(1024)

    en_1, en_2, en_3, en_4 = (encrypter.encrypt(1), encrypter.encrypt(2),
                              encrypter.encrypt(3), encrypter.encrypt(4))
    en_5, en_6, en_7, en_8 = (encrypter.encrypt(5), encrypter.encrypt(6),
                              encrypter.encrypt(7), encrypter.encrypt(8))

    a = PackingCipherTensor([en_1, en_2, en_3, en_4])
    b = PackingCipherTensor([en_5, en_6, en_7, en_8])
    c = PackingCipherTensor(encrypter.encrypt(1))
    d = PackingCipherTensor([encrypter.encrypt(5)])

    rs_1 = a + b
    rs_2 = b - a
    rs_3 = c + d
    rs_4 = 123 * c
    rs_5 = d * 456
    rs_6 = a * 114

    print(encrypter.recursive_decrypt(rs_1.ciphers))
    print(encrypter.recursive_decrypt(rs_2.ciphers))
    print(encrypter.recursive_decrypt(rs_3.ciphers))
    print(encrypter.decrypt(rs_4.ciphers))
    print(encrypter.decrypt(rs_5.ciphers))
    print(encrypter.recursive_decrypt(rs_6.ciphers))
    print('cipher test done')
    print('*' * 30)
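If the packed tensors apply these operators elementwise, as the add/sub/mul names suggest, the prints above should decrypt to the values checked below; this plaintext mirror of the same arithmetic is a sketch, not part of the original test.

# Plaintext mirror of the cipher arithmetic above.
a, b = [1, 2, 3, 4], [5, 6, 7, 8]
assert [x + y for x, y in zip(a, b)] == [6, 8, 10, 12]   # rs_1 = a + b
assert [y - x for x, y in zip(a, b)] == [4, 4, 4, 4]     # rs_2 = b - a
assert 1 + 5 == 6                                        # rs_3 = c + d
assert 123 * 1 == 123 and 5 * 456 == 2280                # rs_4, rs_5
assert [114 * x for x in a] == [114, 228, 342, 456]      # rs_6 = a * 114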
def test_data_type(self, mode="strict", re_encrypted_rate=0.2):
    from federatedml.secureprotol import PaillierEncrypt
    from federatedml.secureprotol.encrypt_mode import EncryptModeCalculator

    encrypter = PaillierEncrypt()
    encrypter.generate_key(1024)
    encrypted_calculator = EncryptModeCalculator(encrypter, mode, re_encrypted_rate)

    data_list = dict(encrypted_calculator.encrypt(self.data_list).collect())
    data_tuple = dict(encrypted_calculator.encrypt(self.data_tuple).collect())
    data_numpy = dict(encrypted_calculator.encrypt(self.data_numpy).collect())

    for key, value in data_list.items():
        self.assertTrue(isinstance(value, list))
        self.assertTrue(len(value) == len(self.list_data[key]))

    for key, value in data_tuple.items():
        self.assertTrue(isinstance(value, tuple))
        self.assertTrue(len(value) == len(self.tuple_data[key]))

    for key, value in data_numpy.items():
        self.assertTrue(type(value).__name__ == "ndarray")
        self.assertTrue(value.shape[0] == self.numpy_data[key].shape[0])
def generate_encrypter(self, param):
    LOGGER.info("generate encrypter")
    if param.encrypt_param.method.lower() == consts.PAILLIER.lower():
        encrypter = PaillierEncrypt()
        encrypter.generate_key(param.encrypt_param.key_length)
    else:
        raise NotImplementedError("encrypt method not supported yet!!!")
    return encrypter
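A hedged usage sketch for generate_encrypter: the SimpleNamespace is a hypothetical stand-in shaped like the real param object, and the branch is inlined (with the consts.PAILLIER comparison reduced to its lowercase string) so the snippet runs standalone. The import follows the other snippets in this file.

from types import SimpleNamespace

from federatedml.secureprotol import PaillierEncrypt

# Hypothetical stand-in shaped like the param generate_encrypter expects.
param = SimpleNamespace(
    encrypt_param=SimpleNamespace(method="Paillier", key_length=1024))

# Same branch as above, inlined for a standalone check.
if param.encrypt_param.method.lower() == "paillier":
    encrypter = PaillierEncrypt()
    encrypter.generate_key(param.encrypt_param.key_length)
else:
    raise NotImplementedError("encrypt method not supported yet!!!")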
def EINI_guest_predict(data_inst, trees: List[HeteroDecisionTreeGuest], learning_rate,
                       init_score, booster_dim, encrypt_key_length,
                       transfer_var: HeteroSecureBoostTransferVariable,
                       sitename=None, party_list=None, predict_cache=None,
                       pred_leaf=False):
    if sitename is None:
        raise ValueError('input sitename is None, not able to run EINI predict algorithm')
    if pred_leaf:
        raise ValueError('EINI predict mode does not support leaf idx prediction')

    # EINI algorithm
    id_pos_map_list = get_leaf_idx_map(trees)
    map_func = functools.partial(generate_leaf_candidates_guest,
                                 sitename=sitename,
                                 trees=trees,
                                 node_pos_map_list=id_pos_map_list,
                                 init_score=init_score,
                                 learning_rate=learning_rate,
                                 booster_dim=booster_dim)
    position_vec = data_inst.mapValues(map_func)

    # encryption
    encrypter = PaillierEncrypt()
    encrypter.generate_key(encrypt_key_length)
    encrypter_vec_table = position_vec.mapValues(encrypter.recursive_encrypt)

    # federation part
    # send to the first host party
    transfer_var.guest_predict_data.remote(encrypter_vec_table, idx=0,
                                           suffix='position_vec', role=consts.HOST)
    # get from the last host party
    result_table = transfer_var.host_predict_data.get(idx=len(party_list) - 1,
                                                      suffix='merge_result',
                                                      role=consts.HOST)

    # decode result
    result = result_table.mapValues(encrypter.recursive_decrypt)
    result = result.mapValues(lambda x: np.array(x))
    if predict_cache:
        result = result.join(predict_cache, lambda v1, v2: v1 + v2)

    return result
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.en_wx = session.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)],
            partition=48, include_key=False)
        # self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = session.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)],
            partition=48, include_key=False)
        self.wx = np.array([i for i in range(size)])
        self.w = self.wx / np.array([1 for _ in range(size)])
        self.data_inst = session.parallelize(
            [Instance(features=np.array([1 for _ in range(size)]), label=pow(-1, i % 2))
             for i in range(size)],
            partition=48, include_key=False)

        # test fore_gradient
        self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
        # test gradient
        self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125,
                         1.125, 1.125, 1.125, 1.125, 1.125]
        self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
                                       1.125, 1.125, 1.125, 1.125, 1.125]
        self.loss = 4.505647

    def test_compute_partition_gradient(self):
        fore_gradient = self.en_wx.join(self.data_inst,
                                        lambda wx, d: 0.25 * wx - 0.5 * d.label)
        sparse_data = self._make_sparse_data()
        gradient_computer = hetero_linear_model_gradient.HeteroGradientBase()
        for fit_intercept in [True, False]:
            dense_result = gradient_computer.compute_gradient(self.data_inst,
                                                              fore_gradient,
                                                              fit_intercept)
            dense_result = [self.paillier_encrypt.decrypt(iterator)
                            for iterator in dense_result]
            if fit_intercept:
                self.assertListEqual(dense_result, self.gradient_fit_intercept)
            else:
                self.assertListEqual(dense_result, self.gradient)

            sparse_result = gradient_computer.compute_gradient(sparse_data,
                                                               fore_gradient,
                                                               fit_intercept)
            sparse_result = [self.paillier_encrypt.decrypt(iterator)
                             for iterator in sparse_result]
            self.assertListEqual(dense_result, sparse_result)

    def _make_sparse_data(self):
        def trans_sparse(instance):
            dense_features = instance.features
            indices = [i for i in range(len(dense_features))]
            sparse_features = SparseVector(indices=indices,
                                           data=dense_features,
                                           shape=len(dense_features))
            return Instance(inst_id=None,
                            features=sparse_features,
                            label=instance.label)

        return self.data_inst.mapValues(trans_sparse)
class TestHomoLRGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.gradient_operator = LogisticGradient()
        self.taylor_operator = TaylorLogisticGradient()

        self.X = np.array([[1, 2, 3, 4, 5],
                           [3, 2, 4, 5, 1],
                           [2, 2, 3, 1, 1]]) / 10
        self.X1 = np.c_[self.X, np.ones(3)]
        self.Y = np.array([[1], [1], [-1]])

        self.values = []
        for idx, x in enumerate(self.X):
            inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
            self.values.append((idx, inst))

        self.values1 = []
        for idx, x in enumerate(self.X1):
            inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
            self.values1.append((idx, inst))

        self.coef = np.array([2, 2.3, 3, 4, 2.1]) / 10
        self.coef1 = np.append(self.coef, [1])

    def test_gradient_length(self):
        fit_intercept = False
        grad, loss = self.gradient_operator.compute(self.values, self.coef, 0,
                                                    fit_intercept)
        self.assertEqual(grad.shape[0], self.X.shape[1])

        taylor_grad, loss = self.taylor_operator.compute(self.values, self.coef, 0,
                                                         fit_intercept)
        self.assertEqual(taylor_grad.shape[0], self.X.shape[1])
        self.assertTrue(np.sum(grad - taylor_grad) < 0.0001)

        fit_intercept = True
        grad, loss = self.gradient_operator.compute(self.values, self.coef, 0,
                                                    fit_intercept)
        self.assertEqual(grad.shape[0], self.X.shape[1] + 1)

        taylor_grad, loss = self.taylor_operator.compute(self.values, self.coef, 0,
                                                         fit_intercept)
        self.assertEqual(taylor_grad.shape[0], self.X.shape[1] + 1)
        self.assertTrue(np.sum(grad - taylor_grad) < 0.0001)
def test_diff_mode(self, round=10, mode="strict", re_encrypted_rate=0.2):
    from federatedml.secureprotol.encrypt_mode import EncryptModeCalculator
    from federatedml.secureprotol import PaillierEncrypt

    encrypter = PaillierEncrypt()
    encrypter.generate_key(1024)
    encrypted_calculator = EncryptModeCalculator(encrypter, mode, re_encrypted_rate)

    for i in range(round):
        data_i = self.data_numpy.mapValues(lambda v: v + i)
        data_i = encrypted_calculator.encrypt(data_i)
        decrypt_data_i = dict(
            data_i.mapValues(
                lambda arr: np.array([encrypter.decrypt(val) for val in arr])
            ).collect())
        for j in range(30):
            # decrypt(encrypt(x + i)) should round-trip: the difference is ~0,
            # so the tolerance check must wrap the whole array comparison.
            self.assertTrue(
                (np.fabs(self.numpy_data[j] - decrypt_data_i[j] + i) < 1e-5).all())
def keygen(self, key_length, suffix=tuple()) -> dict:
    use_cipher = self._use_encrypt.get_parties(parties=self._client_parties,
                                               suffix=suffix)
    ciphers = dict()
    for party, use_encryption in zip(self._client_parties, use_cipher):
        if not use_encryption:
            ciphers[party] = None
        else:
            cipher = PaillierEncrypt()
            cipher.generate_key(key_length)
            pub_key = cipher.get_public_key()
            self._pailler_pubkey.remote_parties(obj=pub_key,
                                                parties=[party],
                                                suffix=suffix)
            ciphers[party] = cipher
    return ciphers
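The dict returned by keygen maps each client party to either None (no encryption) or a ready PaillierEncrypt. Below is a hedged sketch of how a caller might consume it, with plain strings standing in for FATE Party objects (hypothetical names).

from federatedml.secureprotol import PaillierEncrypt

# Hypothetical parties; in FATE these would be Party objects.
cipher = PaillierEncrypt()
cipher.generate_key(1024)
ciphers = {"host_0": None, "host_1": cipher}

for party, party_cipher in ciphers.items():
    if party_cipher is None:
        continue  # this party opted out of encryption
    enc = party_cipher.encrypt(3)
    assert party_cipher.decrypt(enc) == 3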
def test_encrypt_and_decrypt(self):
    from federatedml.secureprotol import PaillierEncrypt
    from federatedml.secureprotol.encrypt_mode import EncryptModeCalculator

    encrypter = PaillierEncrypt()
    encrypter.generate_key(1024)
    encrypted_calculator = EncryptModeCalculator(encrypter, "fast")

    encrypter_tensor = self.paillier_tensor1.encrypt(encrypted_calculator)
    decrypted_tensor = encrypter_tensor.decrypt(encrypter)

    self.assertTrue(isinstance(encrypter_tensor, PaillierTensor))
    self.assertTrue(isinstance(decrypted_tensor, PaillierTensor))

    arr = decrypted_tensor.numpy()
    self.assertTrue(abs(arr.sum() - 10000) < consts.FLOAT_ZERO)
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)

        size = 10
        self.wx = eggroll.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = eggroll.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = eggroll.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
             for i in range(size)],
            partition=1)

        # test fore_gradient
        self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
        # test gradient
        self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125,
                         1.125, 1.125, 1.125, 1.125, 1.125]
        self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
                                       1.125, 1.125, 1.125, 1.125, 1.125]
        self.loss = 4.505647

    def test_compute_fore_gradient(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)
        fore_gradient_local = [self.paillier_encrypt.decrypt(iterator[1])
                               for iterator in fore_gradient.collect()]
        self.assertListEqual(fore_gradient_local, self.fore_gradient_local)

    def test_compute_gradient(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst,
                                                            fore_gradient,
                                                            fit_intercept=False)
        de_gradient = [self.paillier_encrypt.decrypt(iterator) for iterator in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst,
                                                            fore_gradient,
                                                            fit_intercept=True)
        de_gradient = [self.paillier_encrypt.decrypt(iterator) for iterator in gradient]
        self.assertListEqual(de_gradient, self.gradient_fit_intercept)

    def test_compute_gradient_and_loss(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)
        gradient, loss = self.hetero_lr_gradient.compute_gradient_and_loss(
            self.data_inst, fore_gradient, self.wx, self.en_sum_wx_square, False)
        de_gradient = [self.paillier_encrypt.decrypt(i) for i in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        diff_loss = np.abs(self.loss - self.paillier_encrypt.decrypt(loss))
        self.assertLess(diff_loss, 1e-5)
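The fore_gradient_local fixture matches the Taylor-expanded residual 0.25*wx - 0.5*label used in the guest-side join above (see the 0.25 * wx - 0.5 * d.label lambda in the earlier test). A quick plaintext check that reproduces the fixture:

size = 10
wx = list(range(size))                          # mirrors setUp: encrypt(i) for i in range(size)
labels = [pow(-1, i % 2) for i in range(size)]
fore = [0.25 * w - 0.5 * y for w, y in zip(wx, labels)]
assert fore == [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]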
class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self, secureboost_tree_param):
        super(HeteroSecureBoostingTreeGuest, self).__init__(secureboost_tree_param)

        self.convegence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        self.flowid = 0
        self.tree_dim = 1
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None

        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, loss_type):
        LOGGER.info("set loss, loss type is {}".format(loss_type))
        if self.task_type == "classification":
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("Loss type %s not supported yet" % (self.loss_type))
        else:
            raise NotImplementedError("Loss type %s not supported yet" % (self.loss_type))

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            Quantile.convert_feature_to_bin(
                data_instance, self.quantile_method, self.bin_num,
                self.bin_gap, self.bin_sample_num)

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_flowid(self, flowid=0):
        LOGGER.info("set flowid, flowid is {}".format(flowid))
        self.flowid = flowid

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid")
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == "classification":
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            # labels should be integers in [0, num_classes); otherwise remap them
            range_from_zero = True
            for _class in self.classes_:
                try:
                    if _class >= 0 and _class < self.num_classes and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except BaseException:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])
        else:
            RegressionLabelChecker.validate_y(self.y)

        self.set_loss(self.loss_type)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == "paillier":
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet!!!")

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_f_value(self, new_f=None, tidx=-1):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if self.F is None:
            LOGGER.info("tree_dim is %d" % (self.tree_dim))
            tree_dim = self.tree_dim
            self.F = self.y.mapValues(lambda v: np.zeros(tree_dim))
        else:
            accumulate_func = functools.partial(self.accumulate_f,
                                                lr=self.learning_rate,
                                                idx=tidx)
            self.F = self.F.join(new_f, accumulate_func)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        self.grad_and_hess = self.y.join(
            self.F,
            lambda y, f_val: (loss_method.compute_grad(y, loss_method.predict(f_val)),
                              loss_method.compute_hess(y, loss_method.predict(f_val))))

    def compute_loss(self):
        LOGGER.info("compute loss")
        loss_method = self.loss
        y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
        loss = loss_method.compute_loss(self.y, y_predict)
        return loss

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess: (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convegence is None:
            self.convegence = DiffConverge()
        return self.convegence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        federation.remote(obj=self.tree_dim,
                          name=self.transfer_inst.tree_dim.name,
                          tag=self.transfer_inst.generate_transferid(self.transfer_inst.tree_dim),
                          role=consts.HOST,
                          idx=0)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info("sync stop flag to host, boosting round is {}".format(num_round))
        federation.remote(obj=stop_flag,
                          name=self.transfer_inst.stop_flag.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.stop_flag, num_round),
                          role=consts.HOST,
                          idx=0)

    def fit(self, data_inst):
        LOGGER.info("begin to train secureboosting guest model")
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()
        self.sync_tree_dim()

        for i in range(self.num_trees):
            n_tree = []
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.set_inputinfo(self.data_bin,
                                        self.get_grad_and_hess(tidx),
                                        self.bin_split_points,
                                        self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                tree_inst.fit()
                n_tree.append(tree_inst.get_tree_model())
                self.update_f_value(tree_inst.predict_weights, tidx)

            self.trees_.append(n_tree)
            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value")
        tree_dim = self.tree_dim
        self.F = data_inst.mapValues(lambda v: np.zeros(tree_dim))
        for i in range(len(self.trees_)):
            n_tree = self.trees_[i]
            for tidx in range(len(n_tree)):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.set_tree_model(n_tree[tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(predict_data, tidx)

    def predict(self, data_inst, predict_param):
        LOGGER.info("start predict")
        self.predict_f_value(data_inst)
        loss_method = self.loss
        predicts = self.F.mapValues(lambda f: loss_method.predict(f))

        if self.task_type == "classification":
            classes_ = self.classes_
            if self.num_classes == 2:
                predict_label = predicts.mapValues(
                    lambda pred: classes_[1] if pred > predict_param.threshold else classes_[0])
            else:
                predict_label = predicts.mapValues(lambda preds: classes_[np.argmax(preds)])

            if predict_param.with_proba:
                predict_result = data_inst.join(
                    predicts, lambda inst, predict_prob: (inst.label, predict_prob))
            else:
                predict_result = data_inst.mapValues(lambda inst: inst.label)

            predict_result = predict_result.join(
                predict_label,
                lambda label_prob, predict_label: (label_prob[0], label_prob[1], predict_label))
        else:
            raise NotImplementedError("task type %s not supported yet" % (self.task_type))

        LOGGER.info("end predict")
        return predict_result

    def save_model(self, model_table, model_namespace):
        LOGGER.info("save model")
        modelmeta = BoostingTreeModelMeta()
        modelmeta.trees_ = self.trees_
        modelmeta.loss_type = self.loss_type
        modelmeta.tree_dim = self.tree_dim
        modelmeta.task_type = self.task_type
        modelmeta.num_classes = self.num_classes
        modelmeta.classes_ = self.classes_
        modelmeta.loss = self.history_loss

        model = eggroll.parallelize([modelmeta], include_key=False)
        model.save_as(model_table, model_namespace)

    def load_model(self, model_table, model_namespace):
        LOGGER.info("load model")
        modelmeta = list(eggroll.table(model_table, model_namespace).collect())[0][1]

        self.task_type = modelmeta.task_type
        self.loss_type = modelmeta.loss_type
        self.tree_dim = modelmeta.tree_dim
        self.num_classes = modelmeta.num_classes
        self.trees_ = modelmeta.trees_
        self.classes_ = modelmeta.classes_
        self.history_loss = modelmeta.loss

        self.set_loss(self.loss_type)

    def evaluate(self, labels, pred_prob, pred_labels, evaluate_param):
        LOGGER.info("evaluate data")
        predict_res = None
        if evaluate_param.classi_type == consts.BINARY:
            predict_res = pred_prob
        elif evaluate_param.classi_type == consts.MULTY:
            predict_res = pred_labels
        else:
            LOGGER.warning("unknown classification type, return None as evaluation results")

        eva = Evaluation(evaluate_param.classi_type)
        return eva.report(labels, predict_res, evaluate_param.metrics,
                          evaluate_param.thresholds, evaluate_param.pos_label)
class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self):
        super(HeteroSecureBoostingTreeGuest, self).__init__()

        self.convegence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.init_score = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        # self.flowid = 0
        self.tree_dim = 1
        self.tree_meta = None
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None
        self.encrypted_mode_calculator = None
        self.runtime_idx = 0
        self.feature_importances_ = {}
        self.role = consts.GUEST

        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % (loss_type))
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % (loss_type))
        else:
            raise NotImplementedError("objective %s not supported yet" % (loss_type))

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            binning_obj.convert_feature_to_bin(data_instance)
        LOGGER.info("convert feature to bins over")

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_runtime_idx(self, runtime_idx):
        self.runtime_idx = runtime_idx

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid, flowid {}".format(self.flowid))
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == consts.CLASSIFICATION:
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            # labels should be integers in [0, num_classes); otherwise remap them
            range_from_zero = True
            for _class in self.classes_:
                try:
                    if _class >= 0 and _class < self.num_classes and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except BaseException:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])
        else:
            RegressionLabelChecker.validate_y(self.y)

        self.set_loss(self.objective_param)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == consts.PAILLIER:
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet!!!")

        self.encrypted_calculator = EncryptModeCalculator(self.encrypter,
                                                          self.calculated_mode,
                                                          self.re_encrypted_rate)

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_feature_importance(self, tree_feature_importance):
        for fid in tree_feature_importance:
            if fid not in self.feature_importances_:
                self.feature_importances_[fid] = 0
            self.feature_importances_[fid] += tree_feature_importance[fid]

    def update_f_value(self, new_f=None, tidx=-1):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if self.F is None:
            if self.tree_dim > 1:
                self.F, self.init_score = self.loss.initialize(self.y, self.tree_dim)
            else:
                self.F, self.init_score = self.loss.initialize(self.y)
        else:
            accumulate_func = functools.partial(self.accumulate_f,
                                                lr=self.learning_rate,
                                                idx=tidx)
            self.F = self.F.join(new_f, accumulate_func)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        if self.task_type == consts.CLASSIFICATION:
            self.grad_and_hess = self.y.join(
                self.F,
                lambda y, f_val: (loss_method.compute_grad(y, loss_method.predict(f_val)),
                                  loss_method.compute_hess(y, loss_method.predict(f_val))))
        else:
            self.grad_and_hess = self.y.join(
                self.F,
                lambda y, f_val: (loss_method.compute_grad(y, f_val),
                                  loss_method.compute_hess(y, f_val)))

    def compute_loss(self):
        LOGGER.info("compute loss")
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
            loss = loss_method.compute_loss(self.y, y_predict)
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in ["lse", "lae", "logcosh",
                                                  "tweedie", "log_cosh", "huber"]:
                loss_method = self.loss
                loss = loss_method.compute_loss(self.y, self.F)
            else:
                loss_method = self.loss
                y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
                loss = loss_method.compute_loss(self.y, y_predict)

        return float(loss)

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess: (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convegence is None:
            self.convegence = DiffConverge(eps=self.tol)
        return self.convegence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        federation.remote(obj=self.tree_dim,
                          name=self.transfer_inst.tree_dim.name,
                          tag=self.transfer_inst.generate_transferid(self.transfer_inst.tree_dim),
                          role=consts.HOST,
                          idx=-1)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info("sync stop flag to host, boosting round is {}".format(num_round))
        federation.remote(obj=stop_flag,
                          name=self.transfer_inst.stop_flag.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.stop_flag, num_round),
                          role=consts.HOST,
                          idx=-1)

    def fit(self, data_inst):
        LOGGER.info("begin to train secureboosting guest model")
        self.gen_feature_fid_mapping(data_inst.schema)
        data_inst = self.data_alignment(data_inst)
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()
        self.sync_tree_dim()
        self.callback_meta("loss", "train",
                           MetricMeta(name="train",
                                      metric_type="LOSS",
                                      extra_metas={"unit_name": "iters"}))

        for i in range(self.num_trees):
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.set_inputinfo(self.data_bin,
                                        self.get_grad_and_hess(tidx),
                                        self.bin_split_points,
                                        self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_encrypted_mode_calculator(self.encrypted_calculator)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                tree_inst.fit()

                tree_meta, tree_param = tree_inst.get_model()
                self.trees_.append(tree_param)
                if self.tree_meta is None:
                    self.tree_meta = tree_meta

                self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx)
                self.update_feature_importance(tree_inst.get_feature_importance())

            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))
            self.callback_metric("loss", "train", [Metric(i, loss)])

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.debug("history loss is {}".format(min(self.history_loss)))
        self.callback_meta("loss", "train",
                           MetricMeta(name="train",
                                      metric_type="LOSS",
                                      extra_metas={"Best": min(self.history_loss)}))

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value, there are {} trees".format(len(self.trees_)))
        tree_dim = self.tree_dim
        init_score = self.init_score
        self.F = data_inst.mapValues(lambda v: init_score)
        rounds = len(self.trees_) // self.tree_dim
        for i in range(rounds):
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.load_model(self.tree_meta, self.trees_[i * self.tree_dim + tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(new_f=predict_data, tidx=tidx)

    def predict(self, data_inst):
        LOGGER.info("start predict")
        data_inst = self.data_alignment(data_inst)
        self.predict_f_value(data_inst)

        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            if self.num_classes == 2:
                predicts = self.F.mapValues(lambda f: float(loss_method.predict(f)))
            else:
                predicts = self.F.mapValues(lambda f: loss_method.predict(f).tolist())
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in ["lse", "lae", "huber",
                                                  "log_cosh", "fair", "tweedie"]:
                predicts = self.F
            else:
                raise NotImplementedError(
                    "objective {} not supported yet".format(self.objective_param.objective))

        if self.task_type == consts.CLASSIFICATION:
            classes_ = self.classes_
            if self.num_classes == 2:
                threshold = self.predict_param.threshold
                predict_result = data_inst.join(
                    predicts,
                    lambda inst, pred: [inst.label,
                                        classes_[1] if pred > threshold else classes_[0],
                                        pred,
                                        {"0": 1 - pred, "1": pred}])
            else:
                predict_result = data_inst.join(
                    predicts,
                    lambda inst, preds: [inst.label,
                                         classes_[np.argmax(preds)],
                                         np.max(preds),
                                         dict(zip(map(str, classes_), preds))])
        elif self.task_type == consts.REGRESSION:
            predict_result = data_inst.join(
                predicts,
                lambda inst, pred: [inst.label, float(pred), float(pred),
                                    {"label": float(pred)}])
        else:
            raise NotImplementedError("task type {} not supported yet".format(self.task_type))

        LOGGER.info("end predict")
        return predict_result

    def get_feature_importance(self):
        return self.feature_importances_

    def get_model_meta(self):
        model_meta = BoostingTreeModelMeta()

        model_meta.tree_meta.CopyFrom(self.tree_meta)
        model_meta.learning_rate = self.learning_rate
        model_meta.num_trees = self.num_trees
        model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num))
        model_meta.objective_meta.CopyFrom(
            ObjectiveMeta(objective=self.objective_param.objective,
                          param=self.objective_param.params))
        model_meta.task_type = self.task_type
        model_meta.tree_dim = self.tree_dim
        model_meta.n_iter_no_change = self.n_iter_no_change
        model_meta.tol = self.tol
        model_meta.num_classes = self.num_classes
        model_meta.classes_.extend(map(str, self.classes_))
        model_meta.need_run = self.need_run

        meta_name = "HeteroSecureBoostingTreeGuestMeta"
        return meta_name, model_meta

    def set_model_meta(self, model_meta):
        self.tree_meta = model_meta.tree_meta
        self.learning_rate = model_meta.learning_rate
        self.num_trees = model_meta.num_trees
        self.bin_num = model_meta.quantile_meta.bin_num
        self.objective_param.objective = model_meta.objective_meta.objective
        self.objective_param.params = list(model_meta.objective_meta.param)
        self.task_type = model_meta.task_type
        self.tree_dim = model_meta.tree_dim
        self.num_classes = model_meta.num_classes
        self.n_iter_no_change = model_meta.n_iter_no_change
        self.tol = model_meta.tol
        self.classes_ = list(model_meta.classes_)

        self.set_loss(self.objective_param)

    def get_model_param(self):
        model_param = BoostingTreeModelParam()

        model_param.tree_num = len(list(self.trees_))
        model_param.trees_.extend(self.trees_)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.history_loss)

        feature_importances = list(self.get_feature_importance().items())
        feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True)
        feature_importance_param = []
        for (sitename, fid), _importance in feature_importances:
            feature_importance_param.append(
                FeatureImportanceInfo(sitename=sitename,
                                      fid=fid,
                                      importance=_importance))
        model_param.feature_importances.extend(feature_importance_param)

        model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping)

        param_name = "HeteroSecureBoostingTreeGuestParam"
        return param_name, model_param

    def set_model_param(self, model_param):
        self.trees_ = list(model_param.trees_)
        self.init_score = np.array(list(model_param.init_score))
        self.history_loss = list(model_param.losses)

    def export_model(self):
        meta_name, meta_protobuf = self.get_model_meta()
        param_name, param_protobuf = self.get_model_param()

        self.model_output = {meta_name: meta_protobuf,
                             param_name: param_protobuf}
        return self.model_output

    def _load_model(self, model_dict):
        model_param = None
        model_meta = None
        for _, value in model_dict["model"].items():
            for model in value:
                if model.endswith("Meta"):
                    model_meta = value[model]
                if model.endswith("Param"):
                    model_param = value[model]

        LOGGER.info("load model")
        self.set_model_meta(model_meta)
        self.set_model_param(model_param)
def fit(self, data_instances):
    """
    Apply the binning method to data instances in the local party as well as
    the other party's. Afterwards, calculate the metric value for the
    specified columns. Currently, iv is supported for binary-labeled data only.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)
    # self._parse_cols(data_instances)
    self._setup_bin_inner_param(data_instances, self.model_param)

    if self.model_param.method == consts.OPTIMAL:
        has_missing_value = self.iv_calculator.check_containing_missing_value(data_instances)
        for idx in self.bin_inner_param.bin_indexes:
            if idx in has_missing_value:
                raise ValueError("Optimal Binning does not support missing values yet.")

    split_points = self.binning_obj.fit_split_points(data_instances)
    if self.model_param.skip_static:
        self.transform(data_instances)
        return self.data_output

    label_counts_dict = data_overview.get_label_count(data_instances)
    if len(label_counts_dict) > 2:
        if self.model_param.method == consts.OPTIMAL:
            raise ValueError("Optimal binning is not supported for multi-class data yet")

    self.labels = list(label_counts_dict.keys())
    label_counts = [label_counts_dict[k] for k in self.labels]
    label_table = IvCalculator.convert_label(data_instances, self.labels)

    self.bin_result = self.iv_calculator.cal_local_iv(
        data_instances=data_instances,
        split_points=split_points,
        labels=self.labels,
        label_counts=label_counts,
        bin_cols_map=self.bin_inner_param.get_need_cal_iv_cols_map(),
        label_table=label_table)

    if self.model_param.local_only:
        self.transform(data_instances)
        self.set_summary(self.bin_result.summary())
        return self.data_output

    if self.model_param.encrypt_param.method == consts.PAILLIER:
        paillier_encryptor = PaillierEncrypt()
        paillier_encryptor.generate_key(self.model_param.encrypt_param.key_length)
        cipher = EncryptModeCalculator(encrypter=paillier_encryptor)
    else:
        raise NotImplementedError("encrypt method not supported yet")

    self._packer = GuestIntegerPacker(pack_num=len(self.labels),
                                      pack_num_range=label_counts,
                                      encrypt_mode_calculator=cipher)

    self.federated_iv(data_instances=data_instances,
                      label_table=label_table,
                      cipher=cipher,
                      result_counts=label_counts_dict,
                      label_elements=self.labels)

    total_summary = self.bin_result.summary()
    for host_res in self.host_results:
        total_summary = self._merge_summary(total_summary, host_res.summary())

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    self.set_summary(total_summary)
    return self.data_output
def fit(self, data_instances):
    """
    Apply the binning method to data instances in the local party as well as
    the other party's. Afterwards, calculate the metric value for the
    specified columns. Currently, iv is supported for binary-labeled data only.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)
    # self._parse_cols(data_instances)
    self._setup_bin_inner_param(data_instances, self.model_param)

    self.binning_obj.fit_split_points(data_instances)

    label_counts = data_overview.count_labels(data_instances)
    if label_counts > 2:
        raise ValueError("IV calculation supports binary data only in this version.")

    data_instances = data_instances.mapValues(self.load_data)
    self.set_schema(data_instances)
    label_table = data_instances.mapValues(lambda x: x.label)

    if self.model_param.local_only:
        LOGGER.info("This is a local only binning fit")
        self.binning_obj.cal_local_iv(data_instances, label_table=label_table)
        self.transform(data_instances)
        return self.data_output

    cipher = PaillierEncrypt()
    cipher.generate_key()

    f = functools.partial(self.encrypt, cipher=cipher)
    encrypted_label_table = label_table.mapValues(f)

    self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                  role=consts.HOST,
                                                  idx=-1)
    LOGGER.info("Sent encrypted_label_table to host")

    self.binning_obj.cal_local_iv(data_instances, label_table=label_table)

    encrypted_bin_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
    # LOGGER.debug("encrypted_bin_sums: {}".format(encrypted_bin_sums))
    LOGGER.info("Get encrypted_bin_sum from host")

    for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
        host_party_id = self.component_properties.host_party_idlist[host_idx]
        encrypted_bin_sum = encrypted_bin_info['encrypted_bin_sum']
        host_bin_methods = encrypted_bin_info['bin_method']
        category_names = encrypted_bin_info['category_names']
        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum, cipher)
        LOGGER.debug("Received host {} result, length of buckets: {}".format(
            host_idx, len(result_counts)))
        LOGGER.debug("category_name: {}, host_bin_methods: {}".format(
            category_names, host_bin_methods))

        # if self.model_param.method == consts.OPTIMAL:
        if host_bin_methods == consts.OPTIMAL:
            optimal_binning_params = encrypted_bin_info['optimal_params']

            host_model_params = copy.deepcopy(self.model_param)
            host_model_params.bin_num = optimal_binning_params.get('bin_num')
            host_model_params.optimal_binning_param.metric_method = \
                optimal_binning_params.get('metric_method')
            host_model_params.optimal_binning_param.mixture = \
                optimal_binning_params.get('mixture')
            host_model_params.optimal_binning_param.max_bin_pct = \
                optimal_binning_params.get('max_bin_pct')
            host_model_params.optimal_binning_param.min_bin_pct = \
                optimal_binning_params.get('min_bin_pct')

            self.binning_obj.event_total, self.binning_obj.non_event_total = \
                self.get_histogram(data_instances)
            optimal_binning_cols = {x: y for x, y in result_counts.items()
                                    if x not in category_names}
            host_binning_obj = self.optimal_binning_sync(optimal_binning_cols,
                                                         data_instances.count(),
                                                         data_instances._partitions,
                                                         host_idx,
                                                         host_model_params)
            category_bins = {x: y for x, y in result_counts.items()
                             if x in category_names}
            host_binning_obj.cal_iv_woe(category_bins,
                                        self.model_param.adjustment_factor)
        else:
            host_binning_obj = BaseBinning()
            host_binning_obj.cal_iv_woe(result_counts,
                                        self.model_param.adjustment_factor)
        host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
        self.host_results.append(host_binning_obj)

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    return self.data_output
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    def __init__(self):
        super(HeteroFeatureBinningGuest, self).__init__()

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.local_transform_result = None
        self.party_name = consts.GUEST
        # self._init_binning_obj()

    def fit(self, data_instances):
        """
        Apply the binning method to data instances in the local party as well as
        the other party's. Afterwards, calculate the metric value for the
        specified columns. Currently, iv is supported for binary-labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        self.binning_obj.fit_split_points(data_instances)
        LOGGER.debug("After fit, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        is_binary_data = data_overview.is_binary_labels(data_instances)
        if not is_binary_data:
            LOGGER.warning("Iv is not supported for multi-label data.")
            # data_instances = self.fit_local(data_instances)
            return data_instances

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=0)
        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculate self's binning. In case the other party needs time to
        # compute its data, do the local binning calculation at this point.
        data_instances = self.fit_local(data_instances, label_table)

        # 5. Receive host results and calculate iv values
        encrypted_bin_sum = self.transfer_variable.encrypted_bin_sum.get(idx=0)
        LOGGER.info("Get encrypted_bin_sum from host")

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(result_counts,
                                                    self.model_param.adjustment_factor)

        # Support one host only in this version. Multiple hosts will be
        # supported in the future.
        self.host_results[consts.HOST] = host_iv_attrs

        self.set_schema(data_instances)
        LOGGER.debug("Before transform, binning_obj split_points: {}".format(
            self.binning_obj.split_points))
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output

    @staticmethod
    def encrypt(x, encryptor):
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        split_points = {}
        for col_name, iv_attr in self.binning_result.items():
            split_points[col_name] = iv_attr.split_points

        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances,
            split_points=split_points,
            label_table=label_table)

        for col_name, col_index in self.local_transform_result.items():
            LOGGER.info("The local feature {}'s iv is {}".format(
                col_name, self.local_transform_result[col_name].iv))
        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        pub_key = self.encryptor.get_public_key()
        self.transfer_variable.paillier_pubkey.remote(pub_key,
                                                      role=consts.HOST,
                                                      idx=0)
        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        for col_name, count_list in encrypted_bin_sum.items():
            new_list = []
            for encrypted_event, encrypted_non_event in count_list:
                event_count = self.encryptor.decrypt(encrypted_event)
                non_event_count = self.encryptor.decrypt(encrypted_non_event)
                new_list.append((event_count, non_event_count))
            encrypted_bin_sum[col_name] = new_list
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 label_table=label_table)
        self.binning_result = iv_attrs
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        # Assume a binary problem where the event label is 1;
        # map every other label to 0.
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
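After __decrypt_bin_sum, cal_iv_woe consumes per-bin (event_count, non_event_count) pairs. The snippet below is a standalone sketch of the textbook WOE/IV computation on such pairs; FATE's exact convention (for example, how adjustment_factor smooths empty bins) may differ, and the counts are made up for illustration.

import math

# Made-up decrypted counts: one (event_count, non_event_count) pair per bin.
bins = [(30, 70), (60, 40)]
event_total = sum(event for event, _ in bins)
non_event_total = sum(non_event for _, non_event in bins)

iv = 0.0
for event, non_event in bins:
    event_rate = event / event_total
    non_event_rate = non_event / non_event_total
    woe = math.log(event_rate / non_event_rate)   # weight of evidence per bin
    iv += (event_rate - non_event_rate) * woe     # IV accumulates over bins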
def setUp(self):
    paillier_encrypt = PaillierEncrypt()
    paillier_encrypt.generate_key()
    self.publickey = paillier_encrypt.get_public_key()
    self.privatekey = paillier_encrypt.get_privacy_key()
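The split into publickey/privatekey suggests the usual python-paillier style API, where the public key encrypts and the private key decrypts; the round trip below rests on that assumption.

from federatedml.secureprotol import PaillierEncrypt

paillier_encrypt = PaillierEncrypt()
paillier_encrypt.generate_key()
public_key = paillier_encrypt.get_public_key()
private_key = paillier_encrypt.get_privacy_key()

# Assumed API: encrypt on the public key, decrypt on the private key.
assert abs(private_key.decrypt(public_key.encrypt(2.5)) - 2.5) < 1e-8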
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    def __init__(self, params: FeatureBinningParam):
        super(HeteroFeatureBinningGuest, self).__init__(params)

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.iv_attrs = None
        self.host_iv_attrs = None

    def fit(self, data_instances):
        """
        Apply the binning method to data instances in the local party as well as
        the other party's. Afterwards, calculate the metric value for the
        specified columns.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)

        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculate self's binning. In case the other party needs time to
        # compute its data, do the local binning calculation at this point.
        local_iv = self.fit_local(data_instances, label_table)

        # 5. Receive host results and calculate iv values
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        encrypted_bin_sum = federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)
        LOGGER.info("Get encrypted_bin_sum from host")

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(result_counts,
                                                    self.bin_param.adjustment_factor)
        self.host_iv_attrs = host_iv_attrs
        # LOGGER.debug("Length of host iv attrs: {}".format(len(self.host_iv_attrs)))
        # for idx, col in enumerate(self.cols):
        #     LOGGER.info("The local iv of {}th feature is {}".format(col, local_iv[idx].iv))

        for idx, iv_attr in enumerate(host_iv_attrs):
            LOGGER.info("The remote iv of {}th measured feature is {}".format(idx, iv_attr.iv))

        iv_result = {'local': local_iv, 'remote': host_iv_attrs}
        return iv_result

    def transform(self, data_instances):
        self._abnormal_detection(data_instances)
        self.header = data_instances.schema.get('header')  # ['x1', 'x2', 'x3' ...]
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)

        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally
        self.transform_local(data_instances, reformated=True)

        # 5. Receive host results and calculate iv values
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        encrypted_bin_sum = federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)
        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(result_counts,
                                                    self.bin_param.adjustment_factor)
        self.host_iv_attrs = host_iv_attrs
        for idx, iv_attr in enumerate(host_iv_attrs):
            LOGGER.info("The remote iv of {}th measured feature is {}".format(idx, iv_attr.iv))

        data_instances.schema['header'] = self.header
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, reformated=False):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)
        if not reformated:  # Reformat the label type
            data_instances = data_instances.mapValues(self.load_data)

        split_points = []
        for iv_attr in self.iv_attrs:
            s_p = list(iv_attr.split_points)
            split_points.append(s_p)

        self.iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                      self.cols,
                                                      split_points)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(col, self.iv_attrs[idx].iv))

    def __synchronize_encryption(self):
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)
        # LOGGER.debug("pubkey_id is : {}".format(pubkey_id))
        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        for feature_sum in encrypted_bin_sum:
            for idx, (encrypted_event, encrypted_non_event) in enumerate(feature_sum):
                event_count = self.encryptor.decrypt(encrypted_event)
                non_event_count = self.encryptor.decrypt(encrypted_non_event)
                feature_sum[idx] = (event_count, non_event_count)
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 self.cols,
                                                 label_table=label_table)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(col, iv_attrs[idx].iv))
        self.iv_attrs = iv_attrs
        return iv_attrs

    @staticmethod
    def load_data(data_instance):
        # Assume a binary problem where the event label is 1;
        # map every other label to 0.
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
class HeteroLRBase(BaseLinearModel, ABC):
    def __init__(self):
        super().__init__()
        self.model_name = 'HeteroSSHELogisticRegression'
        self.model_param_name = 'HeteroSSHELogisticRegressionParam'
        self.model_meta_name = 'HeteroSSHELogisticRegressionMeta'
        self.mode = consts.HETERO
        self.cipher = None
        self.q_field = None
        self.model_param = LogisticRegressionParam()
        self.labels = None
        self.batch_num = []
        self.one_vs_rest_obj = None
        self.secure_matrix_obj: SecureMatrix
        self._set_parties()
        self.cipher_tool = None

    def _transfer_q_field(self):
        if self.role == consts.GUEST:
            q_field = self.cipher.public_key.n
            self.transfer_variable.q_field.remote(q_field,
                                                  role=consts.HOST,
                                                  suffix=("q_field",))
        else:
            q_field = self.transfer_variable.q_field.get(role=consts.GUEST,
                                                         idx=0,
                                                         suffix=("q_field",))
        return q_field

    def _init_model(self, params: LogisticRegressionParam):
        super()._init_model(params)
        self.encrypted_mode_calculator_param = params.encrypted_mode_calculator_param
        if self.role == consts.HOST:
            self.init_param_obj.fit_intercept = False
        self.cipher = PaillierEncrypt()
        self.cipher.generate_key(self.model_param.encrypt_param.key_length)
        self.transfer_variable = SSHEModelTransferVariable()

        self.one_vs_rest_obj = one_vs_rest_factory(self,
                                                   role=self.role,
                                                   mode=self.mode,
                                                   has_arbiter=False)
        self.converge_func_name = params.early_stop
        self.reveal_every_iter = params.reveal_every_iter
        self.q_field = self._transfer_q_field()
        LOGGER.debug(f"q_field: {self.q_field}")

        if not self.reveal_every_iter:
            self.self_optimizer = copy.deepcopy(self.optimizer)
            self.remote_optimizer = copy.deepcopy(self.optimizer)

        self.batch_generator = (batch_generator.Guest() if self.role == consts.GUEST
                                else batch_generator.Host())
        self.batch_generator.register_batch_generator(BatchGeneratorTransferVariable(),
                                                      has_arbiter=False)
        self.fixedpoint_encoder = FixedPointEndec(n=self.q_field)
        self.converge_transfer_variable = ConvergeCheckerTransferVariable()
        self.secure_matrix_obj = SecureMatrix(party=self.local_party,
                                              q_field=self.q_field,
                                              other_party=self.other_party)

    def _init_weights(self, model_shape):
        return self.initializer.init_model(model_shape, init_params=self.init_param_obj)

    def _set_parties(self):
        parties = []
        guest_parties = get_parties().roles_to_parties(["guest"])
        host_parties = get_parties().roles_to_parties(["host"])
        parties.extend(guest_parties)
        parties.extend(host_parties)

        local_party = get_parties().local_party
        other_party = parties[0] if parties[0] != local_party else parties[1]

        self.parties = parties
        self.local_party = local_party
        self.other_party = other_party

    @property
    def is_respectively_reveal(self):
        return self.model_param.reveal_strategy == "respectively"

    def share_model(self, w, suffix):
        source = [w, self.other_party]
        if self.local_party.role == consts.GUEST:
            wb, wa = (
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wb_{suffix}", source[0],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wa_{suffix}", source[1],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
            )
            return wb, wa
        else:
            wa, wb = (
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wa_{suffix}", source[0],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wb_{suffix}", source[1],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
            )
            return wa, wb

    def forward(self, weights, features, suffix, cipher):
        raise NotImplementedError("Should not call here")

    def backward(self, error, features, suffix, cipher):
        raise NotImplementedError("Should not call here")

    def compute_loss(self, weights, suffix, cipher):
        raise NotImplementedError("Should not call here")

    def fit(self, data_instances, validate_data=None):
        self.header = data_instances.schema.get("header", [])
        self._abnormal_detection(data_instances)
        self.check_abnormal_values(data_instances)
        self.check_abnormal_values(validate_data)

        classes = self.one_vs_rest_obj.get_data_classes(data_instances)
        if len(classes) > 2:
            self.need_one_vs_rest = True
            self.need_call_back_loss = False
            self.one_vs_rest_fit(train_data=data_instances, validate_data=validate_data)
        else:
            self.need_one_vs_rest = False
            self.fit_binary(data_instances, validate_data)

    def one_vs_rest_fit(self, train_data=None, validate_data=None):
        LOGGER.info("Class num larger than 2, do one_vs_rest")
        self.one_vs_rest_obj.fit(data_instances=train_data, validate_data=validate_data)

    def fit_binary(self, data_instances, validate_data=None):
        LOGGER.info("Starting to hetero_sshe_logistic_regression")
        self.callback_list.on_train_begin(data_instances, validate_data)

        model_shape = self.get_features_shape(data_instances)
        instances_count = data_instances.count()

        if not self.component_properties.is_warm_start:
            w = self._init_weights(model_shape)
            self.model_weights = LinearModelWeights(
                l=w, fit_intercept=self.model_param.init_param.fit_intercept)
            last_models = copy.deepcopy(self.model_weights)
        else:
            last_models = copy.deepcopy(self.model_weights)
            w = last_models.unboxed
            self.callback_warm_start_init_iter(self.n_iter_)

        self.batch_generator.initialize_batch_generator(data_instances,
                                                        batch_size=self.batch_size)

        with SPDZ("sshe_lr",
                  local_party=self.local_party,
                  all_parties=self.parties,
                  q_field=self.q_field,
                  use_mix_rand=self.model_param.use_mix_rand) as spdz:
            spdz.set_flowid(self.flowid)
            self.secure_matrix_obj.set_flowid(self.flowid)

            if self.role == consts.GUEST:
                self.labels = data_instances.mapValues(
                    lambda x: np.array([x.label], dtype=int))

            w_self, w_remote = self.share_model(w, suffix="init")
            last_w_self, last_w_remote = w_self, w_remote
            LOGGER.debug(f"first_w_self shape: {w_self.shape}, "
                         f"w_remote_shape: {w_remote.shape}")

            batch_data_generator = self.batch_generator.generate_batch_data()

            self.cipher_tool = []
            encoded_batch_data = []
            for batch_data in batch_data_generator:
                if self.fit_intercept:
                    batch_features = batch_data.mapValues(
                        lambda x: np.hstack((x.features, 1.0)))
                else:
                    batch_features = batch_data.mapValues(lambda x: x.features)
                self.batch_num.append(batch_data.count())

                encoded_batch_data.append(
                    fixedpoint_table.FixedPointTensor(
                        self.fixedpoint_encoder.encode(batch_features),
                        q_field=self.fixedpoint_encoder.n,
                        endec=self.fixedpoint_encoder))
                self.cipher_tool.append(
                    EncryptModeCalculator(
                        self.cipher,
                        self.encrypted_mode_calculator_param.mode,
                        self.encrypted_mode_calculator_param.re_encrypted_rate))

            while self.n_iter_ < self.max_iter:
                self.callback_list.on_epoch_begin(self.n_iter_)
                LOGGER.info(f"start to n_iter: {self.n_iter_}")

                loss_list = []

                self.optimizer.set_iters(self.n_iter_)
                if not self.reveal_every_iter:
                    self.self_optimizer.set_iters(self.n_iter_)
                    self.remote_optimizer.set_iters(self.n_iter_)

                for batch_idx, batch_data in enumerate(encoded_batch_data):
                    current_suffix = (str(self.n_iter_), str(batch_idx))

                    if self.reveal_every_iter:
                        y = self.forward(weights=self.model_weights,
                                         features=batch_data,
                                         suffix=current_suffix,
                                         cipher=self.cipher_tool[batch_idx])
                    else:
                        y = self.forward(weights=(w_self, w_remote),
                                         features=batch_data,
                                         suffix=current_suffix,
                                         cipher=self.cipher_tool[batch_idx])

                    if self.role == consts.GUEST:
                        error =
y - self.labels self_g, remote_g = self.backward( error=error, features=batch_data, suffix=current_suffix, cipher=self.cipher_tool[batch_idx]) else: self_g, remote_g = self.backward( error=y, features=batch_data, suffix=current_suffix, cipher=self.cipher_tool[batch_idx]) # loss computing; suffix = ("loss", ) + current_suffix if self.reveal_every_iter: batch_loss = self.compute_loss( weights=self.model_weights, suffix=suffix, cipher=self.cipher_tool[batch_idx]) else: batch_loss = self.compute_loss( weights=(w_self, w_remote), suffix=suffix, cipher=self.cipher_tool[batch_idx]) if batch_loss is not None: batch_loss = batch_loss * self.batch_num[batch_idx] loss_list.append(batch_loss) if self.reveal_every_iter: # LOGGER.debug(f"before reveal: self_g shape: {self_g.shape}, remote_g_shape: {remote_g}," # f"self_g: {self_g}") new_g = self.reveal_models(self_g, remote_g, suffix=current_suffix) # LOGGER.debug(f"after reveal: new_g shape: {new_g.shape}, new_g: {new_g}" # f"self.model_param.reveal_strategy: {self.model_param.reveal_strategy}") if new_g is not None: self.model_weights = self.optimizer.update_model( self.model_weights, new_g, has_applied=False) else: self.model_weights = LinearModelWeights( l=np.zeros(self_g.shape), fit_intercept=self.model_param.init_param. fit_intercept) else: if self.optimizer.penalty == consts.L2_PENALTY: self_g = self_g + self.self_optimizer.alpha * w_self remote_g = remote_g + self.remote_optimizer.alpha * w_remote # LOGGER.debug(f"before optimizer: {self_g}, {remote_g}") self_g = self.self_optimizer.apply_gradients(self_g) remote_g = self.remote_optimizer.apply_gradients( remote_g) # LOGGER.debug(f"after optimizer: {self_g}, {remote_g}") w_self -= self_g w_remote -= remote_g LOGGER.debug( f"w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}" ) if self.role == consts.GUEST: loss = np.sum(loss_list) / instances_count self.loss_history.append(loss) if self.need_call_back_loss: self.callback_loss(self.n_iter_, loss) else: loss = None if self.converge_func_name in ["diff", "abs"]: self.is_converged = self.check_converge_by_loss( loss, suffix=(str(self.n_iter_), )) elif self.converge_func_name == "weight_diff": if self.reveal_every_iter: self.is_converged = self.check_converge_by_weights( last_w=last_models.unboxed, new_w=self.model_weights.unboxed, suffix=(str(self.n_iter_), )) last_models = copy.deepcopy(self.model_weights) else: self.is_converged = self.check_converge_by_weights( last_w=(last_w_self, last_w_remote), new_w=(w_self, w_remote), suffix=(str(self.n_iter_), )) last_w_self, last_w_remote = copy.deepcopy( w_self), copy.deepcopy(w_remote) else: raise ValueError( f"Cannot recognize early_stop function: {self.converge_func_name}" ) LOGGER.info("iter: {}, is_converged: {}".format( self.n_iter_, self.is_converged)) self.callback_list.on_epoch_end(self.n_iter_) self.n_iter_ += 1 if self.stop_training: break if self.is_converged: break # Finally reconstruct if not self.reveal_every_iter: new_w = self.reveal_models(w_self, w_remote, suffix=("final", )) if new_w is not None: self.model_weights = LinearModelWeights( l=new_w, fit_intercept=self.model_param.init_param.fit_intercept ) LOGGER.debug(f"loss_history: {self.loss_history}") self.set_summary(self.get_model_summary()) def reveal_models(self, w_self, w_remote, suffix=None): if suffix is None: suffix = self.n_iter_ if self.model_param.reveal_strategy == "respectively": if self.role == consts.GUEST: new_w = w_self.get(tensor_name=f"wb_{suffix}", broadcast=False) w_remote.broadcast_reconstruct_share( 
tensor_name=f"wa_{suffix}") else: w_remote.broadcast_reconstruct_share( tensor_name=f"wb_{suffix}") new_w = w_self.get(tensor_name=f"wa_{suffix}", broadcast=False) elif self.model_param.reveal_strategy == "encrypted_reveal_in_host": if self.role == consts.GUEST: new_w = w_self.get(tensor_name=f"wb_{suffix}", broadcast=False) encrypted_w_remote = self.cipher.recursive_encrypt( self.fixedpoint_encoder.decode(w_remote.value)) encrypted_w_remote_tensor = fixedpoint_numpy.PaillierFixedPointTensor( value=encrypted_w_remote) encrypted_w_remote_tensor.broadcast_reconstruct_share( tensor_name=f"wa_{suffix}") else: w_remote.broadcast_reconstruct_share( tensor_name=f"wb_{suffix}") new_w = w_self.reconstruct(tensor_name=f"wa_{suffix}", broadcast=False) else: raise NotImplementedError( f"reveal strategy: {self.model_param.reveal_strategy} has not been implemented." ) return new_w def check_converge_by_loss(self, loss, suffix): if self.role == consts.GUEST: self.is_converged = self.converge_func.is_converge(loss) self.transfer_variable.is_converged.remote(self.is_converged, suffix=suffix) else: self.is_converged = self.transfer_variable.is_converged.get( idx=0, suffix=suffix) return self.is_converged def check_converge_by_weights(self, last_w, new_w, suffix): if self.reveal_every_iter: return self._reveal_every_iter_weights_check(last_w, new_w, suffix) else: return self._not_reveal_every_iter_weights_check( last_w, new_w, suffix) def _reveal_every_iter_weights_check(self, last_w, new_w, suffix): raise NotImplementedError() def _not_reveal_every_iter_weights_check(self, last_w, new_w, suffix): last_w_self, last_w_remote = last_w w_self, w_remote = new_w grad_self = w_self - last_w_self grad_remote = w_remote - last_w_remote if self.role == consts.GUEST: grad_encode = np.hstack((grad_remote.value, grad_self.value)) else: grad_encode = np.hstack((grad_self.value, grad_remote.value)) grad_encode = np.array([grad_encode]) grad_tensor_name = ".".join(("check_converge_grad", ) + suffix) grad_tensor = fixedpoint_numpy.FixedPointTensor( value=grad_encode, q_field=self.fixedpoint_encoder.n, endec=self.fixedpoint_encoder, tensor_name=grad_tensor_name) grad_tensor_transpose_name = ".".join( ("check_converge_grad_transpose", ) + suffix) grad_tensor_transpose = fixedpoint_numpy.FixedPointTensor( value=grad_encode.T, q_field=self.fixedpoint_encoder.n, endec=self.fixedpoint_encoder, tensor_name=grad_tensor_transpose_name) grad_norm_tensor_name = ".".join(("check_converge_grad_norm", ) + suffix) grad_norm = grad_tensor.dot(grad_tensor_transpose, target_name=grad_norm_tensor_name).get() weight_diff = np.sqrt(grad_norm[0][0]) LOGGER.info("iter: {}, weight_diff:{}, is_converged: {}".format( self.n_iter_, weight_diff, self.is_converged)) is_converge = False if weight_diff < self.model_param.tol: is_converge = True return is_converge def _get_meta(self): meta_protobuf_obj = lr_model_meta_pb2.LRModelMeta( penalty=self.model_param.penalty, tol=self.model_param.tol, alpha=self.alpha, optimizer=self.model_param.optimizer, batch_size=self.batch_size, learning_rate=self.model_param.learning_rate, max_iter=self.max_iter, early_stop=self.model_param.early_stop, fit_intercept=self.fit_intercept, need_one_vs_rest=self.need_one_vs_rest, reveal_strategy=self.model_param.reveal_strategy) return meta_protobuf_obj def get_single_model_param(self, model_weights=None, header=None): header = header if header else self.header result = { 'iters': self.n_iter_, 'loss_history': self.loss_history, 'is_converged': self.is_converged, # 'weight': 
weight_dict, 'intercept': self.model_weights.intercept_, 'header': header, 'best_iteration': -1 if self.validation_strategy is None else self.validation_strategy.best_iteration } if self.role == consts.GUEST or self.is_respectively_reveal: model_weights = model_weights if model_weights else self.model_weights weight_dict = {} for idx, header_name in enumerate(header): coef_i = model_weights.coef_[idx] weight_dict[header_name] = coef_i result['weight'] = weight_dict return result def get_model_summary(self): header = self.header if header is None: return {} weight_dict, intercept_ = self.get_weight_intercept_dict(header) best_iteration = -1 if self.validation_strategy is None else self.validation_strategy.best_iteration summary = { "coef": weight_dict, "intercept": intercept_, "is_converged": self.is_converged, "one_vs_rest": self.need_one_vs_rest, "best_iteration": best_iteration } if not self.is_respectively_reveal: del summary["intercept"] del summary["coef"] if self.validation_strategy: validation_summary = self.validation_strategy.summary() if validation_summary: summary["validation_metrics"] = validation_summary return summary def load_model(self, model_dict): LOGGER.debug("Start Loading model") result_obj = list(model_dict.get('model').values())[0].get( self.model_param_name) meta_obj = list(model_dict.get('model').values())[0].get( self.model_meta_name) if self.init_param_obj is None: self.init_param_obj = InitParam() self.init_param_obj.fit_intercept = meta_obj.fit_intercept self.model_param.reveal_strategy = meta_obj.reveal_strategy LOGGER.debug( f"reveal_strategy: {self.model_param.reveal_strategy}, {self.is_respectively_reveal}" ) self.header = list(result_obj.header) need_one_vs_rest = result_obj.need_one_vs_rest LOGGER.info( "in _load_model need_one_vs_rest: {}".format(need_one_vs_rest)) if need_one_vs_rest: one_vs_rest_result = result_obj.one_vs_rest_result self.one_vs_rest_obj = one_vs_rest_factory(classifier=self, role=self.role, mode=self.mode, has_arbiter=False) self.one_vs_rest_obj.load_model(one_vs_rest_result) self.need_one_vs_rest = True else: self.load_single_model(result_obj) self.need_one_vs_rest = False def load_single_model(self, single_model_obj): LOGGER.info("It's a binary task, start to load single model") if self.role == consts.GUEST or self.is_respectively_reveal: feature_shape = len(self.header) tmp_vars = np.zeros(feature_shape) weight_dict = dict(single_model_obj.weight) for idx, header_name in enumerate(self.header): tmp_vars[idx] = weight_dict.get(header_name) if self.fit_intercept: tmp_vars = np.append(tmp_vars, single_model_obj.intercept) self.model_weights = LinearModelWeights( tmp_vars, fit_intercept=self.fit_intercept) self.n_iter_ = single_model_obj.iters return self
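# share_model above splits the weight vector into two additive shares so that
# neither party ever holds w in the clear. A pure-numpy sketch of that idea
# with a small, hypothetical q_field (the real code routes one share through
# FixedPointTensor.from_source to the other party):
import numpy as np

q_field = 2 ** 61 - 1
rng = np.random.default_rng(0)

w = np.array([3, -1, 4], dtype=object)
share_a = rng.integers(0, q_field, size=w.shape).astype(object)   # looks random
share_b = (w - share_a) % q_field                                 # completes the sum

def reconstruct(a, b, q):
    v = (a + b) % q
    return np.where(v > q // 2, v - q, v)    # map residues back to signed values

assert list(reconstruct(share_a, share_b, q_field)) == [3, -1, 4]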
class HeteroBoosting(Boosting, ABC):

    def __init__(self):
        super(HeteroBoosting, self).__init__()
        self.encrypter = None
        self.encrypted_calculator = None
        self.early_stopping_rounds = None
        self.binning_class = QuantileBinning
        self.model_param = HeteroBoostingParam()
        self.transfer_variable = HeteroBoostingTransferVariable()
        self.mode = consts.HETERO

    def _init_model(self, param: HeteroBoostingParam):
        LOGGER.debug('in hetero boosting, objective param is {}'.format(
            param.objective_param.objective))
        super(HeteroBoosting, self)._init_model(param)
        self.encrypt_param = param.encrypt_param
        self.calculated_mode = param.encrypted_mode_calculator_param.mode
        self.re_encrypted_rate = param.encrypted_mode_calculator_param.re_encrypted_rate
        self.early_stopping_rounds = param.early_stopping_rounds
        self.use_first_metric_only = param.use_first_metric_only

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method.lower() == consts.PAILLIER.lower():
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("unknown encrypt type {}".format(
                self.encrypt_param.method.lower()))
        self.encrypted_calculator = EncryptModeCalculator(
            self.encrypter, self.calculated_mode, self.re_encrypted_rate)

    def check_label(self):
        LOGGER.info("check label")
        classes_ = []
        num_classes, booster_dim = 1, 1
        if self.task_type == consts.CLASSIFICATION:
            num_classes, classes_ = ClassifyLabelChecker.validate_label(self.data_bin)
            if num_classes > 2:
                booster_dim = num_classes
            range_from_zero = True
            for _class in classes_:
                try:
                    if 0 <= _class < len(classes_) and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except BaseException:
                    range_from_zero = False
            classes_ = sorted(classes_)
            if not range_from_zero:
                class_mapping = dict(zip(classes_, range(num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])
        else:
            RegressionLabelChecker.validate_label(self.data_bin)
        return classes_, num_classes, booster_dim
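# check_label above remaps arbitrary integer labels onto 0..num_classes-1 so
# the booster can index its output dimensions directly. The mapping itself,
# stripped of the distributed mapValues call:
classes_ = sorted({3, 7, 9})
class_mapping = dict(zip(classes_, range(len(classes_))))
assert [class_mapping[c] for c in (3, 7, 9)] == [0, 1, 2]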
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning): def __init__(self, params: FeatureBinningParam): super(HeteroFeatureBinningGuest, self).__init__(params) self.encryptor = PaillierEncrypt() self.encryptor.generate_key() self.local_transform_result = None self.party_name = consts.GUEST self._init_binning_obj() def fit(self, data_instances): """ Apply binning method for both data instances in local party as well as the other one. Afterwards, calculate the specific metric value for specific columns. """ self._abnormal_detection(data_instances) self._parse_cols(data_instances) # 1. Synchronize encryption information self.__synchronize_encryption() # 2. Prepare labels data_instances = data_instances.mapValues(self.load_data) self.set_schema(data_instances) label_table = data_instances.mapValues(lambda x: x.label) # 3. Transfer encrypted label f = functools.partial(self.encrypt, encryptor=self.encryptor) encrypted_label_table = label_table.mapValues(f) encrypted_label_table_id = self.transfer_variable.generate_transferid( self.transfer_variable.encrypted_label) federation.remote(encrypted_label_table, name=self.transfer_variable.encrypted_label.name, tag=encrypted_label_table_id, role=consts.HOST, idx=0) LOGGER.info("Sent encrypted_label_table to host") # 4. Calculates self's binning. In case the other party need time to compute its data, # do binning calculation at this point. data_instances = self.fit_local(data_instances, label_table) # 5. Received host result and calculate iv value encrypted_bin_sum_id = self.transfer_variable.generate_transferid( self.transfer_variable.encrypted_bin_sum) encrypted_bin_sum = federation.get( name=self.transfer_variable.encrypted_bin_sum.name, tag=encrypted_bin_sum_id, idx=0) LOGGER.info("Get encrypted_bin_sum from host") result_counts = self.__decrypt_bin_sum(encrypted_bin_sum) host_iv_attrs = self.binning_obj.cal_iv_woe( result_counts, self.bin_param.adjustment_factor) # Support one host only in this version. Multiple host will be supported in the future. self.host_results[consts.HOST] = host_iv_attrs for cols_name, iv_attr in host_iv_attrs.items(): display_result = iv_attr.display_result( self.bin_param.display_result) LOGGER.info( "[Result][FeatureBinning][Host] feature {} 's result is : {}". format(cols_name, display_result)) self.set_schema(data_instances) return data_instances def transform(self, data_instances): self._abnormal_detection(data_instances) self._parse_cols(data_instances) # 1. Synchronize encryption information self.__synchronize_encryption() # 2. Prepare labels data_instances = data_instances.mapValues(self.load_data) label_table = data_instances.mapValues(lambda x: x.label) self.set_schema(data_instances) # 3. Transfer encrypted label f = functools.partial(self.encrypt, encryptor=self.encryptor) encrypted_label_table = label_table.mapValues(f) encrypted_label_table_id = self.transfer_variable.generate_transferid( self.transfer_variable.encrypted_label) federation.remote(encrypted_label_table, name=self.transfer_variable.encrypted_label.name, tag=encrypted_label_table_id, role=consts.HOST, idx=0) LOGGER.info("Sent encrypted_label_table to host for transform") # 4. Transform locally self.transform_local(data_instances, label_table=label_table, save_result=False) # 5. 
Received host result and calculate iv value encrypted_bin_sum_id = self.transfer_variable.generate_transferid( self.transfer_variable.encrypted_bin_sum) encrypted_bin_sum = federation.get( name=self.transfer_variable.encrypted_bin_sum.name, tag=encrypted_bin_sum_id, idx=0) result_counts = self.__decrypt_bin_sum(encrypted_bin_sum) host_iv_attrs = self.binning_obj.cal_iv_woe( result_counts, self.bin_param.adjustment_factor) # host_results = {'host1': host_iv_attrs} # self.save_model(name=self.bin_param.transform_table, # namespace=self.bin_param.result_namespace, # binning_result=self.local_transform_result, # host_results=host_results) for col_name, iv_attr in host_iv_attrs.items(): LOGGER.info("The remote feature {} 's iv is {}".format( col_name, iv_attr.iv)) self.set_schema(data_instances) return data_instances @staticmethod def encrypt(x, encryptor): return encryptor.encrypt(x), encryptor.encrypt(1 - x) def transform_local(self, data_instances, label_table=None, save_result=True): self._abnormal_detection(data_instances) self._parse_cols(data_instances) split_points = {} for col_name, iv_attr in self.binning_result.items(): split_points[col_name] = iv_attr.split_points self.local_transform_result = self.binning_obj.cal_local_iv( data_instances, split_points=split_points, label_table=label_table) if save_result: self.save_model(name=self.bin_param.transform_table, namespace=self.bin_param.result_namespace, binning_result=self.local_transform_result, host_results={}) for col_name, col_index in self.local_transform_result.items(): LOGGER.info("The local feature {} 's iv is {}".format( col_name, self.local_transform_result[col_name].iv)) self.set_schema(data_instances) return data_instances def __synchronize_encryption(self): pub_key = self.encryptor.get_public_key() pubkey_id = self.transfer_variable.generate_transferid( self.transfer_variable.paillier_pubkey) federation.remote(pub_key, name=self.transfer_variable.paillier_pubkey.name, tag=pubkey_id, role=consts.HOST, idx=0) LOGGER.info("send pubkey to host") self.has_synchronized = True def __decrypt_bin_sum(self, encrypted_bin_sum): # for feature_sum in encrypted_bin_sum: for col_name, count_list in encrypted_bin_sum.items(): new_list = [] for encrypted_event, encrypted_non_event in count_list: event_count = self.encryptor.decrypt(encrypted_event) non_event_count = self.encryptor.decrypt(encrypted_non_event) new_list.append((event_count, non_event_count)) encrypted_bin_sum[col_name] = new_list return encrypted_bin_sum def fit_local(self, data_instances, label_table=None): self._abnormal_detection(data_instances) self._parse_cols(data_instances) iv_attrs = self.binning_obj.cal_local_iv(data_instances, label_table=label_table) for col_name, iv_attr in iv_attrs.items(): display_result = iv_attr.display_result( self.bin_param.display_result) LOGGER.info( "[Result][FeatureBinning][Guest] feature {} 's result is : {}". format(col_name, display_result)) # LOGGER.info("[Result][FeatureBinning]The feature {} 's iv is {}".format(col_name, iv_attrs[col_name].iv)) self.binning_result = iv_attrs self.set_schema(data_instances) return data_instances @staticmethod def load_data(data_instance): # Here suppose this is a binary question and the event label is 1 if data_instance.label != 1: data_instance.label = 0 return data_instance
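# The decrypted (event_count, non_event_count) pairs above feed cal_iv_woe.
# A minimal local version of that computation, with the adjustment factor
# guarding empty bins; formulas follow the standard WOE/IV definitions and
# are an assumption, not a copy of the binning_obj internals.
import math

def iv_from_counts(bins, event_total, non_event_total, adjustment_factor=0.5):
    iv = 0.0
    for event, non_event in bins:
        if event == 0 or non_event == 0:
            # Shift empty cells so the log stays finite.
            event += adjustment_factor
            non_event += adjustment_factor
        event_rate = event / event_total
        non_event_rate = non_event / non_event_total
        woe = math.log(event_rate / non_event_rate)
        iv += (event_rate - non_event_rate) * woe
    return iv

print(iv_from_counts([(30, 10), (20, 40)], event_total=50, non_event_total=50))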
class HeteroSecureBoostingTreeGuest(BoostingTree):

    def __init__(self, secureboost_tree_param):
        super(HeteroSecureBoostingTreeGuest, self).__init__(secureboost_tree_param)
        self.convegence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.init_score = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        self.flowid = 0
        self.tree_dim = 1
        self.tree_meta = None
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None
        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        else:
            raise NotImplementedError("objective %s not supported yet" % loss_type)

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            Quantile.convert_feature_to_bin(
                data_instance, self.quantile_method, self.bin_num,
                self.bin_gap, self.bin_sample_num)
        LOGGER.info("convert feature to bins over")

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_flowid(self, flowid=0):
        LOGGER.info("set flowid, flowid is {}".format(flowid))
        self.flowid = flowid

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid")
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == consts.CLASSIFICATION:
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes
            range_from_zero = True
            for _class in self.classes_:
                try:
                    if 0 <= _class < len(self.classes_) and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except BaseException:
                    range_from_zero = False
            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])
        else:
            RegressionLabelChecker.validate_y(self.y)
        self.set_loss(self.objective_param)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == consts.PAILLIER:
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet!")

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_f_value(self,
new_f=None, tidx=-1): LOGGER.info("update tree f value, tree idx is {}".format(tidx)) if self.F is None: if self.tree_dim > 1: self.F, self.init_score = self.loss.initialize( self.y, self.tree_dim) else: LOGGER.info("tree_dim is %d" % (self.tree_dim)) self.F, self.init_score = self.loss.initialize(self.y) else: accumuldate_f = functools.partial(self.accumulate_f, lr=self.learning_rate, idx=tidx) self.F = self.F.join(new_f, accumuldate_f) def compute_grad_and_hess(self): LOGGER.info("compute grad and hess") loss_method = self.loss if self.task_type == consts.CLASSIFICATION: self.grad_and_hess = self.y.join(self.F, lambda y, f_val: \ (loss_method.compute_grad(y, loss_method.predict(f_val)), \ loss_method.compute_hess(y, loss_method.predict(f_val)))) else: self.grad_and_hess = self.y.join( self.F, lambda y, f_val: (loss_method.compute_grad(y, f_val), loss_method.compute_hess(y, f_val))) def compute_loss(self): LOGGER.info("compute loss") if self.task_type == consts.CLASSIFICATION: loss_method = self.loss y_predict = self.F.mapValues(lambda val: loss_method.predict(val)) loss = loss_method.compute_loss(self.y, y_predict) elif self.task_type == consts.REGRESSION: if self.objective_param.objective in [ "lse", "lae", "logcosh", "tweedie", "log_cosh", "huber" ]: loss_method = self.loss loss = loss_method.compute_loss(self.y, self.F) else: loss_method = self.loss y_predict = self.F.mapValues( lambda val: loss_method.predict(val)) loss = loss_method.compute_loss(self.y, y_predict) return loss def get_grad_and_hess(self, tree_idx): LOGGER.info("get grad and hess of tree {}".format(tree_idx)) grad_and_hess_subtree = self.grad_and_hess.mapValues( lambda grad_and_hess: (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx])) return grad_and_hess_subtree def check_convergence(self, loss): LOGGER.info("check convergence") if self.convegence is None: self.convegence = DiffConverge() return self.convegence.is_converge(loss) def sample_valid_features(self): LOGGER.info("sample valid features") if self.feature_num is None: self.feature_num = self.bin_split_points.shape[0] choose_feature = random.choice(range(0, self.feature_num), \ max(1, int(self.subsample_feature_rate * self.feature_num)), replace=False) valid_features = [False for i in range(self.feature_num)] for fid in choose_feature: valid_features[fid] = True return valid_features def sync_tree_dim(self): LOGGER.info("sync tree dim to host") federation.remote(obj=self.tree_dim, name=self.transfer_inst.tree_dim.name, tag=self.transfer_inst.generate_transferid( self.transfer_inst.tree_dim), role=consts.HOST, idx=0) def sync_stop_flag(self, stop_flag, num_round): LOGGER.info( "sync stop flag to host, boosting round is {}".format(num_round)) federation.remote(obj=stop_flag, name=self.transfer_inst.stop_flag.name, tag=self.transfer_inst.generate_transferid( self.transfer_inst.stop_flag, num_round), role=consts.HOST, idx=0) def fit(self, data_inst): LOGGER.info("begin to train secureboosting guest model") data_inst = self.data_alignment(data_inst) self.convert_feature_to_bin(data_inst) self.set_y() self.update_f_value() self.generate_encrypter() self.sync_tree_dim() for i in range(self.num_trees): # n_tree = [] self.compute_grad_and_hess() for tidx in range(self.tree_dim): tree_inst = HeteroDecisionTreeGuest(self.tree_param) tree_inst.set_inputinfo(self.data_bin, self.get_grad_and_hess(tidx), self.bin_split_points, self.bin_sparse_points) valid_features = self.sample_valid_features() tree_inst.set_valid_features(valid_features) 
tree_inst.set_encrypter(self.encrypter) tree_inst.set_flowid(self.generate_flowid(i, tidx)) tree_inst.fit() tree_meta, tree_param = tree_inst.get_model() self.trees_.append(tree_param) if self.tree_meta is None: self.tree_meta = tree_meta # n_tree.append(tree_inst.get_tree_model()) self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx) # self.trees_.append(n_tree) loss = self.compute_loss() self.history_loss.append(loss) LOGGER.info("round {} loss is {}".format(i, loss)) if self.n_iter_no_change is True: if self.check_convergence(loss): self.sync_stop_flag(True, i) break else: self.sync_stop_flag(False, i) LOGGER.info("end to train secureboosting guest model") def predict_f_value(self, data_inst): LOGGER.info("predict tree f value, there are {} trees".format( len(self.trees_))) tree_dim = self.tree_dim init_score = self.init_score self.F = data_inst.mapValues(lambda v: init_score) rounds = len(self.trees_) // self.tree_dim for i in range(rounds): for tidx in range(self.tree_dim): tree_inst = HeteroDecisionTreeGuest(self.tree_param) tree_inst.load_model(self.tree_meta, self.trees_[i * self.tree_dim + tidx]) # tree_inst.set_tree_model(self.trees_[i * self.tree_dim + tidx]) tree_inst.set_flowid(self.generate_flowid(i, tidx)) predict_data = tree_inst.predict(data_inst) self.update_f_value(new_f=predict_data, tidx=tidx) def predict(self, data_inst, predict_param): LOGGER.info("start predict") data_inst = self.data_alignment(data_inst) self.predict_f_value(data_inst) if self.task_type == consts.CLASSIFICATION: loss_method = self.loss predicts = self.F.mapValues(lambda f: loss_method.predict(f)) elif self.task_type == consts.REGRESSION: if self.objective_param.objective in [ "lse", "lae", "huber", "log_cosh", "fair", "tweedie" ]: predicts = self.F else: raise NotImplementedError( "objective {} not supprted yet".format( self.objective_param.objective)) if self.task_type == consts.CLASSIFICATION: classes_ = self.classes_ if self.num_classes == 2: predict_label = predicts.mapValues(lambda pred: classes_[ 1] if pred > predict_param.threshold else classes_[0]) else: predict_label = predicts.mapValues( lambda preds: classes_[np.argmax(preds)]) if predict_param.with_proba: predict_result = data_inst.join( predicts, lambda inst, predict_prob: (inst.label, predict_prob)) else: predict_result = data_inst.mapValues(lambda inst: (inst.label, None)) predict_result = predict_result.join( predict_label, lambda label_prob, predict_label: (label_prob[0], label_prob[1], predict_label)) elif self.task_type == consts.REGRESSION: predict_result = data_inst.join( predicts, lambda inst, pred: (inst.label, pred, None)) else: raise NotImplementedError("task type {} not supported yet".format( self.task_type)) LOGGER.info("end predict") return predict_result def get_model_meta(self): model_meta = BoostingTreeModelMeta() model_meta.tree_meta.CopyFrom(self.tree_meta) model_meta.learning_rate = self.learning_rate model_meta.num_trees = self.num_trees model_meta.quantile_meta.CopyFrom( QuantileMeta(quantile_method=self.quantile_method, bin_num=self.bin_num, bin_gap=self.bin_gap, bin_sample_num=self.bin_sample_num)) #modelmeta.objective.CopyFrom(ObjectiveParamMeta(objective=self.objective_param.objective, param=self.objective_param.params)) model_meta.objective_meta.CopyFrom( ObjectiveMeta(objective=self.objective_param.objective, param=self.objective_param.params)) model_meta.task_type = self.task_type model_meta.tree_dim = self.tree_dim model_meta.n_iter_no_change = self.n_iter_no_change model_meta.tol = self.tol 
model_meta.num_classes = self.num_classes model_meta.classes_.extend(map(str, self.classes_)) meta_name = "HeteroSecureBoostingTreeGuest.meta" return meta_name, model_meta def set_model_meta(self, model_meta): self.tree_meta = model_meta.tree_meta self.learning_rate = model_meta.learning_rate self.num_trees = model_meta.num_trees self.quantile_method = model_meta.quantile_meta.quantile_method self.bin_num = model_meta.quantile_meta.bin_num self.bin_gap = model_meta.quantile_meta.bin_gap self.bin_sample_num = model_meta.quantile_meta.bin_sample_num self.objective_param.objective = model_meta.objective_meta.objective self.objective_param.params = list(model_meta.objective_meta.param) self.task_type = model_meta.task_type self.tree_dim = model_meta.tree_dim self.num_classes = model_meta.num_classes self.n_iter_no_change = model_meta.n_iter_no_change self.tol = model_meta.tol self.classes_ = list(model_meta.classes_) self.set_loss(self.objective_param) def get_model_param(self): model_param = BoostingTreeModelParam() model_param.tree_num = len(list(self.trees_)) model_param.trees_.extend(self.trees_) model_param.init_score.extend(self.init_score) model_param.losses.extend(self.history_loss) param_name = "HeteroSecureBoostingTreeGuest.param" return param_name, model_param def set_model_param(self, model_param): self.trees_ = list(model_param.trees_) self.init_score = np.array(list(model_param.init_score)) self.history_loss = list(model_param.losses) def save_model(self, model_table, model_namespace): LOGGER.info("save model") meta_name, meta_protobuf = self.get_model_meta() param_name, param_protobuf = self.get_model_param() manager.save_model(buffer_type=meta_name, proto_buffer=meta_protobuf, name=model_table, namespace=model_namespace) manager.save_model(buffer_type=param_name, proto_buffer=param_protobuf, name=model_table, namespace=model_namespace) return [(meta_name, param_name)] def load_model(self, model_table, model_namespace): LOGGER.info("load model") model_meta = BoostingTreeModelMeta() manager.read_model(buffer_type="HeteroSecureBoostingTreeGuest.meta", proto_buffer=model_meta, name=model_table, namespace=model_namespace) self.set_model_meta(model_meta) model_param = BoostingTreeModelParam() manager.read_model(buffer_type="HeteroSecureBoostingTreeGuest.param", proto_buffer=model_param, name=model_table, namespace=model_namespace) self.set_model_param(model_param) def evaluate(self, labels, pred_prob, pred_labels, evaluate_param): LOGGER.info("evaluate data") predict_res = None if self.task_type == consts.CLASSIFICATION: if evaluate_param.classi_type == consts.BINARY: predict_res = pred_prob elif evaluate_param.classi_type == consts.MULTY: predict_res = pred_labels else: LOGGER.warning( "unknown classification type, return None as evaluation results" ) elif self.task_type == consts.REGRESSION: predict_res = pred_prob else: LOGGER.warning( "unknown task type, return None as evaluation results") eva = Evaluation(evaluate_param.classi_type) return eva.report(labels, predict_res, evaluate_param.metrics, evaluate_param.thresholds, evaluate_param.pos_label)
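# compute_grad_and_hess above joins each label with its current score F and
# asks the loss object for (g, h). For SigmoidBinaryCrossEntropyLoss these
# are the textbook values g = p - y and h = p * (1 - p) with p = sigmoid(F);
# a minimal numpy check of that identity (toy inputs, not FATE tables):
import numpy as np

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

y = np.array([1.0, 0.0, 1.0])
f_scores = np.array([0.3, -0.2, 1.5])
p = sigmoid(f_scores)
grad = p - y                # pulls the score toward the label
hess = p * (1.0 - p)        # curvature of the logistic loss
print(grad, hess)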
        plain_list.append(int(s.sum_hess * 10 ** decimal_to_keep))
        s.sum_grad = en.encode_and_encrypt(s.sum_grad)
        s.sum_hess = en.encode_and_encrypt(s.sum_hess)


def test_padding_num(plain_list, padding_num):
    rs_num = plain_list[0]
    for i in plain_list[1:]:
        rs_num = rs_num * padding_num + i
    return rs_num


plain_list = []
decimal_to_keep = 7
key_length = 1024
en = Encrypt()
en.generate_key(key_length)
encoder = GuestGradHessEncoder(en, None)
compressor = HostSplitInfoCompressor(key_length, consts.ITERATIVEAFFINE)
decompressor = GuestSplitInfoDecompressor(en)
compressor.renew_compressor([100000], {0: 0})
decompressor.renew_decompressor({0: 0})
gen_split_info = random_split_info_generate(num=10)
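# test_padding_num above packs several bounded integers into one big integer
# by repeated multiply-and-add, which is what lets one ciphertext carry a
# whole list of split-info statistics. The packing and its inverse in plain
# Python (padding_num must strictly exceed every packed value):
def pack(values, padding_num):
    packed = values[0]
    for v in values[1:]:
        packed = packed * padding_num + v
    return packed

def unpack(packed, padding_num, count):
    out = []
    for _ in range(count):
        packed, v = divmod(packed, padding_num)
        out.append(v)
    return list(reversed(out))

vals = [123, 4567, 89]
assert unpack(pack(vals, 10 ** 7), 10 ** 7, len(vals)) == vals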
class HeteroBoostingGuest(HeteroBoosting, ABC): def __init__(self): super(HeteroBoostingGuest, self).__init__() def _init_model(self, param): super(HeteroBoostingGuest, self)._init_model(param) def generate_encrypter(self): LOGGER.info("generate encrypter") if self.encrypt_param.method.lower() == consts.PAILLIER.lower(): self.encrypter = PaillierEncrypt() self.encrypter.generate_key(self.encrypt_param.key_length) else: raise NotImplementedError("unknown encrypt type {}".format( self.encrypt_param.method.lower())) def check_label(self): LOGGER.info("check label") classes_ = [] num_classes, booster_dim = 1, 1 if self.task_type == consts.CLASSIFICATION: num_classes, classes_ = ClassifyLabelChecker.validate_label( self.data_bin) if num_classes > 2: booster_dim = num_classes range_from_zero = True for _class in classes_: try: if 0 <= _class < len(classes_) and isinstance(_class, int): continue else: range_from_zero = False break except BaseException: range_from_zero = False classes_ = sorted(classes_) if not range_from_zero: class_mapping = dict(zip(classes_, range(num_classes))) self.y = self.y.mapValues(lambda _class: class_mapping[_class]) else: RegressionLabelChecker.validate_label(self.data_bin) return classes_, num_classes, booster_dim def sync_booster_dim(self): LOGGER.info("sync booster_dim to host") self.transfer_variable.booster_dim.remote(self.booster_dim, role=consts.HOST, idx=-1) def sync_stop_flag(self, stop_flag, num_round): LOGGER.info("sync stop flag to host, boosting_core round is {}".format( num_round)) self.transfer_variable.stop_flag.remote(stop_flag, role=consts.HOST, idx=-1, suffix=(num_round, )) def sync_predict_round( self, predict_round, ): LOGGER.info("sync predict start round {}".format(predict_round)) self.transfer_variable.predict_start_round.remote( predict_round, role=consts.HOST, idx=-1, ) def prepare_warm_start(self, data_inst, classes): # adjust parameter for warm start warm_start_y_hat = self.predict(data_inst, ret_format='raw') self.y_hat = warm_start_y_hat self.start_round = len(self.boosting_model_list) // self.booster_dim self.boosting_round += self.start_round # check classes assert set(classes).issubset(set(self.classes_)), 'warm start label alignment failed: cur labels {},' \ 'previous model labels {}'.format(classes, self.classes_) # check fid self.feat_name_check(data_inst, self.feature_name_fid_mapping) self.callback_warm_start_init_iter(self.start_round) def fit(self, data_inst, validate_data=None): LOGGER.info('begin to fit a hetero boosting model, model is {}'.format( self.model_name)) self.start_round = 0 self.on_training = True self.data_inst = data_inst self.data_bin, self.bin_split_points, self.bin_sparse_points = self.prepare_data( data_inst) self.y = self.get_label(self.data_bin) if not self.is_warm_start: self.feature_name_fid_mapping = self.gen_feature_fid_mapping( data_inst.schema) self.classes_, self.num_classes, self.booster_dim = self.check_label( ) self.loss = self.get_loss_function() self.y_hat, self.init_score = self.get_init_score( self.y, self.num_classes) else: classes_, num_classes, booster_dim = self.check_label() self.prepare_warm_start(data_inst, classes_) LOGGER.info('class index is {}'.format(self.classes_)) self.sync_booster_dim() self.generate_encrypter() self.callback_list.on_train_begin(data_inst, validate_data) self.callback_meta( "loss", "train", MetricMeta(name="train", metric_type="LOSS", extra_metas={"unit_name": "iters"})) self.preprocess() for epoch_idx in range(self.start_round, self.boosting_round): 
LOGGER.info('cur epoch idx is {}'.format(epoch_idx)) self.callback_list.on_epoch_begin(epoch_idx) for class_idx in range(self.booster_dim): # fit a booster model = self.fit_a_learner(epoch_idx, class_idx) booster_meta, booster_param = model.get_model() if booster_meta is not None and booster_param is not None: self.booster_meta = booster_meta self.boosting_model_list.append(booster_param) # update predict score cur_sample_weights = model.get_sample_weights() self.y_hat = self.get_new_predict_score(self.y_hat, cur_sample_weights, dim=class_idx) # compute loss loss = self.compute_loss(self.y_hat, self.y) self.history_loss.append(loss) LOGGER.info("round {} loss is {}".format(epoch_idx, loss)) self.callback_metric("loss", "train", [Metric(epoch_idx, loss)]) # check validation validation_strategy = self.callback_list.get_validation_strategy() if validation_strategy: validation_strategy.set_precomputed_train_scores( self.score_to_predict_result(data_inst, self.y_hat)) self.callback_list.on_epoch_end(epoch_idx) should_stop = False if self.n_iter_no_change and self.check_convergence(loss): should_stop = True self.is_converged = True self.sync_stop_flag(self.is_converged, epoch_idx) if self.stop_training or should_stop: break self.postprocess() self.callback_list.on_train_end() self.callback_meta( "loss", "train", MetricMeta(name="train", metric_type="LOSS", extra_metas={"Best": min(self.history_loss)})) # get summary self.set_summary(self.generate_summary()) @assert_io_num_rows_equal def predict(self, data_inst): # predict is implemented in hetero_secureboost raise NotImplementedError('predict func is not implemented') @abc.abstractmethod def fit_a_learner(self, epoch_idx: int, booster_dim: int): raise NotImplementedError() @abc.abstractmethod def load_learner(self, model_meta, model_param, epoch_idx, booster_idx): raise NotImplementedError() @abc.abstractmethod def get_model_meta(self): raise NotImplementedError() @abc.abstractmethod def get_model_param(self): raise NotImplementedError() @abc.abstractmethod def set_model_meta(self, model_meta): raise NotImplementedError() @abc.abstractmethod def set_model_param(self, model_param): raise NotImplementedError()
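# get_new_predict_score in the training loop above is the usual additive
# update of gradient boosting: the class_idx column of y_hat moves by the
# learner's shrunken predictions. A toy single-machine sketch of that
# accumulation (names here are illustrative, not the Boosting API):
import numpy as np

def update_column(y_hat, scores, dim, lr):
    y_hat = y_hat.copy()
    y_hat[:, dim] += lr * scores
    return y_hat

y_hat = np.zeros((5, 3))                       # samples x booster_dim
leaf_scores = np.arange(5, dtype=float)        # one learner's outputs
y_hat = update_column(y_hat, leaf_scores, dim=1, lr=0.1)
print(y_hat[:, 1])                             # [0.  0.1 0.2 0.3 0.4]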
    def fit(self, data_instances):
        """
        Apply binning method for both data instances in local party as well as the other one.
        Afterwards, calculate the specific metric value for specific columns.
        Currently, IV is supported for binary-labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        # self._parse_cols(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)
        self.binning_obj.fit_split_points(data_instances)
        if self.model_param.skip_static:
            self.transform(data_instances)
            return self.data_output

        label_counts = data_overview.get_label_count(data_instances)
        if len(label_counts) > 2:
            raise ValueError("IV calculation supports binary data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances,
                                          label_table=label_table,
                                          label_counts=label_counts)
            self.transform(data_instances)
            self.set_summary(self.binning_obj.bin_results.summary())
            return self.data_output

        if self.model_param.encrypt_param.method == consts.PAILLIER:
            cipher = PaillierEncrypt()
            cipher.generate_key(self.model_param.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")

        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances,
                                      label_table=label_table,
                                      label_counts=label_counts)

        encrypted_bin_sum_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
        encrypted_bin_infos = self.transfer_variable.optimal_info.get(idx=-1)
        total_summary = self.binning_obj.bin_results.summary()
        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
            host_party_id = self.component_properties.host_party_idlist[host_idx]
            encrypted_bin_sum = encrypted_bin_sum_infos[host_idx]
            result_counts = self.cipher_decompress(encrypted_bin_sum, cipher)
            host_bin_methods = encrypted_bin_info['bin_method']
            category_names = encrypted_bin_info['category_names']
            if host_bin_methods == consts.OPTIMAL:
                optimal_binning_params = encrypted_bin_info['optimal_params']
                host_model_params = copy.deepcopy(self.model_param)
                host_model_params.bin_num = optimal_binning_params.get('bin_num')
                host_model_params.optimal_binning_param.metric_method = \
                    optimal_binning_params.get('metric_method')
                host_model_params.optimal_binning_param.mixture = \
                    optimal_binning_params.get('mixture')
                host_model_params.optimal_binning_param.max_bin_pct = \
                    optimal_binning_params.get('max_bin_pct')
                host_model_params.optimal_binning_param.min_bin_pct = \
                    optimal_binning_params.get('min_bin_pct')
                self.binning_obj.event_total, self.binning_obj.non_event_total = \
                    self.get_histogram(data_instances)
                result_counts = dict(result_counts.collect())
                optimal_binning_cols = {x: y for x, y in result_counts.items()
                                        if x not in category_names}
                host_binning_obj = self.optimal_binning_sync(optimal_binning_cols,
                                                             data_instances.count(),
                                                             data_instances.partitions,
                                                             host_idx,
                                                             host_model_params)
                category_bins = {x: y for x, y in result_counts.items() if x in category_names}
                host_binning_obj.cal_iv_woe(category_bins,
                                            self.model_param.adjustment_factor)
            else:
                host_binning_obj = BaseBinning()
                host_binning_obj.cal_iv_woe(result_counts, self.model_param.adjustment_factor)
            host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
            total_summary = self._merge_summary(total_summary,
                                                host_binning_obj.bin_results.summary())
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        self.set_summary(total_summary)
        return self.data_output
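# After decompression the guest partitions the host's per-column counts into
# optimal-binning candidates and categorical columns, exactly as the two dict
# comprehensions above do. A small local illustration with hypothetical
# column names:
result_counts = {"x0": [(3, 1)], "x1": [(2, 2)], "cat_gender": [(5, 4)]}
category_names = {"cat_gender"}

optimal_binning_cols = {x: y for x, y in result_counts.items() if x not in category_names}
category_bins = {x: y for x, y in result_counts.items() if x in category_names}
assert set(optimal_binning_cols) == {"x0", "x1"}
assert set(category_bins) == {"cat_gender"}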
class HeteroSSHEBase(BaseLinearModel, ABC): def __init__(self): super().__init__() self.mode = consts.HETERO self.cipher = None self.q_field = None self.model_param = None # self.labels = None self.weight = None self.batch_generator = None self.batch_num = [] self.secure_matrix_obj: SecureMatrix # self._set_parties() self.parties = None self.local_party = None self.other_party = None self.label_type = None def _transfer_q_field(self): raise NotImplementedError(f"Should not be called here") def _init_model(self, params): super()._init_model(params) self.cipher = PaillierEncrypt() self.cipher.generate_key(self.model_param.encrypt_param.key_length) self.transfer_variable = SSHEModelTransferVariable() self.converge_func_name = params.early_stop self.reveal_every_iter = params.reveal_every_iter self.q_field = self._transfer_q_field() LOGGER.debug(f"q_field: {self.q_field}") if not self.reveal_every_iter: self.self_optimizer = copy.deepcopy(self.optimizer) self.remote_optimizer = copy.deepcopy(self.optimizer) self.fixedpoint_encoder = FixedPointEndec(n=self.q_field) self.converge_transfer_variable = ConvergeCheckerTransferVariable() self.secure_matrix_obj = SecureMatrix(party=self.local_party, q_field=self.q_field, other_party=self.other_party) def _init_weights(self, model_shape): return self.initializer.init_model(model_shape, init_params=self.init_param_obj) @property def is_respectively_reveal(self): return self.model_param.reveal_strategy == "respectively" def _cal_z_in_share(self, w_self, w_remote, features, suffix, cipher): raise NotImplementedError("Should not be called here") def share_model(self, w, suffix): raise NotImplementedError("Should not be called here") def forward(self, weights, features, labels, suffix, cipher, batch_weight): raise NotImplementedError("Should not be called here") def backward(self, error, features, suffix, cipher): raise NotImplementedError("Should not be called here") def compute_loss(self, weights, labels, suffix, cipher): raise NotImplementedError("Should not be called here") def reveal_models(self, w_self, w_remote, suffix=None): raise NotImplementedError(f"Should not be called here") def check_converge_by_loss(self, loss, suffix): raise NotImplementedError(f"Should not be called here") def check_converge_by_weights(self, last_w, new_w, suffix): if self.reveal_every_iter: return self._reveal_every_iter_weights_check(last_w, new_w, suffix) else: return self._not_reveal_every_iter_weights_check(last_w, new_w, suffix) def _reveal_every_iter_weights_check(self, last_w, new_w, suffix): raise NotImplementedError("Should not be called here") def _not_reveal_every_iter_weights_check(self, last_w, new_w, suffix): last_w_self, last_w_remote = last_w w_self, w_remote = new_w grad_self = w_self - last_w_self grad_remote = w_remote - last_w_remote if self.role == consts.GUEST: grad_encode = np.hstack((grad_remote.value, grad_self.value)) else: grad_encode = np.hstack((grad_self.value, grad_remote.value)) grad_encode = np.array([grad_encode]) grad_tensor_name = ".".join(("check_converge_grad",) + suffix) grad_tensor = fixedpoint_numpy.FixedPointTensor(value=grad_encode, q_field=self.fixedpoint_encoder.n, endec=self.fixedpoint_encoder, tensor_name=grad_tensor_name) grad_tensor_transpose_name = ".".join(("check_converge_grad_transpose",) + suffix) grad_tensor_transpose = fixedpoint_numpy.FixedPointTensor(value=grad_encode.T, q_field=self.fixedpoint_encoder.n, endec=self.fixedpoint_encoder, tensor_name=grad_tensor_transpose_name) grad_norm_tensor_name = 
".".join(("check_converge_grad_norm",) + suffix) grad_norm = grad_tensor.dot(grad_tensor_transpose, target_name=grad_norm_tensor_name).get() weight_diff = np.sqrt(grad_norm[0][0]) LOGGER.info("iter: {}, weight_diff:{}, is_converged: {}".format(self.n_iter_, weight_diff, self.is_converged)) is_converge = False if weight_diff < self.model_param.tol: is_converge = True return is_converge def get_single_model_weight_dict(self, model_weights=None, header=None): header = header if header else self.header model_weights = model_weights if model_weights else self.model_weights weight_dict = {} for idx, header_name in enumerate(header): coef_i = model_weights.coef_[idx] weight_dict[header_name] = coef_i return weight_dict def get_single_model_param(self, model_weights=None, header=None): header = header if header else self.header result = {'iters': self.n_iter_, 'loss_history': self.loss_history, 'is_converged': self.is_converged, 'intercept': self.model_weights.intercept_, 'header': header, 'best_iteration': -1 if self.validation_strategy is None else self.validation_strategy.best_iteration } return result def load_model(self, model_dict): LOGGER.debug("Start Loading model") result_obj = list(model_dict.get('model').values())[0].get(self.model_param_name) meta_obj = list(model_dict.get('model').values())[0].get(self.model_meta_name) if self.init_param_obj is None: self.init_param_obj = InitParam() self.init_param_obj.fit_intercept = meta_obj.fit_intercept self.model_param.reveal_strategy = meta_obj.reveal_strategy LOGGER.debug(f"reveal_strategy: {self.model_param.reveal_strategy}, {self.is_respectively_reveal}") self.header = list(result_obj.header) return result_obj, meta_obj def load_single_model(self, single_model_obj): raise NotImplementedError(f"should not be called here") def load_single_model_weight(self, single_model_obj): feature_shape = len(self.header) tmp_vars = np.zeros(feature_shape) weight_dict = dict(single_model_obj.weight) for idx, header_name in enumerate(self.header): tmp_vars[idx] = weight_dict.get(header_name) if self.fit_intercept: tmp_vars = np.append(tmp_vars, single_model_obj.intercept) self.model_weights = LinearModelWeights(tmp_vars, fit_intercept=self.fit_intercept) def fit_single_model(self, data_instances, validate_data=None): LOGGER.info(f"Start to train single {self.model_name}") if len(self.component_properties.host_party_idlist) > 1: raise ValueError(f"Hetero SSHE Model does not support multi-host training.") self.callback_list.on_train_begin(data_instances, validate_data) model_shape = self.get_features_shape(data_instances) instances_count = data_instances.count() if not self.component_properties.is_warm_start: w = self._init_weights(model_shape) self.model_weights = LinearModelWeights(l=w, fit_intercept=self.model_param.init_param.fit_intercept) last_models = copy.deepcopy(self.model_weights) else: last_models = copy.deepcopy(self.model_weights) w = last_models.unboxed self.callback_warm_start_init_iter(self.n_iter_) if self.role == consts.GUEST: if with_weight(data_instances): LOGGER.info(f"data with sample weight, use sample weight.") if self.model_param.early_stop == "diff": LOGGER.warning("input data with weight, please use 'weight_diff' for 'early_stop'.") data_instances = scale_sample_weight(data_instances) self.batch_generator.initialize_batch_generator(data_instances, batch_size=self.batch_size) with SPDZ( "hetero_sshe", local_party=self.local_party, all_parties=self.parties, q_field=self.q_field, use_mix_rand=self.model_param.use_mix_rand, ) as spdz: 
spdz.set_flowid(self.flowid) self.secure_matrix_obj.set_flowid(self.flowid) # not sharing the model when reveal_every_iter if not self.reveal_every_iter: w_self, w_remote = self.share_model(w, suffix="init") last_w_self, last_w_remote = w_self, w_remote LOGGER.debug(f"first_w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}") batch_data_generator = self.batch_generator.generate_batch_data() encoded_batch_data = [] batch_labels_list = [] batch_weight_list = [] for batch_data in batch_data_generator: if self.fit_intercept: batch_features = batch_data.mapValues(lambda x: np.hstack((x.features, 1.0))) else: batch_features = batch_data.mapValues(lambda x: x.features) if self.role == consts.GUEST: batch_labels = batch_data.mapValues(lambda x: np.array([x.label], dtype=self.label_type)) batch_labels_list.append(batch_labels) if self.weight: batch_weight = batch_data.mapValues(lambda x: np.array([x.weight], dtype=float)) batch_weight_list.append(batch_weight) else: batch_weight_list.append(None) self.batch_num.append(batch_data.count()) encoded_batch_data.append( fixedpoint_table.FixedPointTensor(self.fixedpoint_encoder.encode(batch_features), q_field=self.fixedpoint_encoder.n, endec=self.fixedpoint_encoder)) while self.n_iter_ < self.max_iter: self.callback_list.on_epoch_begin(self.n_iter_) LOGGER.info(f"start to n_iter: {self.n_iter_}") loss_list = [] self.optimizer.set_iters(self.n_iter_) if not self.reveal_every_iter: self.self_optimizer.set_iters(self.n_iter_) self.remote_optimizer.set_iters(self.n_iter_) for batch_idx, batch_data in enumerate(encoded_batch_data): current_suffix = (str(self.n_iter_), str(batch_idx)) if self.role == consts.GUEST: batch_labels = batch_labels_list[batch_idx] batch_weight = batch_weight_list[batch_idx] else: batch_labels = None batch_weight = None if self.reveal_every_iter: y = self.forward(weights=self.model_weights, features=batch_data, labels=batch_labels, suffix=current_suffix, cipher=self.cipher, batch_weight=batch_weight) else: y = self.forward(weights=(w_self, w_remote), features=batch_data, labels=batch_labels, suffix=current_suffix, cipher=self.cipher, batch_weight=batch_weight) if self.role == consts.GUEST: if self.weight: error = y - batch_labels.join(batch_weight, lambda y, b: y * b) else: error = y - batch_labels self_g, remote_g = self.backward(error=error, features=batch_data, suffix=current_suffix, cipher=self.cipher) else: self_g, remote_g = self.backward(error=y, features=batch_data, suffix=current_suffix, cipher=self.cipher) # loss computing; suffix = ("loss",) + current_suffix if self.reveal_every_iter: batch_loss = self.compute_loss(weights=self.model_weights, labels=batch_labels, suffix=suffix, cipher=self.cipher) else: batch_loss = self.compute_loss(weights=(w_self, w_remote), labels=batch_labels, suffix=suffix, cipher=self.cipher) if batch_loss is not None: batch_loss = batch_loss * self.batch_num[batch_idx] loss_list.append(batch_loss) if self.reveal_every_iter: # LOGGER.debug(f"before reveal: self_g shape: {self_g.shape}, remote_g_shape: {remote_g}," # f"self_g: {self_g}") new_g = self.reveal_models(self_g, remote_g, suffix=current_suffix) # LOGGER.debug(f"after reveal: new_g shape: {new_g.shape}, new_g: {new_g}" # f"self.model_param.reveal_strategy: {self.model_param.reveal_strategy}") if new_g is not None: self.model_weights = self.optimizer.update_model(self.model_weights, new_g, has_applied=False) else: self.model_weights = LinearModelWeights( l=np.zeros(self_g.shape), 
                                fit_intercept=self.model_param.init_param.fit_intercept)
                    else:
                        if self.optimizer.penalty == consts.L2_PENALTY:
                            self_g = self_g + self.self_optimizer.alpha * w_self
                            remote_g = remote_g + self.remote_optimizer.alpha * w_remote
                        self_g = self.self_optimizer.apply_gradients(self_g)
                        remote_g = self.remote_optimizer.apply_gradients(remote_g)
                        w_self -= self_g
                        w_remote -= remote_g
                        LOGGER.debug(f"w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}")

                if self.role == consts.GUEST:
                    loss = np.sum(loss_list) / instances_count
                    self.loss_history.append(loss)
                    if self.need_call_back_loss:
                        self.callback_loss(self.n_iter_, loss)
                else:
                    loss = None

                if self.converge_func_name in ["diff", "abs"]:
                    self.is_converged = self.check_converge_by_loss(loss, suffix=(str(self.n_iter_),))
                elif self.converge_func_name == "weight_diff":
                    if self.reveal_every_iter:
                        self.is_converged = self.check_converge_by_weights(
                            last_w=last_models.unboxed,
                            new_w=self.model_weights.unboxed,
                            suffix=(str(self.n_iter_),))
                        last_models = copy.deepcopy(self.model_weights)
                    else:
                        self.is_converged = self.check_converge_by_weights(
                            last_w=(last_w_self, last_w_remote),
                            new_w=(w_self, w_remote),
                            suffix=(str(self.n_iter_),))
                        last_w_self, last_w_remote = copy.deepcopy(w_self), copy.deepcopy(w_remote)
                else:
                    raise ValueError(f"Cannot recognize early_stop function: {self.converge_func_name}")

                LOGGER.info("iter: {}, is_converged: {}".format(self.n_iter_, self.is_converged))
                self.callback_list.on_epoch_end(self.n_iter_)
                self.n_iter_ += 1

                if self.stop_training:
                    break
                if self.is_converged:
                    break

            # Finally, reconstruct the plaintext model from the two shares.
            if not self.reveal_every_iter:
                new_w = self.reveal_models(w_self, w_remote, suffix=("final",))
                if new_w is not None:
                    self.model_weights = LinearModelWeights(
                        l=new_w,
                        fit_intercept=self.model_param.init_param.fit_intercept)

        LOGGER.debug(f"loss_history: {self.loss_history}")
        self.set_summary(self.get_model_summary())

    def get_model_summary(self):
        summary = super().get_model_summary()
        if not self.is_respectively_reveal:
            del summary["intercept"]
            del summary["coef"]
        return summary
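# --- Illustrative sketch (not part of the model code above) ---
# The share_model / reveal_models pair relies on additive secret sharing over
# a finite field: the integer-encoded weight vector is split into two
# random-looking shares whose sum mod q equals the original, and revealing is
# simply adding the shares back together. The helper names share_vector and
# reveal_vector below are hypothetical stand-ins for the SPDZ machinery,
# shown only to make the idea concrete.
import numpy as np


def share_vector(w, q_field, rng=None):
    """Split an integer-encoded vector w into two additive shares mod q_field."""
    rng = rng or np.random.default_rng()
    share_a = rng.integers(0, q_field, size=w.shape)
    share_b = (w - share_a) % q_field
    return share_a, share_b


def reveal_vector(share_a, share_b, q_field):
    """Reconstruct the vector by summing the shares mod q_field."""
    return (share_a + share_b) % q_field


q = 2 ** 61 - 1                       # an example prime field
w = np.array([3, 14, 15, 92])         # fixed-point-encoded weights
w_self, w_remote = share_vector(w, q)
assert np.array_equal(reveal_vector(w_self, w_remote, q), w % q)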
class TestHeteroLogisticGradient(unittest.TestCase):
    # setUp fixtures for this class are defined above.
    def test_compute_fore_gradient(self):
        model_weights = LinearModelWeights(l=self.w, fit_intercept=False)

        # Minimal stand-in for the encrypt-mode calculator used in training.
        class EncryptedCalculator(object):
            encrypter = self.paillier_encrypt

            def encrypt_row(self, row):
                return np.array([self.encrypter.encrypt(row)])

            def encrypt(self, input_data):
                return input_data.mapValues(self.encrypt_row)

        encrypted_calculator = [EncryptedCalculator()]
        batch_index = 0
        fore_gradient = self.hetero_lr_gradient.compute_and_aggregate_forwards(
            self.data_inst, model_weights, encrypted_calculator, batch_index)
        fore_gradient_local = [
            self.paillier_encrypt.decrypt(kv[1]) for kv in fore_gradient.collect()
        ]
        self.assertListEqual(fore_gradient_local, self.fore_gradient_local)

    def test_compute_gradient(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst,
                                                            fore_gradient,
                                                            fit_intercept=False)
        de_gradient = [self.paillier_encrypt.decrypt(g) for g in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst,
                                                            fore_gradient,
                                                            fit_intercept=True)
        de_gradient = [self.paillier_encrypt.decrypt(g) for g in gradient]
        self.assertListEqual(de_gradient, self.gradient_fit_intercept)

    def test_compute_gradient_and_loss(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)
        gradient, loss = self.hetero_lr_gradient.compute_gradient_and_loss(
            self.data_inst, fore_gradient, self.wx, self.en_sum_wx_square, False)

        de_gradient = [self.paillier_encrypt.decrypt(g) for g in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        diff_loss = np.abs(self.loss - self.paillier_encrypt.decrypt(loss))
        self.assertLess(diff_loss, 1e-5)
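# --- Illustrative sketch (not part of the test suite) ---
# The expected values in these tests can be reproduced in plaintext. Hetero-LR
# replaces the sigmoid with a first-order Taylor approximation, so for labels
# in {-1, +1} the per-sample forward residual works out to 0.25 * wx - 0.5 * y,
# and with all-ones features every gradient coordinate is the mean residual.
# This is a sanity check of those constants, not FATE code.
import numpy as np

wx = np.arange(10)                                        # plaintext of self.wx
labels = np.array([(-1) ** (i % 2) for i in range(10)])   # +1, -1, +1, ...
fore_gradient = 0.25 * wx - 0.5 * labels
# matches self.fore_gradient_local
assert np.allclose(fore_gradient,
                   [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75])
# features are all ones, so each gradient coordinate is the mean residual
assert np.isclose(fore_gradient.mean(), 1.125)            # matches self.gradient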
    def fit(self, data_instances):
        """
        Apply the binning method to data instances in the local party as well as the remote one.
        Afterwards, calculate the specified metric value for the specified columns.
        Currently, iv is supported for binary-labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)
        LOGGER.debug("After fit, binning_obj split_points: {}".format(self.binning_obj.split_points))

        is_binary_data = data_overview.is_binary_labels(data_instances)
        if not is_binary_data:
            raise ValueError("Iv calculation supports binary data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local-only binning fit")
            self.binning_obj.cal_local_iv(data_instances, label_table=label_table)
            self.transform(data_instances)
            return self.data_output

        # Encrypt labels with Paillier so hosts can aggregate per-bin label
        # sums homomorphically without learning any individual label.
        cipher = PaillierEncrypt()
        cipher.generate_key()
        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances, label_table=label_table)

        encrypted_bin_sums = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
        LOGGER.info("Get encrypted_bin_sum from host")

        for host_idx, encrypted_bin_sum in enumerate(encrypted_bin_sums):
            host_party_id = self.component_properties.host_party_idlist[host_idx]
            host_binning_obj = HostBaseBinning()
            host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
            result_counts = self.__decrypt_bin_sum(encrypted_bin_sum, cipher)
            host_binning_obj.cal_iv_woe(result_counts, self.model_param.adjustment_factor)
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output
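# --- Illustrative sketch (not part of the component) ---
# End to end, the exchange above computes the following: the guest
# Paillier-encrypts each binary label, the host sums the ciphertexts per bin
# (homomorphic addition yields the encrypted event count, and bin size minus
# event count yields the non-event count), then the guest decrypts and derives
# WOE and IV. The import path, the toy data, and the WOE/IV formulas (which
# omit the adjustment_factor smoothing used above) are assumptions for
# illustration, not the exact FATE implementation.
import math
from federatedml.secureprotol import PaillierEncrypt  # import path assumed

cipher = PaillierEncrypt()
cipher.generate_key()

labels = [1, 0, 1, 1, 0, 0, 1, 0]    # guest-side binary labels
bins = [0, 0, 1, 1, 0, 1, 1, 0]      # host-side bin index per sample
en_labels = [cipher.encrypt(y) for y in labels]

# Host side: aggregate ciphertexts per bin without seeing any label.
bin_sums, bin_counts = {}, {}
for b, en_y in zip(bins, en_labels):
    bin_sums[b] = en_y if b not in bin_sums else bin_sums[b] + en_y
    bin_counts[b] = bin_counts.get(b, 0) + 1

# Guest side: decrypt per-bin event counts and compute WOE / IV.
total_event = sum(labels)
total_non_event = len(labels) - total_event
iv = 0.0
for b, en_sum in bin_sums.items():
    event = cipher.decrypt(en_sum)
    non_event = bin_counts[b] - event
    event_rate = event / total_event
    non_event_rate = non_event / total_non_event
    woe = math.log(event_rate / non_event_rate)  # sign convention varies
    iv += (event_rate - non_event_rate) * woe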