    def __synchronize_encryption(self, mode='train'):
        """
        Communicate with hosts. Specify whether use encryption or not and transfer the public keys.
        """
        # Generate a Paillier key pair and send it to the guest and hosts that use encryption
        encrypter = PaillierEncrypt()
        encrypter.generate_key(self.key_length)

        pub_key = encrypter.get_public_key()

        # LOGGER.debug("Start to remote pub_key: {}, transfer_id: {}".format(pub_key, pubkey_id))
        self.transfer_variable.paillier_pubkey.remote(obj=pub_key,
                                                      role=consts.GUEST,
                                                      idx=0,
                                                      suffix=(mode, ))
        LOGGER.info("send pubkey to guest")
        pri_key = encrypter.get_privacy_key()
        self.transfer_variable.paillier_prikey.remote(obj=pri_key,
                                                      role=consts.GUEST,
                                                      idx=0,
                                                      suffix=(mode, ))
        # LOGGER.debug("Start to remote pri_key: {}, transfer_id: {}".format(pri_key, prikey_id))
        LOGGER.info("send prikey to guest")
        self.transfer_variable.paillier_pubkey.remote(obj=pub_key,
                                                      role=consts.HOST,
                                                      idx=-1,
                                                      suffix=(mode, ))
        LOGGER.info("send pubkey to host")
        self.transfer_variable.paillier_prikey.remote(obj=pri_key,
                                                      role=consts.HOST,
                                                      idx=-1,
                                                      suffix=(mode, ))
        LOGGER.info("send prikey to host")
Example 2
    def test_tensor_op(self):

        arr1 = np.ones((10, 1, 3))
        arr1[0] = np.array([[2, 3, 4]])
        arr2 = np.ones((10, 3, 3))
        arr3 = np.ones([1, 1, 3])

        arr4 = np.ones([50, 1])
        arr5 = np.ones([32])

        pt = PaillierTensor(arr1)
        pt2 = PaillierTensor(arr2)
        pt3 = PaillierTensor(arr3)

        pt4 = PaillierTensor(arr4)
        pt5 = PaillierTensor(arr5)

        encrypter = PaillierEncrypt()
        encrypter.generate_key(EncryptParam().key_length)
        encrypted_calculator = EncryptModeCalculator(
            encrypter,
            EncryptedModeCalculatorParam().mode,
            EncryptedModeCalculatorParam().re_encrypted_rate)
        rs1 = pt * arr2
        rs2 = pt * pt2
        rs3 = pt.matmul_3d(pt2)
        enpt = pt2.encrypt(encrypted_calculator)
        enrs = enpt.matmul_3d(arr1, multiply='right')

        rng_generator = random_number_generator.RandomNumberGenerator()

        enpt2 = pt4.encrypt(encrypted_calculator)
        random_num = rng_generator.generate_random_number(enpt2.shape)
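The tensor operations above bottom out in Paillier's additive homomorphism on each element; a minimal sanity check of that primitive, as a sketch using the same PaillierEncrypt API:

    def test_paillier_primitive(self):
        # Sketch: Paillier is additively homomorphic, so ciphertexts can be
        # added together and multiplied by plaintext scalars.
        encrypter = PaillierEncrypt()
        encrypter.generate_key(1024)
        en_a, en_b = encrypter.encrypt(3), encrypter.encrypt(4)
        self.assertAlmostEqual(encrypter.decrypt(en_a + en_b), 7)  # cipher + cipher
        self.assertAlmostEqual(encrypter.decrypt(en_a * 2), 6)     # cipher * scalar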
Example 3
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.wx = session.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = session.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = session.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
             for i in range(size)],
            partition=1)

        # test fore_gradient
        self.fore_gradient_local = [
            -0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75
        ]
        # test gradient
        self.gradient = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125
        ]
        self.gradient_fit_intercept = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125, 1.125
        ]

        self.loss = 4.505647
Example 4
    def test_cipher_add_sub_mul(self):

        encrypter = PaillierEncrypt()
        encrypter.generate_key(1024)
        en_1, en_2, en_3, en_4 = (encrypter.encrypt(1), encrypter.encrypt(2),
                                  encrypter.encrypt(3), encrypter.encrypt(4))
        en_5, en_6, en_7, en_8 = (encrypter.encrypt(5), encrypter.encrypt(6),
                                  encrypter.encrypt(7), encrypter.encrypt(8))
        a = PackingCipherTensor([en_1, en_2, en_3, en_4])
        b = PackingCipherTensor([en_5, en_6, en_7, en_8])
        c = PackingCipherTensor(encrypter.encrypt(1))
        d = PackingCipherTensor([encrypter.encrypt(5)])

        rs_1 = a + b
        rs_2 = b - a
        rs_3 = c + d
        rs_4 = 123 * c
        rs_5 = d * 456
        rs_6 = a * 114
        print(encrypter.recursive_decrypt(rs_1.ciphers))
        print(encrypter.recursive_decrypt(rs_2.ciphers))
        print(encrypter.recursive_decrypt(rs_3.ciphers))
        print(encrypter.decrypt(rs_4.ciphers))
        print(encrypter.decrypt(rs_5.ciphers))
        print(encrypter.recursive_decrypt(rs_6.ciphers))
        print('cipher test done')
        print('*' * 30)
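
        # Sketch of the expected plaintexts, assuming per-slot add/sub and
        # scalar multiplication across every packed cipher:
        self.assertListEqual(encrypter.recursive_decrypt(rs_1.ciphers), [6, 8, 10, 12])
        self.assertListEqual(encrypter.recursive_decrypt(rs_2.ciphers), [4, 4, 4, 4])
        self.assertEqual(encrypter.decrypt(rs_4.ciphers), 123)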
Example 5
    def test_data_type(self, mode="strict", re_encrypted_rate=0.2):
        from federatedml.secureprotol import PaillierEncrypt
        from federatedml.secureprotol.encrypt_mode import EncryptModeCalculator
        encrypter = PaillierEncrypt()
        encrypter.generate_key(1024)
        encrypted_calculator = EncryptModeCalculator(encrypter, mode,
                                                     re_encrypted_rate)

        data_list = dict(
            encrypted_calculator.encrypt(self.data_list).collect())
        data_tuple = dict(
            encrypted_calculator.encrypt(self.data_tuple).collect())
        data_numpy = dict(
            encrypted_calculator.encrypt(self.data_numpy).collect())

        for key, value in data_list.items():
            self.assertTrue(isinstance(value, list))
            self.assertTrue(len(value) == len(self.list_data[key]))

        for key, value in data_tuple.items():
            self.assertTrue(isinstance(value, tuple))
            self.assertTrue(len(value) == len(self.tuple_data[key]))

        for key, value in data_numpy.items():
            self.assertTrue(type(value).__name__ == "ndarray")
            self.assertTrue(value.shape[0] == self.numpy_data[key].shape[0])
Example 6
    def generate_encrypter(self, param):
        LOGGER.info("generate encrypter")
        if param.encrypt_param.method.lower() == consts.PAILLIER.lower():
            encrypter = PaillierEncrypt()
            encrypter.generate_key(param.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet!!!")

        return encrypter
Example 7
def EINI_guest_predict(data_inst,
                       trees: List[HeteroDecisionTreeGuest],
                       learning_rate,
                       init_score,
                       booster_dim,
                       encrypt_key_length,
                       transfer_var: HeteroSecureBoostTransferVariable,
                       sitename=None,
                       party_list=None,
                       predict_cache=None,
                       pred_leaf=False):

    if sitename is None:
        raise ValueError(
            'input sitename is None, not able to run EINI predict algorithm')

    if pred_leaf:
        raise ValueError(
            'EINI predict mode does not support leaf idx prediction')

    # EINI algorithms
    id_pos_map_list = get_leaf_idx_map(trees)
    map_func = functools.partial(generate_leaf_candidates_guest,
                                 sitename=sitename,
                                 trees=trees,
                                 node_pos_map_list=id_pos_map_list,
                                 init_score=init_score,
                                 learning_rate=learning_rate,
                                 booster_dim=booster_dim)
    position_vec = data_inst.mapValues(map_func)

    # encryption
    encrypter = PaillierEncrypt()
    encrypter.generate_key(encrypt_key_length)
    encrypter_vec_table = position_vec.mapValues(encrypter.recursive_encrypt)

    # federation part
    # send to first host party
    transfer_var.guest_predict_data.remote(encrypter_vec_table,
                                           idx=0,
                                           suffix='position_vec',
                                           role=consts.HOST)

    # get from last host party
    result_table = transfer_var.host_predict_data.get(idx=len(party_list) - 1,
                                                      suffix='merge_result',
                                                      role=consts.HOST)

    # decrypt result
    result = result_table.mapValues(encrypter.recursive_decrypt)
    # reformat
    result = result.mapValues(lambda x: np.array(x))
    if predict_cache:
        result = result.join(predict_cache, lambda v1, v2: v1 + v2)

    return result
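recursive_encrypt and recursive_decrypt, used above on the position vectors, walk nested containers and transform each leaf; a standalone round-trip sketch (the helper name is hypothetical):

def _cipher_roundtrip_sketch(key_length=1024):
    # Hypothetical helper: recursive_encrypt / recursive_decrypt walk nested
    # lists and encrypt/decrypt each leaf, which is what makes the position
    # vectors above transportable.
    encrypter = PaillierEncrypt()
    encrypter.generate_key(key_length)
    nested = [[1, 2], [3, [4, 5]]]
    return encrypter.recursive_decrypt(encrypter.recursive_encrypt(nested))
    # -> [[1, 2], [3, [4, 5]]]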
Example 8
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)],
                                         partition=48,
                                         include_key=False)
        # self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)])

        self.en_sum_wx_square = session.parallelize([self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)],
                                                    partition=48,
                                                    include_key=False)
        self.wx = np.array([i for i in range(size)])
        self.w = self.wx / np.array([1 for _ in range(size)])
        self.data_inst = session.parallelize(
            [Instance(features=np.array([1 for _ in range(size)]), label=pow(-1, i % 2)) for i in range(size)],
            partition=48, include_key=False)

        # test fore_gradient
        self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
        # test gradient
        self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125]
        self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125]

        self.loss = 4.505647

    def test_compute_partition_gradient(self):
        fore_gradient = self.en_wx.join(self.data_inst, lambda wx, d: 0.25 * wx - 0.5 * d.label)
        sparse_data = self._make_sparse_data()
        gradient_computer = hetero_linear_model_gradient.HeteroGradientBase()
        for fit_intercept in [True, False]:
            dense_result = gradient_computer.compute_gradient(self.data_inst, fore_gradient, fit_intercept)
            dense_result = [self.paillier_encrypt.decrypt(iterator) for iterator in dense_result]
            if fit_intercept:
                self.assertListEqual(dense_result, self.gradient_fit_intercept)
            else:
                self.assertListEqual(dense_result, self.gradient)
            sparse_result = gradient_computer.compute_gradient(sparse_data, fore_gradient, fit_intercept)
            sparse_result = [self.paillier_encrypt.decrypt(iterator) for iterator in sparse_result]
            self.assertListEqual(dense_result, sparse_result)

    def _make_sparse_data(self):
        def trans_sparse(instance):
            dense_features = instance.features
            indices = [i for i in range(len(dense_features))]
            sparse_features = SparseVector(indices=indices, data=dense_features, shape=len(dense_features))
            return Instance(inst_id=None,
                            features=sparse_features,
                            label=instance.label)

        return self.data_inst.mapValues(trans_sparse)
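
The fore_gradient fixture above can be reproduced with plain numpy from the residual used in test_compute_partition_gradient, 0.25 * wx - 0.5 * label; a sketch:

    def test_fore_gradient_fixture(self):
        # Sketch: the fore_gradient fixture is the Taylor-approximated
        # residual 0.25 * wx - 0.5 * label used above.
        labels = np.array([pow(-1, i % 2) for i in range(10)])
        expected = 0.25 * np.arange(10) - 0.5 * labels
        self.assertListEqual(expected.tolist(), self.fore_gradient_local)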
Example 9
class TestHomoLRGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.gradient_operator = LogisticGradient()
        self.taylor_operator = TaylorLogisticGradient()

        self.X = np.array([[1, 2, 3, 4, 5],
                           [3, 2, 4, 5, 1],
                           [2, 2, 3, 1, 1]]) / 10
        self.X1 = np.c_[self.X, np.ones(3)]

        self.Y = np.array([[1], [1], [-1]])

        self.values = []
        for idx, x in enumerate(self.X):
            inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
            self.values.append((idx, inst))

        self.values1 = []
        for idx, x in enumerate(self.X1):
            inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
            self.values1.append((idx, inst))

        self.coef = np.array([2, 2.3, 3, 4, 2.1]) / 10
        self.coef1 = np.append(self.coef, [1])

    def test_gradient_length(self):
        fit_intercept = False
        grad, loss = self.gradient_operator.compute(self.values, self.coef, 0,
                                                    fit_intercept)
        self.assertEqual(grad.shape[0], self.X.shape[1])

        taylor_grad, loss = self.taylor_operator.compute(
            self.values, self.coef, 0, fit_intercept)
        self.assertEqual(taylor_grad.shape[0], self.X.shape[1])
        self.assertTrue(np.all(np.abs(grad - taylor_grad) < 0.0001))

        fit_intercept = True
        grad, loss = self.gradient_operator.compute(self.values, self.coef, 0,
                                                    fit_intercept)
        self.assertEqual(grad.shape[0], self.X.shape[1] + 1)

        taylor_grad, loss = self.taylor_operator.compute(
            self.values, self.coef, 0, fit_intercept)
        self.assertEqual(taylor_grad.shape[0], self.X.shape[1] + 1)

        self.assertTrue(np.all(np.abs(grad - taylor_grad) < 0.0001))
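
The closeness asserted above follows from the first-order Taylor expansion of the sigmoid around zero; a standalone sanity check of that identity (a sketch in plain numpy):

    def test_taylor_residual_close(self):
        # Sketch: for labels y in {-1, 1} the exact logistic residual is
        # -y * sigmoid(-y * wx); its first-order Taylor expansion around
        # wx = 0 is 0.25 * wx - 0.5 * y, so both should agree for small wx.
        wx = self.X.dot(self.coef)
        y = self.Y.ravel()
        exact = -y / (1 + np.exp(y * wx))
        taylor = 0.25 * wx - 0.5 * y
        self.assertTrue(np.all(np.abs(exact - taylor) < 0.01))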
Example 10
    def test_diff_mode(self, round=10, mode="strict", re_encrypted_rate=0.2):
        from federatedml.secureprotol.encrypt_mode import EncryptModeCalculator
        from federatedml.secureprotol import PaillierEncrypt
        encrypter = PaillierEncrypt()
        encrypter.generate_key(1024)
        encrypted_calculator = EncryptModeCalculator(encrypter, mode, re_encrypted_rate)        

        for i in range(round):
            data_i = self.data_numpy.mapValues(lambda v: v + i)
            data_i = encrypted_calculator.encrypt(data_i)
            decrypt_data_i = dict(data_i.mapValues(lambda arr: np.array([encrypter.decrypt(val) for val in arr])).collect())
            for j in range(30):
                self.assertTrue((np.fabs(self.numpy_data[j] - decrypt_data_i[j] + i) < 1e-5).all())
Example 11
    def keygen(self, key_length, suffix=tuple()) -> dict:
        use_cipher = self._use_encrypt.get_parties(
            parties=self._client_parties, suffix=suffix)
        ciphers = dict()
        for party, use_encryption in zip(self._client_parties, use_cipher):
            if not use_encryption:
                ciphers[party] = None
            else:
                cipher = PaillierEncrypt()
                cipher.generate_key(key_length)
                pub_key = cipher.get_public_key()
                self._pailler_pubkey.remote_parties(obj=pub_key,
                                                    parties=[party],
                                                    suffix=suffix)
                ciphers[party] = cipher
        return ciphers
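
A hypothetical client-side counterpart to keygen (a sketch: request_key and _server_party are assumptions mirroring the remote_parties/get_parties API used above):

    def request_key(self, use_encryption, suffix=tuple()):
        # Hypothetical client-side mirror of keygen: report whether encryption
        # is wanted, then fetch the public key if so (_server_party is assumed).
        self._use_encrypt.remote_parties(obj=use_encryption,
                                         parties=[self._server_party],
                                         suffix=suffix)
        if use_encryption:
            return self._pailler_pubkey.get_parties(parties=[self._server_party],
                                                    suffix=suffix)[0]
        return None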
Example 12
    def test_encrypt_and_decrypt(self):
        from federatedml.secureprotol import PaillierEncrypt
        from federatedml.secureprotol.encrypt_mode import EncryptModeCalculator
        encrypter = PaillierEncrypt()
        encrypter.generate_key(1024)

        encrypted_calculator = EncryptModeCalculator(encrypter, "fast")

        encrypter_tensor = self.paillier_tensor1.encrypt(encrypted_calculator)
        decrypted_tensor = encrypter_tensor.decrypt(encrypter)

        self.assertTrue(isinstance(encrypter_tensor, PaillierTensor))
        self.assertTrue(isinstance(decrypted_tensor, PaillierTensor))

        arr = decrypted_tensor.numpy()
        self.assertTrue(abs(arr.sum() - 10000) < consts.FLOAT_ZERO)
Example 13
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)

        size = 10
        self.wx = eggroll.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = eggroll.parallelize([self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = eggroll.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2)) for i in range(size)], partition=1)

        # test fore_gradient
        self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
        # test gradient
        self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125]
        self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125]

        self.loss = 4.505647

    def test_compute_fore_gradient(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)
        fore_gradient_local = [self.paillier_encrypt.decrypt(iterator[1]) for iterator in fore_gradient.collect()]

        self.assertListEqual(fore_gradient_local, self.fore_gradient_local)

    def test_compute_gradient(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst, fore_gradient, fit_intercept=False)
        de_gradient = [self.paillier_encrypt.decrypt(iterator) for iterator in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst, fore_gradient, fit_intercept=True)
        de_gradient = [self.paillier_encrypt.decrypt(iterator) for iterator in gradient]
        self.assertListEqual(de_gradient, self.gradient_fit_intercept)

    def test_compute_gradient_and_loss(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(self.data_inst, self.wx)
        gradient, loss = self.hetero_lr_gradient.compute_gradient_and_loss(self.data_inst, fore_gradient, self.wx,
                                                                           self.en_sum_wx_square, False)
        de_gradient = [self.paillier_encrypt.decrypt(i) for i in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        diff_loss = np.abs(self.loss - self.paillier_encrypt.decrypt(loss))
        self.assertLess(diff_loss, 1e-5)
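
The loss fixture 4.505647 is the mean of the second-order Taylor expansion of the logistic loss; a plain-numpy derivation, as a sketch:

    def test_loss_fixture(self):
        # Sketch: loss fixture = mean(log(2) - 0.5 * y * wx + 0.125 * wx ** 2),
        # the second-order Taylor expansion of the logistic loss.
        wx = np.arange(10)
        y = np.array([pow(-1, i % 2) for i in range(10)])
        loss = np.mean(np.log(2) - 0.5 * y * wx + 0.125 * wx ** 2)
        self.assertAlmostEqual(float(loss), self.loss, places=5)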
Example 14
class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self, secureboost_tree_param):
        super(HeteroSecureBoostingTreeGuest,
              self).__init__(secureboost_tree_param)

        self.convergence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        self.flowid = 0
        self.tree_dim = 1
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None

        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, loss_type):
        LOGGER.info("set loss, loss type is {}".format(loss_type))
        if self.task_type == "classification":
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("Loss type %s not supported yet" %
                                          (loss_type))
        else:
            raise NotImplementedError("Loss type %s not supported yet" %
                                      (loss_type))

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            Quantile.convert_feature_to_bin(
                data_instance, self.quantile_method, self.bin_num,
                self.bin_gap, self.bin_sample_num)

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_flowid(self, flowid=0):
        LOGGER.info("set flowid, flowid is {}".format(flowid))
        self.flowid = flowid

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid")
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == "classification":
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(
                self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            range_from_zero = True
            for _class in self.classes_:
                try:
                    if _class >= 0 and _class < range_from_zero and isinstance(
                            _class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except Exception:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(
                    zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])

        else:
            RegressionLabelChecker.validate_y(self.y)

        self.set_loss(self.loss_type)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == "paillier":
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yes!!!")

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_f_value(self, new_f=None, tidx=-1):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if self.F is None:
            LOGGER.info("tree_dim is %d" % (self.tree_dim))
            tree_dim = self.tree_dim
            self.F = self.y.mapValues(lambda v: np.zeros(tree_dim))
        else:
            accumulate_func = functools.partial(self.accumulate_f,
                                                lr=self.learning_rate,
                                                idx=tidx)

            self.F = self.F.join(new_f, accumulate_func)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        self.grad_and_hess = self.y.join(self.F, lambda y, f_val: \
            (loss_method.compute_grad(y, loss_method.predict(f_val)), \
             loss_method.compute_hess(y, loss_method.predict(f_val))))

    def compute_loss(self):
        LOGGER.info("compute loss")
        loss_method = self.loss
        y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
        loss = loss_method.compute_loss(self.y, y_predict)
        return loss

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess:
            (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convergence is None:
            self.convergence = DiffConverge()

        return self.convergence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        federation.remote(obj=self.tree_dim,
                          name=self.transfer_inst.tree_dim.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.tree_dim),
                          role=consts.HOST,
                          idx=0)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info(
            "sync stop flag to host, boosting round is {}".format(num_round))
        federation.remote(obj=stop_flag,
                          name=self.transfer_inst.stop_flag.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.stop_flag, num_round),
                          role=consts.HOST,
                          idx=0)

    def fit(self, data_inst):
        LOGGER.info("begin to train secureboosting guest model")
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()

        self.sync_tree_dim()

        for i in range(self.num_trees):
            n_tree = []
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)

                tree_inst.set_inputinfo(self.data_bin,
                                        self.get_grad_and_hess(tidx),
                                        self.bin_split_points,
                                        self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                tree_inst.fit()
                n_tree.append(tree_inst.get_tree_model())
                self.update_f_value(tree_inst.predict_weights, tidx)

            self.trees_.append(n_tree)
            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value")
        tree_dim = self.tree_dim
        self.F = data_inst.mapValues(lambda v: np.zeros(tree_dim))
        for i in range(len(self.trees_)):
            n_tree = self.trees_[i]
            for tidx in range(len(n_tree)):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.set_tree_model(n_tree[tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(predict_data, tidx)

    def predict(self, data_inst, predict_param):
        LOGGER.info("start predict")
        self.predict_f_value(data_inst)
        loss_method = self.loss
        predicts = self.F.mapValues(lambda f: loss_method.predict(f))
        if self.task_type == "classification":
            classes_ = self.classes_
            if self.num_classes == 2:
                predict_label = predicts.mapValues(
                    lambda pred: classes_[1] if pred > predict_param.threshold else classes_[0])
            else:
                predict_label = predicts.mapValues(
                    lambda preds: classes_[np.argmax(preds)])

            if predict_param.with_proba:
                predict_result = data_inst.join(
                    predicts, lambda inst, predict_prob:
                    (inst.label, predict_prob))
            else:
                predict_result = data_inst.mapValues(lambda inst: inst.label)

            predict_result = predict_result.join(
                predict_label, lambda label_prob, predict_label:
                (label_prob[0], label_prob[1], predict_label))
        else:
            raise NotImplementedError("task type %s not supported yet" %
                                      (self.task_type))

        LOGGER.info("end predict")

        return predict_result

    def save_model(self, model_table, model_namespace):
        LOGGER.info("save model")
        modelmeta = BoostingTreeModelMeta()
        modelmeta.trees_ = self.trees_
        modelmeta.loss_type = self.loss_type
        modelmeta.tree_dim = self.tree_dim
        modelmeta.task_type = self.task_type
        modelmeta.num_classes = self.num_classes
        modelmeta.classes_ = self.classes_
        modelmeta.loss = self.history_loss

        model = eggroll.parallelize([modelmeta], include_key=False)
        model.save_as(model_table, model_namespace)

    def load_model(self, model_table, model_namespace):
        LOGGER.info("load model")
        modelmeta = list(
            eggroll.table(model_table, model_namespace).collect())[0][1]
        self.task_type = modelmeta.task_type
        self.loss_type = modelmeta.loss_type
        self.tree_dim = modelmeta.tree_dim
        self.num_classes = modelmeta.num_classes
        self.trees_ = modelmeta.trees_
        self.classes_ = modelmeta.classes_
        self.history_loss = modelmeta.loss

        self.set_loss(self.loss_type)

    def evaluate(self, labels, pred_prob, pred_labels, evaluate_param):
        LOGGER.info("evaluate data")
        predict_res = None
        if evaluate_param.classi_type == consts.BINARY:
            predict_res = pred_prob
        elif evaluate_param.classi_type == consts.MULTY:
            predict_res = pred_labels
        else:
            LOGGER.warning(
                "unknown classification type, return None as evaluation results"
            )

        eva = Evaluation(evaluate_param.classi_type)
        return eva.report(labels, predict_res, evaluate_param.metrics,
                          evaluate_param.thresholds, evaluate_param.pos_label)
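
A quick worked example of the accumulate_f update used by update_f_value above (a sketch):

    # accumulate_f adds one tree's learning-rate-scaled prediction into the
    # boosting dimension it was trained for:
    #     f = np.zeros(2)
    #     HeteroSecureBoostingTreeGuest.accumulate_f(f, 2.0, lr=0.1, idx=1)
    #     f.tolist()  # -> [0.0, 0.2]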
Example 15
class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self):
        super(HeteroSecureBoostingTreeGuest, self).__init__()

        self.convergence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.init_score = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        # self.flowid = 0
        self.tree_dim = 1
        self.tree_meta = None
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None
        self.encrypted_calculator = None
        self.runtime_idx = 0
        self.feature_importances_ = {}
        self.role = consts.GUEST

        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" %
                                          (loss_type))
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" %
                                          (loss_type))
        else:
            raise NotImplementedError("objective %s not supported yet" %
                                      (loss_type))

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(
            data_instance)
        LOGGER.info("convert feature to bins over")

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_runtime_idx(self, runtime_idx):
        self.runtime_idx = runtime_idx

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid, flowid {}".format(self.flowid))
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == consts.CLASSIFICATION:
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(
                self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            range_from_zero = True
            for _class in self.classes_:
                try:
                    if _class >= 0 and _class < range_from_zero and isinstance(
                            _class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except Exception:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(
                    zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])

        else:
            RegressionLabelChecker.validate_y(self.y)

        self.set_loss(self.objective_param)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == consts.PAILLIER:
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")

        self.encrypted_calculator = EncryptModeCalculator(
            self.encrypter, self.calculated_mode, self.re_encrypted_rate)

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_feature_importance(self, tree_feature_importance):
        for fid in tree_feature_importance:
            if fid not in self.feature_importances_:
                self.feature_importances_[fid] = 0

            self.feature_importances_[fid] += tree_feature_importance[fid]

    def update_f_value(self, new_f=None, tidx=-1):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if self.F is None:
            if self.tree_dim > 1:
                self.F, self.init_score = self.loss.initialize(
                    self.y, self.tree_dim)
            else:
                self.F, self.init_score = self.loss.initialize(self.y)
        else:
            accumulate_func = functools.partial(self.accumulate_f,
                                                lr=self.learning_rate,
                                                idx=tidx)

            self.F = self.F.join(new_f, accumulate_func)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        if self.task_type == consts.CLASSIFICATION:
            self.grad_and_hess = self.y.join(self.F, lambda y, f_val: \
                (loss_method.compute_grad(y, loss_method.predict(f_val)), \
                 loss_method.compute_hess(y, loss_method.predict(f_val))))
        else:
            self.grad_and_hess = self.y.join(
                self.F, lambda y, f_val: (loss_method.compute_grad(y, f_val),
                                          loss_method.compute_hess(y, f_val)))

    def compute_loss(self):
        LOGGER.info("compute loss")
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
            loss = loss_method.compute_loss(self.y, y_predict)
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in [
                    "lse", "lae", "logcosh", "tweedie", "log_cosh", "huber"
            ]:
                loss_method = self.loss
                loss = loss_method.compute_loss(self.y, self.F)
            else:
                loss_method = self.loss
                y_predict = self.F.mapValues(
                    lambda val: loss_method.predict(val))
                loss = loss_method.compute_loss(self.y, y_predict)

        return float(loss)

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess:
            (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convergence is None:
            self.convergence = DiffConverge(eps=self.tol)

        return self.convergence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        federation.remote(obj=self.tree_dim,
                          name=self.transfer_inst.tree_dim.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.tree_dim),
                          role=consts.HOST,
                          idx=-1)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info(
            "sync stop flag to host, boosting round is {}".format(num_round))
        federation.remote(obj=stop_flag,
                          name=self.transfer_inst.stop_flag.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.stop_flag, num_round),
                          role=consts.HOST,
                          idx=-1)

    def fit(self, data_inst):
        LOGGER.info("begin to train secureboosting guest model")
        self.gen_feature_fid_mapping(data_inst.schema)
        data_inst = self.data_alignment(data_inst)
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()

        self.sync_tree_dim()

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"unit_name": "iters"}))

        for i in range(self.num_trees):
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)

                tree_inst.set_inputinfo(self.data_bin,
                                        self.get_grad_and_hess(tidx),
                                        self.bin_split_points,
                                        self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_encrypted_mode_calculator(
                    self.encrypted_calculator)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                tree_inst.fit()

                tree_meta, tree_param = tree_inst.get_model()
                self.trees_.append(tree_param)
                if self.tree_meta is None:
                    self.tree_meta = tree_meta
                self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx)
                self.update_feature_importance(
                    tree_inst.get_feature_importance())

            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))

            self.callback_metric("loss", "train", [Metric(i, loss)])

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.debug("history loss is {}".format(min(self.history_loss)))
        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value, there are {} trees".format(
            len(self.trees_)))
        tree_dim = self.tree_dim
        init_score = self.init_score
        self.F = data_inst.mapValues(lambda v: init_score)
        rounds = len(self.trees_) // self.tree_dim
        for i in range(rounds):
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.load_model(self.tree_meta,
                                     self.trees_[i * self.tree_dim + tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(new_f=predict_data, tidx=tidx)

    def predict(self, data_inst):
        LOGGER.info("start predict")
        data_inst = self.data_alignment(data_inst)
        self.predict_f_value(data_inst)
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            if self.num_classes == 2:
                predicts = self.F.mapValues(
                    lambda f: float(loss_method.predict(f)))
            else:
                predicts = self.F.mapValues(
                    lambda f: loss_method.predict(f).tolist())

        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in [
                    "lse", "lae", "huber", "log_cosh", "fair", "tweedie"
            ]:
                predicts = self.F
            else:
                raise NotImplementedError(
                    "objective {} not supported yet".format(
                        self.objective_param.objective))

        if self.task_type == consts.CLASSIFICATION:
            classes_ = self.classes_
            if self.num_classes == 2:
                threshold = self.predict_param.threshold
                predict_result = data_inst.join(
                    predicts, lambda inst, pred: [
                        inst.label, classes_[1]
                        if pred > threshold else classes_[0], pred, {
                            "0": 1 - pred,
                            "1": pred
                        }
                    ])
            else:
                predict_result = data_inst.join(
                    predicts, lambda inst, preds: [
                        inst.label, classes_[np.argmax(preds)],
                        np.max(preds),
                        dict(zip(map(str, classes_), preds))
                    ])

        elif self.task_type == consts.REGRESSION:
            predict_result = data_inst.join(
                predicts, lambda inst, pred:
                [inst.label,
                 float(pred),
                 float(pred), {
                     "label": float(pred)
                 }])

        else:
            raise NotImplementedError("task type {} not supported yet".format(
                self.task_type))

        LOGGER.info("end predict")

        return predict_result

    def get_feature_importance(self):
        return self.feature_importances_

    def get_model_meta(self):
        model_meta = BoostingTreeModelMeta()
        model_meta.tree_meta.CopyFrom(self.tree_meta)
        model_meta.learning_rate = self.learning_rate
        model_meta.num_trees = self.num_trees
        model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num))
        model_meta.objective_meta.CopyFrom(
            ObjectiveMeta(objective=self.objective_param.objective,
                          param=self.objective_param.params))
        model_meta.task_type = self.task_type
        model_meta.tree_dim = self.tree_dim
        model_meta.n_iter_no_change = self.n_iter_no_change
        model_meta.tol = self.tol
        model_meta.num_classes = self.num_classes
        model_meta.classes_.extend(map(str, self.classes_))
        model_meta.need_run = self.need_run
        meta_name = "HeteroSecureBoostingTreeGuestMeta"

        return meta_name, model_meta

    def set_model_meta(self, model_meta):
        self.tree_meta = model_meta.tree_meta
        self.learning_rate = model_meta.learning_rate
        self.num_trees = model_meta.num_trees
        self.bin_num = model_meta.quantile_meta.bin_num
        self.objective_param.objective = model_meta.objective_meta.objective
        self.objective_param.params = list(model_meta.objective_meta.param)
        self.task_type = model_meta.task_type
        self.tree_dim = model_meta.tree_dim
        self.num_classes = model_meta.num_classes
        self.n_iter_no_change = model_meta.n_iter_no_change
        self.tol = model_meta.tol
        self.classes_ = list(model_meta.classes_)

        self.set_loss(self.objective_param)

    def get_model_param(self):
        model_param = BoostingTreeModelParam()
        model_param.tree_num = len(list(self.trees_))
        model_param.trees_.extend(self.trees_)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.history_loss)

        feature_importances = list(self.get_feature_importance().items())
        feature_importances = sorted(feature_importances,
                                     key=itemgetter(1),
                                     reverse=True)
        feature_importance_param = []
        for (sitename, fid), _importance in feature_importances:
            feature_importance_param.append(
                FeatureImportanceInfo(sitename=sitename,
                                      fid=fid,
                                      importance=_importance))
        model_param.feature_importances.extend(feature_importance_param)
        model_param.feature_name_fid_mapping.update(
            self.feature_name_fid_mapping)

        param_name = "HeteroSecureBoostingTreeGuestParam"

        return param_name, model_param

    def set_model_param(self, model_param):
        self.trees_ = list(model_param.trees_)
        self.init_score = np.array(list(model_param.init_score))
        self.history_loss = list(model_param.losses)

    def export_model(self):
        meta_name, meta_protobuf = self.get_model_meta()
        param_name, param_protobuf = self.get_model_param()
        self.model_output = {
            meta_name: meta_protobuf,
            param_name: param_protobuf
        }

        return self.model_output

    def _load_model(self, model_dict):
        model_param = None
        model_meta = None
        for _, value in model_dict["model"].items():
            for model in value:
                if model.endswith("Meta"):
                    model_meta = value[model]
                if model.endswith("Param"):
                    model_param = value[model]
        LOGGER.info("load model")

        self.set_model_meta(model_meta)
        self.set_model_param(model_param)
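
For reference, export_model and _load_model above compose into a save/restore round trip; a minimal sketch (the "hetero_secure_boost" dict key is an arbitrary placeholder):

booster = HeteroSecureBoostingTreeGuest()
# ... booster.fit(data_inst) ...
exported = booster.export_model()

restored = HeteroSecureBoostingTreeGuest()
restored._load_model({"model": {"hetero_secure_boost": exported}})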
Example 16
    def fit(self, data_instances):
        """
        Apply binning method for both data instances in local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns. Currently, iv is support for binary labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)

        # self._parse_cols(data_instances)

        self._setup_bin_inner_param(data_instances, self.model_param)

        if self.model_param.method == consts.OPTIMAL:
            has_missing_value = self.iv_calculator.check_containing_missing_value(data_instances)
            for idx in self.bin_inner_param.bin_indexes:
                if idx in has_missing_value:
                    raise ValueError(f"Optimal Binning do not support missing value now.")
        split_points = self.binning_obj.fit_split_points(data_instances)

        if self.model_param.skip_static:
            self.transform(data_instances)
            return self.data_output

        label_counts_dict = data_overview.get_label_count(data_instances)

        if len(label_counts_dict) > 2:
            if self.model_param.method == consts.OPTIMAL:
                raise ValueError("Have not supported optimal binning in multi-class data yet")

        self.labels = list(label_counts_dict.keys())
        label_counts = [label_counts_dict[k] for k in self.labels]
        label_table = IvCalculator.convert_label(data_instances, self.labels)
        self.bin_result = self.iv_calculator.cal_local_iv(data_instances=data_instances,
                                                          split_points=split_points,
                                                          labels=self.labels,
                                                          label_counts=label_counts,
                                                          bin_cols_map=self.bin_inner_param.get_need_cal_iv_cols_map(),
                                                          label_table=label_table)

        if self.model_param.local_only:

            self.transform(data_instances)
            self.set_summary(self.bin_result.summary())
            return self.data_output

        if self.model_param.encrypt_param.method == consts.PAILLIER:
            paillier_encryptor = PaillierEncrypt()
            paillier_encryptor.generate_key(self.model_param.encrypt_param.key_length)
            cipher = EncryptModeCalculator(encrypter=paillier_encryptor)
        else:
            raise NotImplementedError("encrypt method not supported yet")
        self._packer = GuestIntegerPacker(pack_num=len(self.labels), pack_num_range=label_counts,
                                          encrypt_mode_calculator=cipher)

        self.federated_iv(data_instances=data_instances, label_table=label_table,
                          cipher=cipher, result_counts=label_counts_dict, label_elements=self.labels)

        total_summary = self.bin_result.summary()
        for host_res in self.host_results:
            total_summary = self._merge_summary(total_summary, host_res.summary())

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        self.set_summary(total_summary)
        return self.data_output
Example 17
    def fit(self, data_instances):
        """
        Apply binning method for both data instances in local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns. Currently, iv is support for binary labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)

        # self._parse_cols(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)

        label_counts = data_overview.count_labels(data_instances)
        if label_counts > 2:
            raise ValueError(
                "IV calculation supports binary data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances,
                                          label_table=label_table)
            self.transform(data_instances)
            return self.data_output

        cipher = PaillierEncrypt()
        cipher.generate_key()

        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances, label_table=label_table)

        encrypted_bin_infos = self.transfer_variable.encrypted_bin_sum.get(
            idx=-1)
        # LOGGER.debug("encrypted_bin_sums: {}".format(encrypted_bin_sums))

        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
            host_party_id = self.component_properties.host_party_idlist[
                host_idx]
            encrypted_bin_sum = encrypted_bin_info['encrypted_bin_sum']
            host_bin_methods = encrypted_bin_info['bin_method']
            category_names = encrypted_bin_info['category_names']
            result_counts = self.__decrypt_bin_sum(encrypted_bin_sum, cipher)
            LOGGER.debug(
                "Received host {} result, length of buckets: {}".format(
                    host_idx, len(result_counts)))
            LOGGER.debug("category_name: {}, host_bin_methods: {}".format(
                category_names, host_bin_methods))
            # if self.model_param.method == consts.OPTIMAL:
            if host_bin_methods == consts.OPTIMAL:
                optimal_binning_params = encrypted_bin_info['optimal_params']

                host_model_params = copy.deepcopy(self.model_param)
                host_model_params.bin_num = optimal_binning_params.get(
                    'bin_num')
                host_model_params.optimal_binning_param.metric_method = optimal_binning_params.get(
                    'metric_method')
                host_model_params.optimal_binning_param.mixture = optimal_binning_params.get(
                    'mixture')
                host_model_params.optimal_binning_param.max_bin_pct = optimal_binning_params.get(
                    'max_bin_pct')
                host_model_params.optimal_binning_param.min_bin_pct = optimal_binning_params.get(
                    'min_bin_pct')

                self.binning_obj.event_total, self.binning_obj.non_event_total = self.get_histogram(
                    data_instances)
                optimal_binning_cols = {
                    x: y
                    for x, y in result_counts.items()
                    if x not in category_names
                }
                host_binning_obj = self.optimal_binning_sync(
                    optimal_binning_cols, data_instances.count(),
                    data_instances._partitions, host_idx, host_model_params)
                category_bins = {
                    x: y
                    for x, y in result_counts.items() if x in category_names
                }
                host_binning_obj.cal_iv_woe(category_bins,
                                            self.model_param.adjustment_factor)
            else:
                host_binning_obj = BaseBinning()
                host_binning_obj.cal_iv_woe(result_counts,
                                            self.model_param.adjustment_factor)
            host_binning_obj.set_role_party(role=consts.HOST,
                                            party_id=host_party_id)
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output
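# --- Added illustration (not part of the original example) ---
# Why the host can compute per-bin label sums without decrypting anything:
# Paillier is additively homomorphic, so ciphertexts of y and 1 - y can be
# summed per bin to yield encrypted (event, non_event) counts that only the
# guest, who holds the private key, can open. A minimal sketch, reusing the
# PaillierEncrypt API shown above; labels and bin membership are toy data.
def _paillier_bin_sum_sketch():
    cipher = PaillierEncrypt()
    cipher.generate_key(1024)

    labels = [1, 0, 1, 1, 0]                       # guest-side labels
    pairs = [(cipher.encrypt(y), cipher.encrypt(1 - y)) for y in labels]

    # Host side: only ciphertexts are visible; add up one bin's members.
    bin_member_ids = [0, 2, 4]
    enc_event, enc_non_event = pairs[bin_member_ids[0]]
    for i in bin_member_ids[1:]:
        enc_event = enc_event + pairs[i][0]        # homomorphic addition
        enc_non_event = enc_non_event + pairs[i][1]

    # Guest side: decrypt the aggregated counts.
    assert cipher.decrypt(enc_event) == 2
    assert cipher.decrypt(enc_non_event) == 1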
Example n. 18
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    def __init__(self):
        super(HeteroFeatureBinningGuest, self).__init__()

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.local_transform_result = None
        self.party_name = consts.GUEST
        # self._init_binning_obj()

    def fit(self, data_instances):
        """
        Apply the binning method to data instances of both the local party and the remote one. Afterwards,
        calculate the specific metric value for specific columns. Currently, IV is supported for binary-labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        self.binning_obj.fit_split_points(data_instances)
        LOGGER.debug("After fit, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        is_binary_data = data_overview.is_binary_labels(data_instances)

        if not is_binary_data:
            LOGGER.warning("IV is not supported for multi-label data.")
            # data_instances = self.fit_local(data_instances)
            return data_instances

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)

        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)

        # encrypted_label_table_id = self.transfer_variable.generate_transferid(self.transfer_variable.encrypted_label)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=0)
        # federation.remote(encrypted_label_table, name=self.transfer_variable.encrypted_label.name,
        #                  tag=encrypted_label_table_id, role=consts.HOST, idx=0)

        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculate self's binning. Since the other party needs time to
        #  compute on its own data, do the local binning calculation now.
        data_instances = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value

        encrypted_bin_sum = self.transfer_variable.encrypted_bin_sum.get(idx=0)

        LOGGER.info("Get encrypted_bin_sum from host")

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(
            result_counts, self.model_param.adjustment_factor)

        # Only one host is supported in this version; multiple hosts will be supported in the future.
        self.host_results[consts.HOST] = host_iv_attrs
        self.set_schema(data_instances)

        LOGGER.debug("Before transform, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output

    @staticmethod
    def encrypt(x, encryptor):
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)
        split_points = {}
        for col_name, iv_attr in self.binning_result.items():
            split_points[col_name] = iv_attr.split_points

        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances, split_points=split_points, label_table=label_table)

        for col_name, col_index in self.local_transform_result.items():
            LOGGER.info("The local feature {} 's iv is {}".format(
                col_name, self.local_transform_result[col_name].iv))
        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        pub_key = self.encryptor.get_public_key()
        # pubkey_id = self.transfer_variable.generate_transferid(self.transfer_variable.paillier_pubkey)

        self.transfer_variable.paillier_pubkey.remote(pub_key,
                                                      role=consts.HOST,
                                                      idx=0)
        """
        federation.remote(pub_key, name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id, role=consts.HOST, idx=0)
        """

        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        # for feature_sum in encrypted_bin_sum:
        for col_name, count_list in encrypted_bin_sum.items():
            new_list = []
            for encrypted_event, encrypted_non_event in count_list:
                event_count = self.encryptor.decrypt(encrypted_event)
                non_event_count = self.encryptor.decrypt(encrypted_non_event)
                new_list.append((event_count, non_event_count))
            encrypted_bin_sum[col_name] = new_list
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 label_table=label_table)
        self.binning_result = iv_attrs
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        # Assume a binary classification problem where the event label is 1
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
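# --- Added illustration (not part of the original example) ---
# What cal_iv_woe computes from the decrypted (event, non_event) counts: a
# per-bin WOE and the feature-level IV. A hypothetical pure-Python helper,
# not the FATE implementation; adjustment_factor stands in for empty bins
# so the logarithm stays finite.
import math

def woe_iv_sketch(bin_counts, adjustment_factor=0.5):
    event_total = sum(e for e, _ in bin_counts) or 1
    non_event_total = sum(n for _, n in bin_counts) or 1
    iv = 0.0
    woe_list = []
    for event, non_event in bin_counts:
        event_rate = (event or adjustment_factor) / event_total
        non_event_rate = (non_event or adjustment_factor) / non_event_total
        woe = math.log(event_rate / non_event_rate)
        woe_list.append(woe)
        iv += (event_rate - non_event_rate) * woe
    return woe_list, iv

# Usage: woe_iv_sketch([(30, 70), (60, 40)]) -> bins that separate the
# classes well contribute more to the returned IV.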
    def setUp(self):
        paillier_encrypt = PaillierEncrypt()
        paillier_encrypt.generate_key()
        self.publickey = paillier_encrypt.get_public_key()
        self.privatekey = paillier_encrypt.get_privacy_key()
Example n. 20
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    def __init__(self, params: FeatureBinningParam):
        super(HeteroFeatureBinningGuest, self).__init__(params)

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.iv_attrs = None
        self.host_iv_attrs = None

    def fit(self, data_instances):
        """
        Apply the binning method to data instances of both the local party and the remote one. Afterwards,
        calculate the specific metric value for specific columns.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)

        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)

        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculate self's binning. Since the other party needs time to
        #  compute on its own data, do the local binning calculation now.
        local_iv = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)

        encrypted_bin_sum = federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

        LOGGER.info("Get encrypted_bin_sum from host")

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(
            result_counts, self.bin_param.adjustment_factor)
        self.host_iv_attrs = host_iv_attrs
        # LOGGER.debug("Lenght of host iv attrs: {}".format(len(self.host_iv_attrs)))
        # for idx, col in enumerate(self.cols):
        #     LOGGER.info("The local iv of {}th feature is {}".format(col, local_iv[idx].iv))

        for idx, iv_attr in enumerate(host_iv_attrs):
            LOGGER.info("The remote iv of {}th measured feature is {}".format(
                idx, iv_attr.iv))

        iv_result = {'local': local_iv, 'remote': host_iv_attrs}

        return iv_result

    def transform(self, data_instances):
        self._abnormal_detection(data_instances)

        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]

        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)
        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally
        self.transform_local(data_instances, reformated=True)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        encrypted_bin_sum = federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(
            result_counts, self.bin_param.adjustment_factor)
        self.host_iv_attrs = host_iv_attrs
        for idx, iv_attr in enumerate(host_iv_attrs):
            LOGGER.info("The remote iv of {}th measured feature is {}".format(
                idx, iv_attr.iv))

        data_instances.schema['header'] = self.header
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, reformated=False):
        self._abnormal_detection(data_instances)

        self._parse_cols(data_instances)

        if not reformated:  # Reformat the label type
            data_instances = data_instances.mapValues(self.load_data)

        split_points = []
        for iv_attr in self.iv_attrs:
            s_p = list(iv_attr.split_points)
            split_points.append(s_p)

        self.iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                      self.cols, split_points)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(
                col, self.iv_attrs[idx].iv))

    def __synchronize_encryption(self):
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)
        # LOGGER.debug("pubkey_id is : {}".format(pubkey_id))

        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)

        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        for feature_sum in encrypted_bin_sum:
            for idx, (encrypted_event,
                      encrypted_non_event) in enumerate(feature_sum):
                event_count = self.encryptor.decrypt(encrypted_event)
                non_event_count = self.encryptor.decrypt(encrypted_non_event)
                feature_sum[idx] = (event_count, non_event_count)
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)

        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 self.cols,
                                                 label_table=label_table)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(
                col, iv_attrs[idx].iv))
        self.iv_attrs = iv_attrs
        return iv_attrs

    @staticmethod
    def load_data(data_instance):
        # Assume a binary classification problem where the event label is 1
        # LOGGER.debug('label type is {}'.format(type(data_instance.label)))
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
Example n. 21
class HeteroLRBase(BaseLinearModel, ABC):
    def __init__(self):
        super().__init__()
        self.model_name = 'HeteroSSHELogisticRegression'
        self.model_param_name = 'HeteroSSHELogisticRegressionParam'
        self.model_meta_name = 'HeteroSSHELogisticRegressionMeta'
        self.mode = consts.HETERO
        self.cipher = None
        self.q_field = None
        self.model_param = LogisticRegressionParam()
        self.labels = None
        self.batch_num = []
        self.one_vs_rest_obj = None
        self.secure_matrix_obj: SecureMatrix
        self._set_parties()
        self.cipher_tool = None

    def _transfer_q_field(self):
        if self.role == consts.GUEST:
            q_field = self.cipher.public_key.n
            self.transfer_variable.q_field.remote(q_field,
                                                  role=consts.HOST,
                                                  suffix=("q_field", ))

        else:
            q_field = self.transfer_variable.q_field.get(role=consts.GUEST,
                                                         idx=0,
                                                         suffix=("q_field", ))

        return q_field
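    # Note (added): q_field is the Paillier public-key modulus n, generated
    # by the guest and broadcast to the host, so that both parties perform
    # fixed-point secret sharing over the same ring Z_n.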

    def _init_model(self, params: LogisticRegressionParam):
        super()._init_model(params)
        self.encrypted_mode_calculator_param = params.encrypted_mode_calculator_param
        if self.role == consts.HOST:
            self.init_param_obj.fit_intercept = False
        self.cipher = PaillierEncrypt()
        self.cipher.generate_key(self.model_param.encrypt_param.key_length)
        self.transfer_variable = SSHEModelTransferVariable()
        self.one_vs_rest_obj = one_vs_rest_factory(self,
                                                   role=self.role,
                                                   mode=self.mode,
                                                   has_arbiter=False)

        self.converge_func_name = params.early_stop
        self.reveal_every_iter = params.reveal_every_iter

        self.q_field = self._transfer_q_field()

        LOGGER.debug(f"q_field: {self.q_field}")

        if not self.reveal_every_iter:
            self.self_optimizer = copy.deepcopy(self.optimizer)
            self.remote_optimizer = copy.deepcopy(self.optimizer)

        self.batch_generator = batch_generator.Guest(
        ) if self.role == consts.GUEST else batch_generator.Host()
        self.batch_generator.register_batch_generator(
            BatchGeneratorTransferVariable(), has_arbiter=False)
        self.fixedpoint_encoder = FixedPointEndec(n=self.q_field)
        self.converge_transfer_variable = ConvergeCheckerTransferVariable()
        self.secure_matrix_obj = SecureMatrix(party=self.local_party,
                                              q_field=self.q_field,
                                              other_party=self.other_party)

    def _init_weights(self, model_shape):
        return self.initializer.init_model(model_shape,
                                           init_params=self.init_param_obj)

    def _set_parties(self):
        parties = []
        guest_parties = get_parties().roles_to_parties(["guest"])
        host_parties = get_parties().roles_to_parties(["host"])
        parties.extend(guest_parties)
        parties.extend(host_parties)

        local_party = get_parties().local_party
        other_party = parties[0] if parties[0] != local_party else parties[1]

        self.parties = parties
        self.local_party = local_party
        self.other_party = other_party

    @property
    def is_respectively_reveal(self):
        return self.model_param.reveal_strategy == "respectively"

    def share_model(self, w, suffix):
        source = [w, self.other_party]
        if self.local_party.role == consts.GUEST:
            wb, wa = (
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wb_{suffix}",
                    source[0],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wa_{suffix}",
                    source[1],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
            )
            return wb, wa
        else:
            wa, wb = (
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wa_{suffix}",
                    source[0],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
                fixedpoint_numpy.FixedPointTensor.from_source(
                    f"wb_{suffix}",
                    source[1],
                    encoder=self.fixedpoint_encoder,
                    q_field=self.q_field),
            )
            return wa, wb
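    # Note (added): share_model additively secret-shares both weight blocks.
    # FixedPointTensor.from_source splits the owner's array into two
    # random-looking shares, keeping one and sending the other, so afterwards
    # each party holds one share of wb (guest-side weights) and one share of
    # wa (host-side weights) without seeing the other block in the clear.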

    def forward(self, weights, features, suffix, cipher):
        raise NotImplementedError("forward must be implemented by a subclass")

    def backward(self, error, features, suffix, cipher):
        raise NotImplementedError("backward must be implemented by a subclass")

    def compute_loss(self, weights, suffix, cipher):
        raise NotImplementedError("compute_loss must be implemented by a subclass")

    def fit(self, data_instances, validate_data=None):
        self.header = data_instances.schema.get("header", [])
        self._abnormal_detection(data_instances)
        self.check_abnormal_values(data_instances)
        self.check_abnormal_values(validate_data)
        classes = self.one_vs_rest_obj.get_data_classes(data_instances)

        if len(classes) > 2:
            self.need_one_vs_rest = True
            self.need_call_back_loss = False
            self.one_vs_rest_fit(train_data=data_instances,
                                 validate_data=validate_data)
        else:
            self.need_one_vs_rest = False
            self.fit_binary(data_instances, validate_data)

    def one_vs_rest_fit(self, train_data=None, validate_data=None):
        LOGGER.info("Class num larger than 2, do one_vs_rest")
        self.one_vs_rest_obj.fit(data_instances=train_data,
                                 validate_data=validate_data)

    def fit_binary(self, data_instances, validate_data=None):
        LOGGER.info("Starting to hetero_sshe_logistic_regression")
        self.callback_list.on_train_begin(data_instances, validate_data)

        model_shape = self.get_features_shape(data_instances)
        instances_count = data_instances.count()

        if not self.component_properties.is_warm_start:
            w = self._init_weights(model_shape)
            self.model_weights = LinearModelWeights(
                l=w, fit_intercept=self.model_param.init_param.fit_intercept)
            last_models = copy.deepcopy(self.model_weights)
        else:
            last_models = copy.deepcopy(self.model_weights)
            w = last_models.unboxed
            self.callback_warm_start_init_iter(self.n_iter_)

        self.batch_generator.initialize_batch_generator(
            data_instances, batch_size=self.batch_size)

        with SPDZ(
                "sshe_lr",
                local_party=self.local_party,
                all_parties=self.parties,
                q_field=self.q_field,
                use_mix_rand=self.model_param.use_mix_rand,
        ) as spdz:
            spdz.set_flowid(self.flowid)
            self.secure_matrix_obj.set_flowid(self.flowid)
            if self.role == consts.GUEST:
                self.labels = data_instances.mapValues(
                    lambda x: np.array([x.label], dtype=int))

            w_self, w_remote = self.share_model(w, suffix="init")
            last_w_self, last_w_remote = w_self, w_remote
            LOGGER.debug(
                f"first_w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}"
            )

            batch_data_generator = self.batch_generator.generate_batch_data()

            self.cipher_tool = []
            encoded_batch_data = []
            for batch_data in batch_data_generator:
                if self.fit_intercept:
                    batch_features = batch_data.mapValues(lambda x: np.hstack(
                        (x.features, 1.0)))
                else:
                    batch_features = batch_data.mapValues(lambda x: x.features)
                self.batch_num.append(batch_data.count())

                encoded_batch_data.append(
                    fixedpoint_table.FixedPointTensor(
                        self.fixedpoint_encoder.encode(batch_features),
                        q_field=self.fixedpoint_encoder.n,
                        endec=self.fixedpoint_encoder))

                self.cipher_tool.append(
                    EncryptModeCalculator(
                        self.cipher, self.encrypted_mode_calculator_param.mode,
                        self.encrypted_mode_calculator_param.re_encrypted_rate)
                )

            while self.n_iter_ < self.max_iter:
                self.callback_list.on_epoch_begin(self.n_iter_)
                LOGGER.info(f"start to n_iter: {self.n_iter_}")

                loss_list = []

                self.optimizer.set_iters(self.n_iter_)
                if not self.reveal_every_iter:
                    self.self_optimizer.set_iters(self.n_iter_)
                    self.remote_optimizer.set_iters(self.n_iter_)

                for batch_idx, batch_data in enumerate(encoded_batch_data):
                    current_suffix = (str(self.n_iter_), str(batch_idx))

                    if self.reveal_every_iter:
                        y = self.forward(weights=self.model_weights,
                                         features=batch_data,
                                         suffix=current_suffix,
                                         cipher=self.cipher_tool[batch_idx])
                    else:
                        y = self.forward(weights=(w_self, w_remote),
                                         features=batch_data,
                                         suffix=current_suffix,
                                         cipher=self.cipher_tool[batch_idx])

                    if self.role == consts.GUEST:
                        error = y - self.labels

                        self_g, remote_g = self.backward(
                            error=error,
                            features=batch_data,
                            suffix=current_suffix,
                            cipher=self.cipher_tool[batch_idx])
                    else:
                        self_g, remote_g = self.backward(
                            error=y,
                            features=batch_data,
                            suffix=current_suffix,
                            cipher=self.cipher_tool[batch_idx])

                    # loss computing;
                    suffix = ("loss", ) + current_suffix
                    if self.reveal_every_iter:
                        batch_loss = self.compute_loss(
                            weights=self.model_weights,
                            suffix=suffix,
                            cipher=self.cipher_tool[batch_idx])
                    else:
                        batch_loss = self.compute_loss(
                            weights=(w_self, w_remote),
                            suffix=suffix,
                            cipher=self.cipher_tool[batch_idx])

                    if batch_loss is not None:
                        batch_loss = batch_loss * self.batch_num[batch_idx]
                    loss_list.append(batch_loss)

                    if self.reveal_every_iter:
                        # LOGGER.debug(f"before reveal: self_g shape: {self_g.shape}, remote_g_shape: {remote_g},"
                        #              f"self_g: {self_g}")

                        new_g = self.reveal_models(self_g,
                                                   remote_g,
                                                   suffix=current_suffix)

                        # LOGGER.debug(f"after reveal: new_g shape: {new_g.shape}, new_g: {new_g}"
                        #              f"self.model_param.reveal_strategy: {self.model_param.reveal_strategy}")

                        if new_g is not None:
                            self.model_weights = self.optimizer.update_model(
                                self.model_weights, new_g, has_applied=False)

                        else:
                            self.model_weights = LinearModelWeights(
                                l=np.zeros(self_g.shape),
                                fit_intercept=self.model_param.init_param.fit_intercept)
                    else:
                        if self.optimizer.penalty == consts.L2_PENALTY:
                            self_g = self_g + self.self_optimizer.alpha * w_self
                            remote_g = remote_g + self.remote_optimizer.alpha * w_remote

                        # LOGGER.debug(f"before optimizer: {self_g}, {remote_g}")

                        self_g = self.self_optimizer.apply_gradients(self_g)
                        remote_g = self.remote_optimizer.apply_gradients(
                            remote_g)

                        # LOGGER.debug(f"after optimizer: {self_g}, {remote_g}")
                        w_self -= self_g
                        w_remote -= remote_g

                    LOGGER.debug(
                        f"w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}"
                    )

                if self.role == consts.GUEST:
                    loss = np.sum(loss_list) / instances_count
                    self.loss_history.append(loss)
                    if self.need_call_back_loss:
                        self.callback_loss(self.n_iter_, loss)
                else:
                    loss = None

                if self.converge_func_name in ["diff", "abs"]:
                    self.is_converged = self.check_converge_by_loss(
                        loss, suffix=(str(self.n_iter_), ))
                elif self.converge_func_name == "weight_diff":
                    if self.reveal_every_iter:
                        self.is_converged = self.check_converge_by_weights(
                            last_w=last_models.unboxed,
                            new_w=self.model_weights.unboxed,
                            suffix=(str(self.n_iter_), ))
                        last_models = copy.deepcopy(self.model_weights)
                    else:
                        self.is_converged = self.check_converge_by_weights(
                            last_w=(last_w_self, last_w_remote),
                            new_w=(w_self, w_remote),
                            suffix=(str(self.n_iter_), ))
                        last_w_self, last_w_remote = copy.deepcopy(
                            w_self), copy.deepcopy(w_remote)
                else:
                    raise ValueError(
                        f"Cannot recognize early_stop function: {self.converge_func_name}"
                    )

                LOGGER.info("iter: {},  is_converged: {}".format(
                    self.n_iter_, self.is_converged))
                self.callback_list.on_epoch_end(self.n_iter_)
                self.n_iter_ += 1

                if self.stop_training:
                    break

                if self.is_converged:
                    break

            # Finally reconstruct
            if not self.reveal_every_iter:
                new_w = self.reveal_models(w_self,
                                           w_remote,
                                           suffix=("final", ))
                if new_w is not None:
                    self.model_weights = LinearModelWeights(
                        l=new_w,
                        fit_intercept=self.model_param.init_param.fit_intercept
                    )

        LOGGER.debug(f"loss_history: {self.loss_history}")
        self.set_summary(self.get_model_summary())
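    # Note (added): each iteration above follows the SSHE pattern -- forward()
    # evaluates the model on secret-shared weights, the guest forms
    # error = y_hat - y, backward() yields gradient shares for both parties,
    # and the update is applied either in the clear (reveal_every_iter) or
    # directly on the shares; convergence is then checked via the loss or a
    # securely computed weight difference.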

    def reveal_models(self, w_self, w_remote, suffix=None):
        if suffix is None:
            suffix = self.n_iter_

        if self.model_param.reveal_strategy == "respectively":

            if self.role == consts.GUEST:
                new_w = w_self.get(tensor_name=f"wb_{suffix}", broadcast=False)
                w_remote.broadcast_reconstruct_share(
                    tensor_name=f"wa_{suffix}")

            else:
                w_remote.broadcast_reconstruct_share(
                    tensor_name=f"wb_{suffix}")
                new_w = w_self.get(tensor_name=f"wa_{suffix}", broadcast=False)

        elif self.model_param.reveal_strategy == "encrypted_reveal_in_host":

            if self.role == consts.GUEST:
                new_w = w_self.get(tensor_name=f"wb_{suffix}", broadcast=False)
                encrypted_w_remote = self.cipher.recursive_encrypt(
                    self.fixedpoint_encoder.decode(w_remote.value))
                encrypted_w_remote_tensor = fixedpoint_numpy.PaillierFixedPointTensor(
                    value=encrypted_w_remote)
                encrypted_w_remote_tensor.broadcast_reconstruct_share(
                    tensor_name=f"wa_{suffix}")
            else:
                w_remote.broadcast_reconstruct_share(
                    tensor_name=f"wb_{suffix}")

                new_w = w_self.reconstruct(tensor_name=f"wa_{suffix}",
                                           broadcast=False)

        else:
            raise NotImplementedError(
                f"reveal strategy: {self.model_param.reveal_strategy} has not been implemented."
            )
        return new_w
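    # Note (added): under "respectively" each party reconstructs only its own
    # weight block in plaintext; under "encrypted_reveal_in_host" the host's
    # block is reconstructed under the guest's Paillier key, so the host ends
    # up with ciphertext weights it cannot read by itself.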

    def check_converge_by_loss(self, loss, suffix):
        if self.role == consts.GUEST:
            self.is_converged = self.converge_func.is_converge(loss)
            self.transfer_variable.is_converged.remote(self.is_converged,
                                                       suffix=suffix)
        else:
            self.is_converged = self.transfer_variable.is_converged.get(
                idx=0, suffix=suffix)
        return self.is_converged

    def check_converge_by_weights(self, last_w, new_w, suffix):
        if self.reveal_every_iter:
            return self._reveal_every_iter_weights_check(last_w, new_w, suffix)
        else:
            return self._not_reveal_every_iter_weights_check(
                last_w, new_w, suffix)

    def _reveal_every_iter_weights_check(self, last_w, new_w, suffix):
        raise NotImplementedError()

    def _not_reveal_every_iter_weights_check(self, last_w, new_w, suffix):
        last_w_self, last_w_remote = last_w
        w_self, w_remote = new_w
        grad_self = w_self - last_w_self
        grad_remote = w_remote - last_w_remote

        if self.role == consts.GUEST:
            grad_encode = np.hstack((grad_remote.value, grad_self.value))
        else:
            grad_encode = np.hstack((grad_self.value, grad_remote.value))

        grad_encode = np.array([grad_encode])

        grad_tensor_name = ".".join(("check_converge_grad", ) + suffix)
        grad_tensor = fixedpoint_numpy.FixedPointTensor(
            value=grad_encode,
            q_field=self.fixedpoint_encoder.n,
            endec=self.fixedpoint_encoder,
            tensor_name=grad_tensor_name)

        grad_tensor_transpose_name = ".".join(
            ("check_converge_grad_transpose", ) + suffix)
        grad_tensor_transpose = fixedpoint_numpy.FixedPointTensor(
            value=grad_encode.T,
            q_field=self.fixedpoint_encoder.n,
            endec=self.fixedpoint_encoder,
            tensor_name=grad_tensor_transpose_name)

        grad_norm_tensor_name = ".".join(("check_converge_grad_norm", ) +
                                         suffix)

        grad_norm = grad_tensor.dot(grad_tensor_transpose,
                                    target_name=grad_norm_tensor_name).get()

        weight_diff = np.sqrt(grad_norm[0][0])
        LOGGER.info("iter: {}, weight_diff:{}, is_converged: {}".format(
            self.n_iter_, weight_diff, self.is_converged))
        is_converge = False
        if weight_diff < self.model_param.tol:
            is_converge = True
        return is_converge
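    # Note (added): this check computes ||w_new - w_last|| without opening the
    # weights -- each party concatenates its shares of both blocks' deltas,
    # the squared norm is evaluated share-wise as grad . grad^T, and only the
    # resulting 1x1 scalar is revealed before taking the square root.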

    def _get_meta(self):
        meta_protobuf_obj = lr_model_meta_pb2.LRModelMeta(
            penalty=self.model_param.penalty,
            tol=self.model_param.tol,
            alpha=self.alpha,
            optimizer=self.model_param.optimizer,
            batch_size=self.batch_size,
            learning_rate=self.model_param.learning_rate,
            max_iter=self.max_iter,
            early_stop=self.model_param.early_stop,
            fit_intercept=self.fit_intercept,
            need_one_vs_rest=self.need_one_vs_rest,
            reveal_strategy=self.model_param.reveal_strategy)
        return meta_protobuf_obj

    def get_single_model_param(self, model_weights=None, header=None):
        header = header if header else self.header
        result = {
            'iters': self.n_iter_,
            'loss_history': self.loss_history,
            'is_converged': self.is_converged,
            # 'weight': weight_dict,
            'intercept': self.model_weights.intercept_,
            'header': header,
            'best_iteration': -1 if self.validation_strategy is None
            else self.validation_strategy.best_iteration
        }

        if self.role == consts.GUEST or self.is_respectively_reveal:
            model_weights = model_weights if model_weights else self.model_weights
            weight_dict = {}
            for idx, header_name in enumerate(header):
                coef_i = model_weights.coef_[idx]
                weight_dict[header_name] = coef_i

            result['weight'] = weight_dict

        return result

    def get_model_summary(self):
        header = self.header
        if header is None:
            return {}
        weight_dict, intercept_ = self.get_weight_intercept_dict(header)
        best_iteration = -1 if self.validation_strategy is None else self.validation_strategy.best_iteration

        summary = {
            "coef": weight_dict,
            "intercept": intercept_,
            "is_converged": self.is_converged,
            "one_vs_rest": self.need_one_vs_rest,
            "best_iteration": best_iteration
        }

        if not self.is_respectively_reveal:
            del summary["intercept"]
            del summary["coef"]

        if self.validation_strategy:
            validation_summary = self.validation_strategy.summary()
            if validation_summary:
                summary["validation_metrics"] = validation_summary
        return summary

    def load_model(self, model_dict):
        LOGGER.debug("Start Loading model")
        result_obj = list(model_dict.get('model').values())[0].get(
            self.model_param_name)
        meta_obj = list(model_dict.get('model').values())[0].get(
            self.model_meta_name)

        if self.init_param_obj is None:
            self.init_param_obj = InitParam()
        self.init_param_obj.fit_intercept = meta_obj.fit_intercept
        self.model_param.reveal_strategy = meta_obj.reveal_strategy
        LOGGER.debug(
            f"reveal_strategy: {self.model_param.reveal_strategy}, {self.is_respectively_reveal}"
        )
        self.header = list(result_obj.header)

        need_one_vs_rest = result_obj.need_one_vs_rest
        LOGGER.info(
            "in _load_model need_one_vs_rest: {}".format(need_one_vs_rest))
        if need_one_vs_rest:
            one_vs_rest_result = result_obj.one_vs_rest_result
            self.one_vs_rest_obj = one_vs_rest_factory(classifier=self,
                                                       role=self.role,
                                                       mode=self.mode,
                                                       has_arbiter=False)
            self.one_vs_rest_obj.load_model(one_vs_rest_result)
            self.need_one_vs_rest = True
        else:
            self.load_single_model(result_obj)
            self.need_one_vs_rest = False

    def load_single_model(self, single_model_obj):
        LOGGER.info("It's a binary task, start to load single model")

        if self.role == consts.GUEST or self.is_respectively_reveal:
            feature_shape = len(self.header)
            tmp_vars = np.zeros(feature_shape)
            weight_dict = dict(single_model_obj.weight)

            for idx, header_name in enumerate(self.header):
                tmp_vars[idx] = weight_dict.get(header_name)

            if self.fit_intercept:
                tmp_vars = np.append(tmp_vars, single_model_obj.intercept)
            self.model_weights = LinearModelWeights(
                tmp_vars, fit_intercept=self.fit_intercept)

        self.n_iter_ = single_model_obj.iters
        return self
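# --- Added illustration (not part of the original example) ---
# A minimal sketch of the encoding FixedPointEndec is assumed to perform:
# reals are scaled by 2**f and reduced modulo q_field so secret shares live
# in an integer ring. Q_FIELD and PRECISION are illustrative stand-ins (the
# real code uses cipher.public_key.n as the modulus).
Q_FIELD = 2 ** 61 - 1
PRECISION = 16

def fixedpoint_encode(x, q=Q_FIELD, f=PRECISION):
    return int(round(x * (1 << f))) % q

def fixedpoint_decode(z, q=Q_FIELD, f=PRECISION):
    if z > q // 2:                       # fold back into the signed range
        z -= q
    return z / (1 << f)

assert abs(fixedpoint_decode(fixedpoint_encode(0.3125)) - 0.3125) < 2 ** -PRECISION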
Example n. 22
class HeteroBoosting(Boosting, ABC):
    def __init__(self):
        super(HeteroBoosting, self).__init__()
        self.encrypter = None
        self.encrypted_calculator = None
        self.early_stopping_rounds = None
        self.binning_class = QuantileBinning
        self.model_param = HeteroBoostingParam()
        self.transfer_variable = HeteroBoostingTransferVariable()
        self.mode = consts.HETERO

    def _init_model(self, param: HeteroBoostingParam):
        LOGGER.debug('in hetero boosting, objective param is {}'.format(
            param.objective_param.objective))
        super(HeteroBoosting, self)._init_model(param)
        self.encrypt_param = param.encrypt_param
        self.re_encrypt_rate = param.encrypted_mode_calculator_param
        self.calculated_mode = param.encrypted_mode_calculator_param.mode
        self.re_encrypted_rate = param.encrypted_mode_calculator_param.re_encrypted_rate
        self.early_stopping_rounds = param.early_stopping_rounds
        self.use_first_metric_only = param.use_first_metric_only

    def generate_encrypter(self):

        LOGGER.info("generate encrypter")
        if self.encrypt_param.method.lower() == consts.PAILLIER.lower():
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("unknown encrypt type {}".format(
                self.encrypt_param.method.lower()))
        self.encrypted_calculator = EncryptModeCalculator(
            self.encrypter, self.calculated_mode, self.re_encrypted_rate)

    def check_label(self):

        LOGGER.info("check label")
        classes_ = []
        num_classes, booster_dim = 1, 1
        if self.task_type == consts.CLASSIFICATION:
            num_classes, classes_ = ClassifyLabelChecker.validate_label(
                self.data_bin)
            if num_classes > 2:
                booster_dim = num_classes

            range_from_zero = True
            for _class in classes_:
                try:
                    if 0 <= _class < len(classes_) and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except TypeError:
                    range_from_zero = False

            classes_ = sorted(classes_)
            if not range_from_zero:
                class_mapping = dict(zip(classes_, range(num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])

        else:
            RegressionLabelChecker.validate_label(self.data_bin)

        return classes_, num_classes, booster_dim
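# --- Added illustration (not part of the original example) ---
# What check_label does when class labels are not already 0..k-1: sort the
# observed classes and remap them to a zero-based integer range.
classes_ = sorted({"cat", "dog", "fish"})
class_mapping = dict(zip(classes_, range(len(classes_))))
assert [class_mapping[c] for c in ["dog", "cat", "fish"]] == [1, 0, 2]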
Example n. 23
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    def __init__(self, params: FeatureBinningParam):
        super(HeteroFeatureBinningGuest, self).__init__(params)

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.local_transform_result = None
        self.party_name = consts.GUEST
        self._init_binning_obj()

    def fit(self, data_instances):
        """
        Apply the binning method to data instances of both the local party and the remote one. Afterwards,
        calculate the specific metric value for specific columns.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)

        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)

        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)

        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculate self's binning. Since the other party needs time to
        #  compute on its own data, do the local binning calculation now.
        data_instances = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)

        encrypted_bin_sum = federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

        LOGGER.info("Get encrypted_bin_sum from host")

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(
            result_counts, self.bin_param.adjustment_factor)

        # Only one host is supported in this version; multiple hosts will be supported in the future.
        self.host_results[consts.HOST] = host_iv_attrs

        for cols_name, iv_attr in host_iv_attrs.items():
            display_result = iv_attr.display_result(
                self.bin_param.display_result)
            LOGGER.info(
                "[Result][FeatureBinning][Host] feature {} 's result is : {}".
                format(cols_name, display_result))

        self.set_schema(data_instances)
        return data_instances

    def transform(self, data_instances):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)
        self.set_schema(data_instances)

        # 3. Transfer encrypted label
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)
        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally
        self.transform_local(data_instances,
                             label_table=label_table,
                             save_result=False)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        encrypted_bin_sum = federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        host_iv_attrs = self.binning_obj.cal_iv_woe(
            result_counts, self.bin_param.adjustment_factor)
        # host_results = {'host1': host_iv_attrs}

        # self.save_model(name=self.bin_param.transform_table,
        #                 namespace=self.bin_param.result_namespace,
        #                 binning_result=self.local_transform_result,
        #                 host_results=host_results)

        for col_name, iv_attr in host_iv_attrs.items():
            LOGGER.info("The remote feature {} 's iv is {}".format(
                col_name, iv_attr.iv))

        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self,
                        data_instances,
                        label_table=None,
                        save_result=True):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)
        split_points = {}
        for col_name, iv_attr in self.binning_result.items():
            split_points[col_name] = iv_attr.split_points

        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances, split_points=split_points, label_table=label_table)

        if save_result:
            self.save_model(name=self.bin_param.transform_table,
                            namespace=self.bin_param.result_namespace,
                            binning_result=self.local_transform_result,
                            host_results={})
        for col_name, col_index in self.local_transform_result.items():
            LOGGER.info("The local feature {} 's iv is {}".format(
                col_name, self.local_transform_result[col_name].iv))
        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)

        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)

        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        # for feature_sum in encrypted_bin_sum:
        for col_name, count_list in encrypted_bin_sum.items():
            new_list = []
            for encrypted_event, encrypted_non_event in count_list:
                event_count = self.encryptor.decrypt(encrypted_event)
                non_event_count = self.encryptor.decrypt(encrypted_non_event)
                new_list.append((event_count, non_event_count))
            encrypted_bin_sum[col_name] = new_list
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 label_table=label_table)
        for col_name, iv_attr in iv_attrs.items():
            display_result = iv_attr.display_result(
                self.bin_param.display_result)
            LOGGER.info(
                "[Result][FeatureBinning][Guest] feature {} 's result is : {}".
                format(col_name, display_result))
            # LOGGER.info("[Result][FeatureBinning]The feature {} 's iv is {}".format(col_name, iv_attrs[col_name].iv))
        self.binning_result = iv_attrs
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        # Assume a binary classification problem where the event label is 1
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance

class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self, secureboost_tree_param):
        super(HeteroSecureBoostingTreeGuest,
              self).__init__(secureboost_tree_param)

        self.convergence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.init_score = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        self.flowid = 0
        self.tree_dim = 1
        self.tree_meta = None
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None

        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" %
                                          (loss_type))
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" %
                                          (loss_type))
        else:
            raise NotImplementedError("objective %s not supported yet" %
                                      (loss_type))

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            Quantile.convert_feature_to_bin(
                data_instance, self.quantile_method, self.bin_num,
                self.bin_gap, self.bin_sample_num)
        LOGGER.info("convert feature to bins over")

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_flowid(self, flowid=0):
        LOGGER.info("set flowid, flowid is {}".format(flowid))
        self.flowid = flowid

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid")
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == consts.CLASSIFICATION:
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(
                self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            range_from_zero = True
            for _class in self.classes_:
                try:
                    if 0 <= _class < len(self.classes_) and isinstance(
                            _class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except TypeError:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(
                    zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])

        else:
            RegressionLabelChecker.validate_y(self.y)

        self.set_loss(self.objective_param)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == consts.PAILLIER:
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yes!!!")

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_f_value(self, new_f=None, tidx=-1):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if self.F is None:
            if self.tree_dim > 1:
                self.F, self.init_score = self.loss.initialize(
                    self.y, self.tree_dim)
            else:
                LOGGER.info("tree_dim is %d" % (self.tree_dim))
                self.F, self.init_score = self.loss.initialize(self.y)
        else:
            accumulate_f = functools.partial(self.accumulate_f,
                                             lr=self.learning_rate,
                                             idx=tidx)

            self.F = self.F.join(new_f, accumulate_f)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        if self.task_type == consts.CLASSIFICATION:
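            # for the binary case, compute_grad / compute_hess reduce to p - y and
            # p * (1 - p) with p = sigmoid(f_val); see the standalone sketch after
            # this example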
            self.grad_and_hess = self.y.join(
                self.F, lambda y, f_val:
                (loss_method.compute_grad(y, loss_method.predict(f_val)),
                 loss_method.compute_hess(y, loss_method.predict(f_val))))
        else:
            self.grad_and_hess = self.y.join(
                self.F, lambda y, f_val: (loss_method.compute_grad(y, f_val),
                                          loss_method.compute_hess(y, f_val)))

    def compute_loss(self):
        LOGGER.info("compute loss")
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
            loss = loss_method.compute_loss(self.y, y_predict)
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in [
                    "lse", "lae", "huber", "log_cosh", "fair", "tweedie"
            ]:
                loss_method = self.loss
                loss = loss_method.compute_loss(self.y, self.F)
            else:
                loss_method = self.loss
                y_predict = self.F.mapValues(
                    lambda val: loss_method.predict(val))
                loss = loss_method.compute_loss(self.y, y_predict)

        return loss

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess:
            (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convegence is None:
            self.convegence = DiffConverge()
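            # DiffConverge checks whether |loss - last_loss| falls below its tolerance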

        return self.convegence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(
            range(0, self.feature_num),
            max(1, int(self.subsample_feature_rate * self.feature_num)),
            replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        federation.remote(obj=self.tree_dim,
                          name=self.transfer_inst.tree_dim.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.tree_dim),
                          role=consts.HOST,
                          idx=0)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info(
            "sync stop flag to host, boosting round is {}".format(num_round))
        federation.remote(obj=stop_flag,
                          name=self.transfer_inst.stop_flag.name,
                          tag=self.transfer_inst.generate_transferid(
                              self.transfer_inst.stop_flag, num_round),
                          role=consts.HOST,
                          idx=0)

    def fit(self, data_inst):
        LOGGER.info("begin to train secureboosting guest model")
        data_inst = self.data_alignment(data_inst)
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()

        self.sync_tree_dim()

        for i in range(self.num_trees):
            # n_tree = []
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)

                tree_inst.set_inputinfo(self.data_bin,
                                        self.get_grad_and_hess(tidx),
                                        self.bin_split_points,
                                        self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                tree_inst.fit()

                tree_meta, tree_param = tree_inst.get_model()
                self.trees_.append(tree_param)
                if self.tree_meta is None:
                    self.tree_meta = tree_meta
                # n_tree.append(tree_inst.get_tree_model())
                self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx)

            # self.trees_.append(n_tree)
            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))

            if self.n_iter_no_change:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value, there are {} trees".format(
            len(self.trees_)))
        tree_dim = self.tree_dim
        init_score = self.init_score
        self.F = data_inst.mapValues(lambda v: init_score)
        rounds = len(self.trees_) // self.tree_dim
        for i in range(rounds):
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.load_model(self.tree_meta,
                                     self.trees_[i * self.tree_dim + tidx])
                # tree_inst.set_tree_model(self.trees_[i * self.tree_dim + tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(new_f=predict_data, tidx=tidx)

    def predict(self, data_inst, predict_param):
        LOGGER.info("start predict")
        data_inst = self.data_alignment(data_inst)
        self.predict_f_value(data_inst)
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            predicts = self.F.mapValues(lambda f: loss_method.predict(f))
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in [
                    "lse", "lae", "huber", "log_cosh", "fair", "tweedie"
            ]:
                predicts = self.F
            else:
                raise NotImplementedError(
                    "objective {} not supprted yet".format(
                        self.objective_param.objective))

        if self.task_type == consts.CLASSIFICATION:
            classes_ = self.classes_
            if self.num_classes == 2:
                predict_label = predicts.mapValues(
                    lambda pred: classes_[1]
                    if pred > predict_param.threshold else classes_[0])
            else:
                predict_label = predicts.mapValues(
                    lambda preds: classes_[np.argmax(preds)])

            if predict_param.with_proba:
                predict_result = data_inst.join(
                    predicts, lambda inst, predict_prob:
                    (inst.label, predict_prob))
            else:
                predict_result = data_inst.mapValues(lambda inst:
                                                     (inst.label, None))

            predict_result = predict_result.join(
                predict_label, lambda label_prob, predict_label:
                (label_prob[0], label_prob[1], predict_label))
        elif self.task_type == consts.REGRESSION:
            predict_result = data_inst.join(
                predicts, lambda inst, pred: (inst.label, pred, None))

        else:
            raise NotImplementedError("task type {} not supported yet".format(
                self.task_type))

        LOGGER.info("end predict")

        return predict_result

    def get_model_meta(self):
        model_meta = BoostingTreeModelMeta()
        model_meta.tree_meta.CopyFrom(self.tree_meta)
        model_meta.learning_rate = self.learning_rate
        model_meta.num_trees = self.num_trees
        model_meta.quantile_meta.CopyFrom(
            QuantileMeta(quantile_method=self.quantile_method,
                         bin_num=self.bin_num,
                         bin_gap=self.bin_gap,
                         bin_sample_num=self.bin_sample_num))
        #modelmeta.objective.CopyFrom(ObjectiveParamMeta(objective=self.objective_param.objective, param=self.objective_param.params))
        model_meta.objective_meta.CopyFrom(
            ObjectiveMeta(objective=self.objective_param.objective,
                          param=self.objective_param.params))
        model_meta.task_type = self.task_type
        model_meta.tree_dim = self.tree_dim
        model_meta.n_iter_no_change = self.n_iter_no_change
        model_meta.tol = self.tol
        model_meta.num_classes = self.num_classes
        model_meta.classes_.extend(map(str, self.classes_))

        meta_name = "HeteroSecureBoostingTreeGuest.meta"

        return meta_name, model_meta

    def set_model_meta(self, model_meta):
        self.tree_meta = model_meta.tree_meta
        self.learning_rate = model_meta.learning_rate
        self.num_trees = model_meta.num_trees
        self.quantile_method = model_meta.quantile_meta.quantile_method
        self.bin_num = model_meta.quantile_meta.bin_num
        self.bin_gap = model_meta.quantile_meta.bin_gap
        self.bin_sample_num = model_meta.quantile_meta.bin_sample_num
        self.objective_param.objective = model_meta.objective_meta.objective
        self.objective_param.params = list(model_meta.objective_meta.param)
        self.task_type = model_meta.task_type
        self.tree_dim = model_meta.tree_dim
        self.num_classes = model_meta.num_classes
        self.n_iter_no_change = model_meta.n_iter_no_change
        self.tol = model_meta.tol
        self.classes_ = list(model_meta.classes_)

        self.set_loss(self.objective_param)

    def get_model_param(self):
        model_param = BoostingTreeModelParam()
        model_param.tree_num = len(list(self.trees_))
        model_param.trees_.extend(self.trees_)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.history_loss)

        param_name = "HeteroSecureBoostingTreeGuest.param"

        return param_name, model_param

    def set_model_param(self, model_param):
        self.trees_ = list(model_param.trees_)
        self.init_score = np.array(list(model_param.init_score))
        self.history_loss = list(model_param.losses)

    def save_model(self, model_table, model_namespace):
        LOGGER.info("save model")
        meta_name, meta_protobuf = self.get_model_meta()
        param_name, param_protobuf = self.get_model_param()
        manager.save_model(buffer_type=meta_name,
                           proto_buffer=meta_protobuf,
                           name=model_table,
                           namespace=model_namespace)

        manager.save_model(buffer_type=param_name,
                           proto_buffer=param_protobuf,
                           name=model_table,
                           namespace=model_namespace)

        return [(meta_name, param_name)]

    def load_model(self, model_table, model_namespace):
        LOGGER.info("load model")
        model_meta = BoostingTreeModelMeta()
        manager.read_model(buffer_type="HeteroSecureBoostingTreeGuest.meta",
                           proto_buffer=model_meta,
                           name=model_table,
                           namespace=model_namespace)
        self.set_model_meta(model_meta)

        model_param = BoostingTreeModelParam()
        manager.read_model(buffer_type="HeteroSecureBoostingTreeGuest.param",
                           proto_buffer=model_param,
                           name=model_table,
                           namespace=model_namespace)
        self.set_model_param(model_param)

    def evaluate(self, labels, pred_prob, pred_labels, evaluate_param):
        LOGGER.info("evaluate data")
        predict_res = None

        if self.task_type == consts.CLASSIFICATION:
            if evaluate_param.classi_type == consts.BINARY:
                predict_res = pred_prob
            elif evaluate_param.classi_type == consts.MULTY:
                predict_res = pred_labels
            else:
                LOGGER.warning(
                    "unknown classification type, return None as evaluation results"
                )
        elif self.task_type == consts.REGRESSION:
            predict_res = pred_prob
        else:
            LOGGER.warning(
                "unknown task type, return None as evaluation results")

        eva = Evaluation(evaluate_param.classi_type)
        return eva.report(labels, predict_res, evaluate_param.metrics,
                          evaluate_param.thresholds, evaluate_param.pos_label)
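
# A standalone sketch (not part of the example above) of the gradient and
# hessian SigmoidBinaryCrossEntropyLoss is expected to produce in the binary
# case, assuming labels in {0, 1}:
import math

def sigmoid(f):
    return 1.0 / (1.0 + math.exp(-f))

def grad_and_hess(y, f_val):
    # grad = p - y and hess = p * (1 - p) for sigmoid cross-entropy
    p = sigmoid(f_val)
    return p - y, p * (1.0 - p)

print(grad_and_hess(1.0, 0.0))  # (-0.5, 0.25): p = 0.5 when f = 0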
Example 25
    # NOTE: the top of this example was truncated; the enclosing function and
    # loop header below are a plausible reconstruction (names are assumptions).
    def encode_split_info(split_info_list):
        for s in split_info_list:
            plain_list.append(int(s.sum_hess * 10**decimal_to_keep))
            s.sum_grad = en.encode_and_encrypt(s.sum_grad)
            s.sum_hess = en.encode_and_encrypt(s.sum_hess)

    def test_padding_num(plain_list, padding_num):
        rs_num = plain_list[0]
        for i in plain_list[1:]:
            rs_num = rs_num * padding_num + i
        return rs_num
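    # test_padding_num packs non-negative integers into one big integer using
    # base-`padding_num` positional encoding; the inverse below is a sketch
    # (not in the original) that recovers them with divmod:
    def test_unpack_num(rs_num, padding_num, count):
        values = []
        for _ in range(count):
            rs_num, remainder = divmod(rs_num, padding_num)
            values.insert(0, remainder)
        return values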

    plain_list = []
    decimal_to_keep = 7
    key_length = 1024

    en = Encrypt()
    en.generate_key(key_length)

    encoder = GuestGradHessEncoder(
        en,
        None,
    )
    compressor = HostSplitInfoCompressor(
        key_length,
        consts.ITERATIVEAFFINE,
    )
    decompressor = GuestSplitInfoDecompressor(en, )

    compressor.renew_compressor([100000], {0: 0})
    decompressor.renew_decompressor({0: 0})

    gen_split_info = random_split_info_generate(num=10)
Example 26
class HeteroBoostingGuest(HeteroBoosting, ABC):
    def __init__(self):
        super(HeteroBoostingGuest, self).__init__()

    def _init_model(self, param):
        super(HeteroBoostingGuest, self)._init_model(param)

    def generate_encrypter(self):

        LOGGER.info("generate encrypter")
        if self.encrypt_param.method.lower() == consts.PAILLIER.lower():
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("unknown encrypt type {}".format(
                self.encrypt_param.method.lower()))

    def check_label(self):

        LOGGER.info("check label")
        classes_ = []
        num_classes, booster_dim = 1, 1
        if self.task_type == consts.CLASSIFICATION:
            num_classes, classes_ = ClassifyLabelChecker.validate_label(
                self.data_bin)
            if num_classes > 2:
                booster_dim = num_classes

            range_from_zero = True
            for _class in classes_:
                try:
                    if 0 <= _class < len(classes_) and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except BaseException:
                    range_from_zero = False

            classes_ = sorted(classes_)
            if not range_from_zero:
                class_mapping = dict(zip(classes_, range(num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])

        else:
            RegressionLabelChecker.validate_label(self.data_bin)

        return classes_, num_classes, booster_dim

    def sync_booster_dim(self):
        LOGGER.info("sync booster_dim to host")

        self.transfer_variable.booster_dim.remote(self.booster_dim,
                                                  role=consts.HOST,
                                                  idx=-1)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info("sync stop flag to host, boosting_core round is {}".format(
            num_round))

        self.transfer_variable.stop_flag.remote(stop_flag,
                                                role=consts.HOST,
                                                idx=-1,
                                                suffix=(num_round, ))

    def sync_predict_round(
        self,
        predict_round,
    ):
        LOGGER.info("sync predict start round {}".format(predict_round))
        self.transfer_variable.predict_start_round.remote(
            predict_round,
            role=consts.HOST,
            idx=-1,
        )

    def prepare_warm_start(self, data_inst, classes):
        # adjust parameter for warm start
        warm_start_y_hat = self.predict(data_inst, ret_format='raw')
        self.y_hat = warm_start_y_hat
        self.start_round = len(self.boosting_model_list) // self.booster_dim
        self.boosting_round += self.start_round
        # check classes
        assert set(classes).issubset(set(self.classes_)), 'warm start label alignment failed: cur labels {}, ' \
                                                          'previous model labels {}'.format(classes, self.classes_)
        # check fid
        self.feat_name_check(data_inst, self.feature_name_fid_mapping)
        self.callback_warm_start_init_iter(self.start_round)

    def fit(self, data_inst, validate_data=None):

        LOGGER.info('begin to fit a hetero boosting model, model is {}'.format(
            self.model_name))

        self.start_round = 0

        self.on_training = True

        self.data_inst = data_inst

        self.data_bin, self.bin_split_points, self.bin_sparse_points = self.prepare_data(
            data_inst)

        self.y = self.get_label(self.data_bin)

        if not self.is_warm_start:
            self.feature_name_fid_mapping = self.gen_feature_fid_mapping(
                data_inst.schema)
            self.classes_, self.num_classes, self.booster_dim = self.check_label(
            )
            self.loss = self.get_loss_function()
            self.y_hat, self.init_score = self.get_init_score(
                self.y, self.num_classes)
        else:
            classes_, num_classes, booster_dim = self.check_label()
            self.prepare_warm_start(data_inst, classes_)

        LOGGER.info('class index is {}'.format(self.classes_))

        self.sync_booster_dim()

        self.generate_encrypter()

        self.callback_list.on_train_begin(data_inst, validate_data)

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"unit_name": "iters"}))

        self.preprocess()

        for epoch_idx in range(self.start_round, self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            self.callback_list.on_epoch_begin(epoch_idx)

            for class_idx in range(self.booster_dim):

                # fit a booster
                model = self.fit_a_learner(epoch_idx, class_idx)

                booster_meta, booster_param = model.get_model()

                if booster_meta is not None and booster_param is not None:
                    self.booster_meta = booster_meta
                    self.boosting_model_list.append(booster_param)

                # update predict score
                cur_sample_weights = model.get_sample_weights()
                self.y_hat = self.get_new_predict_score(self.y_hat,
                                                        cur_sample_weights,
                                                        dim=class_idx)

            # compute loss
            loss = self.compute_loss(self.y_hat, self.y)
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(epoch_idx, loss))
            self.callback_metric("loss", "train", [Metric(epoch_idx, loss)])

            # check validation
            validation_strategy = self.callback_list.get_validation_strategy()
            if validation_strategy:
                validation_strategy.set_precomputed_train_scores(
                    self.score_to_predict_result(data_inst, self.y_hat))

            self.callback_list.on_epoch_end(epoch_idx)

            should_stop = False
            if self.n_iter_no_change and self.check_convergence(loss):
                should_stop = True
                self.is_converged = True
            self.sync_stop_flag(self.is_converged, epoch_idx)
            if self.stop_training or should_stop:
                break

        self.postprocess()
        self.callback_list.on_train_end()
        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))
        # get summary
        self.set_summary(self.generate_summary())

    @assert_io_num_rows_equal
    def predict(self, data_inst):
        # predict is implemented in hetero_secureboost
        raise NotImplementedError('predict func is not implemented')

    @abc.abstractmethod
    def fit_a_learner(self, epoch_idx: int, booster_dim: int):
        raise NotImplementedError()

    @abc.abstractmethod
    def load_learner(self, model_meta, model_param, epoch_idx, booster_idx):
        raise NotImplementedError()

    @abc.abstractmethod
    def get_model_meta(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def get_model_param(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def set_model_meta(self, model_meta):
        raise NotImplementedError()

    @abc.abstractmethod
    def set_model_param(self, model_param):
        raise NotImplementedError()
Example 27
    def fit(self, data_instances):
        """
        Apply the binning method to data instances in the local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns. Currently, IV is supported for binary-labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)

        # self._parse_cols(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)
        if self.model_param.skip_static:
            self.transform(data_instances)
            return self.data_output

        label_counts = data_overview.get_label_count(data_instances)
        if len(label_counts) > 2:
            raise ValueError("Iv calculation support binary-data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                          label_counts=label_counts)
            self.transform(data_instances)
            self.set_summary(self.binning_obj.bin_results.summary())
            return self.data_output

        if self.model_param.encrypt_param.method == consts.PAILLIER:
            cipher = PaillierEncrypt()
            cipher.generate_key(self.model_param.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")
        # from federatedml.secureprotol.encrypt import FakeEncrypt
        # cipher = FakeEncrypt()
        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                      label_counts=label_counts)

        encrypted_bin_sum_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
        encrypted_bin_infos = self.transfer_variable.optimal_info.get(idx=-1)
        total_summary = self.binning_obj.bin_results.summary()

        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
            host_party_id = self.component_properties.host_party_idlist[host_idx]
            encrypted_bin_sum = encrypted_bin_sum_infos[host_idx]
            result_counts = self.cipher_decompress(encrypted_bin_sum, cipher)

            host_bin_methods = encrypted_bin_info['bin_method']
            category_names = encrypted_bin_info['category_names']
            if host_bin_methods == consts.OPTIMAL:
                optimal_binning_params = encrypted_bin_info['optimal_params']

                host_model_params = copy.deepcopy(self.model_param)
                host_model_params.bin_num = optimal_binning_params.get('bin_num')
                host_model_params.optimal_binning_param.metric_method = optimal_binning_params.get('metric_method')
                host_model_params.optimal_binning_param.mixture = optimal_binning_params.get('mixture')
                host_model_params.optimal_binning_param.max_bin_pct = optimal_binning_params.get('max_bin_pct')
                host_model_params.optimal_binning_param.min_bin_pct = optimal_binning_params.get('min_bin_pct')

                self.binning_obj.event_total, self.binning_obj.non_event_total = self.get_histogram(data_instances)
                result_counts = dict(result_counts.collect())
                optimal_binning_cols = {x: y for x, y in result_counts.items() if x not in category_names}
                host_binning_obj = self.optimal_binning_sync(optimal_binning_cols, data_instances.count(),
                                                             data_instances.partitions,
                                                             host_idx, host_model_params)
                category_bins = {x: y for x, y in result_counts.items() if x in category_names}
                host_binning_obj.cal_iv_woe(category_bins, self.model_param.adjustment_factor)
            else:
                host_binning_obj = BaseBinning()
                host_binning_obj.cal_iv_woe(result_counts, self.model_param.adjustment_factor)
            host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
            total_summary = self._merge_summary(total_summary,
                                                host_binning_obj.bin_results.summary())
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        total_summary['test'] = 'test'
        self.set_summary(total_summary)
        return self.data_output
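
# A minimal sketch (standard WOE/IV definitions; FATE's exact sign convention
# and adjustment handling may differ) of what cal_iv_woe derives per feature
# from per-bin (event, non_event) counts:
import math

def iv_woe(bin_counts, adjustment_factor=0.5):
    event_total = sum(e for e, _ in bin_counts)
    non_event_total = sum(n for _, n in bin_counts)
    iv, woe_list = 0.0, []
    for event, non_event in bin_counts:
        # adjustment_factor guards empty bins against log(0)
        event_rate = max(event, adjustment_factor) / event_total
        non_event_rate = max(non_event, adjustment_factor) / non_event_total
        woe = math.log(event_rate / non_event_rate)
        woe_list.append(woe)
        iv += (event_rate - non_event_rate) * woe
    return woe_list, iv

print(iv_woe([(30, 70), (60, 40)]))  # a larger IV means stronger predictive power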
Example 28
class HeteroSSHEBase(BaseLinearModel, ABC):
    def __init__(self):
        super().__init__()
        self.mode = consts.HETERO
        self.cipher = None
        self.q_field = None
        self.model_param = None
        # self.labels = None
        self.weight = None
        self.batch_generator = None
        self.batch_num = []
        self.secure_matrix_obj: SecureMatrix
        # self._set_parties()
        self.parties = None
        self.local_party = None
        self.other_party = None
        self.label_type = None

    def _transfer_q_field(self):
        raise NotImplementedError(f"Should not be called here")

    def _init_model(self, params):
        super()._init_model(params)
        self.cipher = PaillierEncrypt()
        self.cipher.generate_key(self.model_param.encrypt_param.key_length)
        self.transfer_variable = SSHEModelTransferVariable()

        self.converge_func_name = params.early_stop
        self.reveal_every_iter = params.reveal_every_iter

        self.q_field = self._transfer_q_field()

        LOGGER.debug(f"q_field: {self.q_field}")

        if not self.reveal_every_iter:
            self.self_optimizer = copy.deepcopy(self.optimizer)
            self.remote_optimizer = copy.deepcopy(self.optimizer)

        self.fixedpoint_encoder = FixedPointEndec(n=self.q_field)
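        # the fixed-point encoder above maps floats into the integer field Z_q so
        # they can be secret-shared; q_field was agreed via _transfer_q_field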
        self.converge_transfer_variable = ConvergeCheckerTransferVariable()
        self.secure_matrix_obj = SecureMatrix(party=self.local_party,
                                              q_field=self.q_field,
                                              other_party=self.other_party)

    def _init_weights(self, model_shape):
        return self.initializer.init_model(model_shape, init_params=self.init_param_obj)

    @property
    def is_respectively_reveal(self):
        return self.model_param.reveal_strategy == "respectively"

    def _cal_z_in_share(self, w_self, w_remote, features, suffix, cipher):
        raise NotImplementedError("Should not be called here")

    def share_model(self, w, suffix):
        raise NotImplementedError("Should not be called here")

    def forward(self, weights, features, labels, suffix, cipher, batch_weight):
        raise NotImplementedError("Should not be called here")

    def backward(self, error, features, suffix, cipher):
        raise NotImplementedError("Should not be called here")

    def compute_loss(self, weights, labels, suffix, cipher):
        raise NotImplementedError("Should not be called here")

    def reveal_models(self, w_self, w_remote, suffix=None):
        raise NotImplementedError(f"Should not be called here")

    def check_converge_by_loss(self, loss, suffix):
        raise NotImplementedError(f"Should not be called here")

    def check_converge_by_weights(self, last_w, new_w, suffix):
        if self.reveal_every_iter:
            return self._reveal_every_iter_weights_check(last_w, new_w, suffix)
        else:
            return self._not_reveal_every_iter_weights_check(last_w, new_w, suffix)

    def _reveal_every_iter_weights_check(self, last_w, new_w, suffix):
        raise NotImplementedError("Should not be called here")

    def _not_reveal_every_iter_weights_check(self, last_w, new_w, suffix):
        last_w_self, last_w_remote = last_w
        w_self, w_remote = new_w
        grad_self = w_self - last_w_self
        grad_remote = w_remote - last_w_remote
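        # grad_self / grad_remote are this party's additive shares of the weight
        # update; the secure dot product below reveals only its squared L2 norm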

        if self.role == consts.GUEST:
            grad_encode = np.hstack((grad_remote.value, grad_self.value))
        else:
            grad_encode = np.hstack((grad_self.value, grad_remote.value))

        grad_encode = np.array([grad_encode])

        grad_tensor_name = ".".join(("check_converge_grad",) + suffix)
        grad_tensor = fixedpoint_numpy.FixedPointTensor(value=grad_encode,
                                                        q_field=self.fixedpoint_encoder.n,
                                                        endec=self.fixedpoint_encoder,
                                                        tensor_name=grad_tensor_name)

        grad_tensor_transpose_name = ".".join(("check_converge_grad_transpose",) + suffix)
        grad_tensor_transpose = fixedpoint_numpy.FixedPointTensor(value=grad_encode.T,
                                                                  q_field=self.fixedpoint_encoder.n,
                                                                  endec=self.fixedpoint_encoder,
                                                                  tensor_name=grad_tensor_transpose_name)

        grad_norm_tensor_name = ".".join(("check_converge_grad_norm",) + suffix)

        grad_norm = grad_tensor.dot(grad_tensor_transpose, target_name=grad_norm_tensor_name).get()

        weight_diff = np.sqrt(grad_norm[0][0])
        LOGGER.info("iter: {}, weight_diff:{}, is_converged: {}".format(self.n_iter_,
                                                                        weight_diff, self.is_converged))
        is_converge = False
        if weight_diff < self.model_param.tol:
            is_converge = True
        return is_converge

    def get_single_model_weight_dict(self, model_weights=None, header=None):
        header = header if header else self.header
        model_weights = model_weights if model_weights else self.model_weights
        weight_dict = {}
        for idx, header_name in enumerate(header):
            coef_i = model_weights.coef_[idx]
            weight_dict[header_name] = coef_i

        return weight_dict

    def get_single_model_param(self, model_weights=None, header=None):
        header = header if header else self.header
        result = {'iters': self.n_iter_,
                  'loss_history': self.loss_history,
                  'is_converged': self.is_converged,
                  'intercept': self.model_weights.intercept_,
                  'header': header,
                  'best_iteration': -1 if self.validation_strategy is None else
                  self.validation_strategy.best_iteration
                  }
        return result

    def load_model(self, model_dict):
        LOGGER.debug("Start Loading model")
        result_obj = list(model_dict.get('model').values())[0].get(self.model_param_name)
        meta_obj = list(model_dict.get('model').values())[0].get(self.model_meta_name)

        if self.init_param_obj is None:
            self.init_param_obj = InitParam()
        self.init_param_obj.fit_intercept = meta_obj.fit_intercept
        self.model_param.reveal_strategy = meta_obj.reveal_strategy
        LOGGER.debug(f"reveal_strategy: {self.model_param.reveal_strategy}, {self.is_respectively_reveal}")
        self.header = list(result_obj.header)
        return result_obj, meta_obj

    def load_single_model(self, single_model_obj):
        raise NotImplementedError(f"should not be called here")

    def load_single_model_weight(self, single_model_obj):
        feature_shape = len(self.header)
        tmp_vars = np.zeros(feature_shape)
        weight_dict = dict(single_model_obj.weight)

        for idx, header_name in enumerate(self.header):
            tmp_vars[idx] = weight_dict.get(header_name)

        if self.fit_intercept:
            tmp_vars = np.append(tmp_vars, single_model_obj.intercept)
        self.model_weights = LinearModelWeights(tmp_vars, fit_intercept=self.fit_intercept)

    def fit_single_model(self, data_instances, validate_data=None):
        LOGGER.info(f"Start to train single {self.model_name}")
        if len(self.component_properties.host_party_idlist) > 1:
            raise ValueError(f"Hetero SSHE Model does not support multi-host training.")
        self.callback_list.on_train_begin(data_instances, validate_data)

        model_shape = self.get_features_shape(data_instances)
        instances_count = data_instances.count()

        if not self.component_properties.is_warm_start:
            w = self._init_weights(model_shape)
            self.model_weights = LinearModelWeights(l=w,
                                                    fit_intercept=self.model_param.init_param.fit_intercept)
            last_models = copy.deepcopy(self.model_weights)
        else:
            last_models = copy.deepcopy(self.model_weights)
            w = last_models.unboxed
            self.callback_warm_start_init_iter(self.n_iter_)

        if self.role == consts.GUEST:
            if with_weight(data_instances):
                LOGGER.info(f"data with sample weight, use sample weight.")
                if self.model_param.early_stop == "diff":
                    LOGGER.warning("input data with weight, please use 'weight_diff' for 'early_stop'.")
                data_instances = scale_sample_weight(data_instances)
        self.batch_generator.initialize_batch_generator(data_instances, batch_size=self.batch_size)

        with SPDZ(
            "hetero_sshe",
            local_party=self.local_party,
            all_parties=self.parties,
            q_field=self.q_field,
            use_mix_rand=self.model_param.use_mix_rand,
        ) as spdz:
            spdz.set_flowid(self.flowid)
            self.secure_matrix_obj.set_flowid(self.flowid)
            # not sharing the model when reveal_every_iter
            if not self.reveal_every_iter:
                w_self, w_remote = self.share_model(w, suffix="init")
                last_w_self, last_w_remote = w_self, w_remote
                LOGGER.debug(f"first_w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}")
            batch_data_generator = self.batch_generator.generate_batch_data()

            encoded_batch_data = []
            batch_labels_list = []
            batch_weight_list = []

            for batch_data in batch_data_generator:
                if self.fit_intercept:
                    batch_features = batch_data.mapValues(lambda x: np.hstack((x.features, 1.0)))
                else:
                    batch_features = batch_data.mapValues(lambda x: x.features)
                if self.role == consts.GUEST:
                    batch_labels = batch_data.mapValues(lambda x: np.array([x.label], dtype=self.label_type))
                    batch_labels_list.append(batch_labels)
                    if self.weight:
                        batch_weight = batch_data.mapValues(lambda x: np.array([x.weight], dtype=float))
                        batch_weight_list.append(batch_weight)
                    else:
                        batch_weight_list.append(None)

                self.batch_num.append(batch_data.count())

                encoded_batch_data.append(
                    fixedpoint_table.FixedPointTensor(self.fixedpoint_encoder.encode(batch_features),
                                                      q_field=self.fixedpoint_encoder.n,
                                                      endec=self.fixedpoint_encoder))

            while self.n_iter_ < self.max_iter:
                self.callback_list.on_epoch_begin(self.n_iter_)
                LOGGER.info(f"start to n_iter: {self.n_iter_}")

                loss_list = []

                self.optimizer.set_iters(self.n_iter_)
                if not self.reveal_every_iter:
                    self.self_optimizer.set_iters(self.n_iter_)
                    self.remote_optimizer.set_iters(self.n_iter_)

                for batch_idx, batch_data in enumerate(encoded_batch_data):
                    current_suffix = (str(self.n_iter_), str(batch_idx))
                    if self.role == consts.GUEST:
                        batch_labels = batch_labels_list[batch_idx]
                        batch_weight = batch_weight_list[batch_idx]
                    else:
                        batch_labels = None
                        batch_weight = None

                    if self.reveal_every_iter:
                        y = self.forward(weights=self.model_weights,
                                         features=batch_data,
                                         labels=batch_labels,
                                         suffix=current_suffix,
                                         cipher=self.cipher,
                                         batch_weight=batch_weight)
                    else:
                        y = self.forward(weights=(w_self, w_remote),
                                         features=batch_data,
                                         labels=batch_labels,
                                         suffix=current_suffix,
                                         cipher=self.cipher,
                                         batch_weight=batch_weight)

                    if self.role == consts.GUEST:
                        if self.weight:
                            error = y - batch_labels.join(batch_weight, lambda y, b: y * b)
                        else:
                            error = y - batch_labels

                        self_g, remote_g = self.backward(error=error,
                                                         features=batch_data,
                                                         suffix=current_suffix,
                                                         cipher=self.cipher)
                    else:
                        self_g, remote_g = self.backward(error=y,
                                                         features=batch_data,
                                                         suffix=current_suffix,
                                                         cipher=self.cipher)

                    # loss computing;
                    suffix = ("loss",) + current_suffix
                    if self.reveal_every_iter:
                        batch_loss = self.compute_loss(weights=self.model_weights,
                                                       labels=batch_labels,
                                                       suffix=suffix,
                                                       cipher=self.cipher)
                    else:
                        batch_loss = self.compute_loss(weights=(w_self, w_remote),
                                                       labels=batch_labels,
                                                       suffix=suffix,
                                                       cipher=self.cipher)

                    if batch_loss is not None:
                        batch_loss = batch_loss * self.batch_num[batch_idx]
                    loss_list.append(batch_loss)

                    if self.reveal_every_iter:
                        # LOGGER.debug(f"before reveal: self_g shape: {self_g.shape}, remote_g_shape: {remote_g},"
                        #              f"self_g: {self_g}")

                        new_g = self.reveal_models(self_g, remote_g, suffix=current_suffix)

                        # LOGGER.debug(f"after reveal: new_g shape: {new_g.shape}, new_g: {new_g}"
                        #              f"self.model_param.reveal_strategy: {self.model_param.reveal_strategy}")

                        if new_g is not None:
                            self.model_weights = self.optimizer.update_model(self.model_weights, new_g,
                                                                             has_applied=False)

                        else:
                            self.model_weights = LinearModelWeights(
                                l=np.zeros(self_g.shape),
                                fit_intercept=self.model_param.init_param.fit_intercept)
                    else:
                        if self.optimizer.penalty == consts.L2_PENALTY:
                            self_g = self_g + self.self_optimizer.alpha * w_self
                            remote_g = remote_g + self.remote_optimizer.alpha * w_remote

                        # LOGGER.debug(f"before optimizer: {self_g}, {remote_g}")

                        self_g = self.self_optimizer.apply_gradients(self_g)
                        remote_g = self.remote_optimizer.apply_gradients(remote_g)

                        # LOGGER.debug(f"after optimizer: {self_g}, {remote_g}")
                        w_self -= self_g
                        w_remote -= remote_g

                        LOGGER.debug(f"w_self shape: {w_self.shape}, w_remote_shape: {w_remote.shape}")

                if self.role == consts.GUEST:
                    loss = np.sum(loss_list) / instances_count
                    self.loss_history.append(loss)
                    if self.need_call_back_loss:
                        self.callback_loss(self.n_iter_, loss)
                else:
                    loss = None

                if self.converge_func_name in ["diff", "abs"]:
                    self.is_converged = self.check_converge_by_loss(loss, suffix=(str(self.n_iter_),))
                elif self.converge_func_name == "weight_diff":
                    if self.reveal_every_iter:
                        self.is_converged = self.check_converge_by_weights(
                            last_w=last_models.unboxed,
                            new_w=self.model_weights.unboxed,
                            suffix=(str(self.n_iter_),))
                        last_models = copy.deepcopy(self.model_weights)
                    else:
                        self.is_converged = self.check_converge_by_weights(
                            last_w=(last_w_self, last_w_remote),
                            new_w=(w_self, w_remote),
                            suffix=(str(self.n_iter_),))
                        last_w_self, last_w_remote = copy.deepcopy(w_self), copy.deepcopy(w_remote)
                else:
                    raise ValueError(f"Cannot recognize early_stop function: {self.converge_func_name}")

                LOGGER.info("iter: {},  is_converged: {}".format(self.n_iter_, self.is_converged))
                self.callback_list.on_epoch_end(self.n_iter_)
                self.n_iter_ += 1

                if self.stop_training:
                    break

                if self.is_converged:
                    break

            # Finally reconstruct
            if not self.reveal_every_iter:
                new_w = self.reveal_models(w_self, w_remote, suffix=("final",))
                if new_w is not None:
                    self.model_weights = LinearModelWeights(
                        l=new_w,
                        fit_intercept=self.model_param.init_param.fit_intercept)

        LOGGER.debug(f"loss_history: {self.loss_history}")
        self.set_summary(self.get_model_summary())

    def get_model_summary(self):
        summary = super().get_model_summary()

        if not self.is_respectively_reveal:
            del summary["intercept"]
            del summary["coef"]

        return summary
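
# A plaintext sketch (illustration only) of the "weight_diff" convergence
# criterion: the secure dot product in _not_reveal_every_iter_weights_check
# computes ||w_new - w_old||_2, which is then compared against `tol`.
import numpy as np

def weight_diff_converged(last_w, new_w, tol):
    diff = np.asarray(new_w) - np.asarray(last_w)
    return bool(np.sqrt(diff.dot(diff)) < tol)

print(weight_diff_converged([0.0, 0.0], [3e-5, 4e-5], tol=1e-4))  # True: diff norm is 5e-5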
Example 29
class TestHeteroLogisticGradient(unittest.TestCase):
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.wx = session.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = session.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = session.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
             for i in range(size)],
            partition=1)

        # test fore_gradient
        self.fore_gradient_local = [
            -0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75
        ]
        # test gradient
        self.gradient = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125
        ]
        self.gradient_fit_intercept = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125, 1.125
        ]

        self.loss = 4.505647

    def test_compute_fore_gradient(self):
        # fore_gradient = self.hetero_lr_gradient.compute_and_aggregate_forwards(self.data_inst, self.wx)
        model_weights = LinearModelWeights(l=self.w, fit_intercept=False)

        class EncryptedCalculator(object):
            encrypter = self.paillier_encrypt

            def encrypt_row(self, row):
                return np.array([self.encrypter.encrypt(row)])

            def encrypt(self, input_data):
                return input_data.mapValues(self.encrypt_row)

        encrypted_calculator = [EncryptedCalculator()]
        batch_index = 0
        fore_gradient = self.hetero_lr_gradient.compute_and_aggregate_forwards(
            self.data_inst, model_weights, encrypted_calculator, batch_index)

        fore_gradient_local = [
            self.paillier_encrypt.decrypt(iterator[1])
            for iterator in fore_gradient.collect()
        ]

        self.assertListEqual(fore_gradient_local, self.fore_gradient_local)

    def test_compute_gradient(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(
            self.data_inst, self.wx)

        gradient = self.hetero_lr_gradient.compute_gradient(
            self.data_inst, fore_gradient, fit_intercept=False)
        de_gradient = [
            self.paillier_encrypt.decrypt(iterator) for iterator in gradient
        ]
        self.assertListEqual(de_gradient, self.gradient)

        gradient = self.hetero_lr_gradient.compute_gradient(self.data_inst,
                                                            fore_gradient,
                                                            fit_intercept=True)
        de_gradient = [
            self.paillier_encrypt.decrypt(iterator) for iterator in gradient
        ]
        self.assertListEqual(de_gradient, self.gradient_fit_intercept)

    def test_compute_gradient_and_loss(self):
        fore_gradient = self.hetero_lr_gradient.compute_fore_gradient(
            self.data_inst, self.wx)
        gradient, loss = self.hetero_lr_gradient.compute_gradient_and_loss(
            self.data_inst, fore_gradient, self.wx, self.en_sum_wx_square,
            False)
        de_gradient = [self.paillier_encrypt.decrypt(i) for i in gradient]
        self.assertListEqual(de_gradient, self.gradient)

        diff_loss = np.abs(self.loss - self.paillier_encrypt.decrypt(loss))
        self.assertLess(diff_loss, 1e-5)
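
# The expected fore_gradient values in the test above are consistent with the
# Taylor-approximated hetero-LR forward: fore_gradient = 0.25 * wx - 0.5 * y,
# with labels y in {-1, +1}. A plaintext check (illustration only):
wx = list(range(10))
labels = [pow(-1, i % 2) for i in range(10)]
fore_gradient = [0.25 * x - 0.5 * y for x, y in zip(wx, labels)]
print(fore_gradient)  # [-0.5, 0.75, 0.0, 1.25, 0.5, 1.75, 1.0, 2.25, 1.5, 2.75]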
Example 30
    def fit(self, data_instances):
        """
        Apply the binning method to data instances in the local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns. Currently, IV is supported for binary-labeled data only.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)

        # self._parse_cols(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)
        LOGGER.debug("After fit, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        is_binary_data = data_overview.is_binary_labels(data_instances)
        if not is_binary_data:
            # LOGGER.warning("Iv calculation support binary-data only in this version.")
            raise ValueError(
                "IV calculation supports binary data only in this version.")
            # return data_instances

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances,
                                          label_table=label_table)
            self.transform(data_instances)
            return self.data_output

        cipher = PaillierEncrypt()
        cipher.generate_key()

        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances, label_table=label_table)

        encrypted_bin_sums = self.transfer_variable.encrypted_bin_sum.get(
            idx=-1)

        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_sum in enumerate(encrypted_bin_sums):
            host_party_id = self.component_properties.host_party_idlist[
                host_idx]
            host_binning_obj = HostBaseBinning()
            host_binning_obj.set_role_party(role=consts.HOST,
                                            party_id=host_party_id)
            result_counts = self.__decrypt_bin_sum(encrypted_bin_sum, cipher)
            host_binning_obj.cal_iv_woe(result_counts,
                                        self.model_param.adjustment_factor)
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output
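
# Why hosts can aggregate labels they never see: Paillier ciphertexts are
# additively homomorphic. A minimal sketch using the PaillierEncrypt API shown
# above (ciphertext '+' is assumed to behave like FATE's PaillierEncryptedNumber):
from federatedml.secureprotol.encrypt import PaillierEncrypt

cipher = PaillierEncrypt()
cipher.generate_key(1024)

encrypted_labels = [cipher.encrypt(y) for y in [1, 0, 1, 1]]
encrypted_bin_sum = encrypted_labels[0]
for enc_y in encrypted_labels[1:]:
    encrypted_bin_sum = encrypted_bin_sum + enc_y  # homomorphic addition

print(cipher.decrypt(encrypted_bin_sum))  # 3: the bin's event count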