def __synchronize_encryption(self, mode='train'):
    """
    Create a Paillier key pair and ship both halves to the guest and the hosts.

    A new key pair is generated with ``self.key_length``. The public key and
    then the private key are sent to the guest (idx=0), followed by the public
    key and the private key for the host role (idx=-1). Every transfer carries
    ``(mode, )`` as its suffix.

    :param mode: tag used as the transfer suffix, 'train' by default.
    """
    tag = (mode, )
    transfer = self.transfer_variable

    cipher = PaillierEncrypt()
    cipher.generate_key(self.key_length)

    # Guest first: public key, then private key.
    public_key = cipher.get_public_key()
    transfer.paillier_pubkey.remote(obj=public_key,
                                    role=consts.GUEST,
                                    idx=0,
                                    suffix=tag)
    LOGGER.info("send pubkey to guest")

    private_key = cipher.get_privacy_key()
    transfer.paillier_prikey.remote(obj=private_key,
                                    role=consts.GUEST,
                                    idx=0,
                                    suffix=tag)
    LOGGER.info("send prikey to guest")

    # Same pair for the host role.
    # NOTE(review): idx=-1 presumably addresses every host - confirm against
    # the transfer-variable API.
    transfer.paillier_pubkey.remote(obj=public_key,
                                    role=consts.HOST,
                                    idx=-1,
                                    suffix=tag)
    LOGGER.info("send pubkey to host")

    transfer.paillier_prikey.remote(obj=private_key,
                                    role=consts.HOST,
                                    idx=-1,
                                    suffix=tag)
    LOGGER.info("send prikey to host")
def keygen(self, key_length, suffix=tuple()) -> dict:
    """
    Build one Paillier cipher per client party that asked for encryption.

    The per-party encryption flags are fetched through
    ``self._use_encrypt.get_parties``. For every party whose flag is truthy a
    fresh key pair is generated and its public key is sent to that party only;
    parties with a falsy flag are mapped to ``None``.

    :param key_length: value forwarded to ``PaillierEncrypt.generate_key``.
    :param suffix: transfer suffix used for both the flag lookup and the
        public-key transfer.
    :return: dict mapping each client party to its cipher, or to ``None``
        when the party does not use encryption.
    """
    parties = self._client_parties
    flags = self._use_encrypt.get_parties(parties=parties, suffix=suffix)

    ciphers = {}
    for party, wants_cipher in zip(parties, flags):
        if not wants_cipher:
            ciphers[party] = None
            continue

        cipher = PaillierEncrypt()
        cipher.generate_key(key_length)
        self._pailler_pubkey.remote_parties(obj=cipher.get_public_key(),
                                            parties=[party],
                                            suffix=suffix)
        ciphers[party] = cipher
    return ciphers
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    """
    Guest-side feature binning.

    The guest owns the labels: it encrypts them with its own Paillier key,
    sends them to the host, and decrypts the per-bin sums the host returns.
    """

    def __init__(self):
        super().__init__()
        # Key pair is created eagerly; only the public half is ever sent out.
        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.local_transform_result = None
        self.party_name = consts.GUEST

    def fit(self, data_instances):
        """
        Fit split points locally, then compute binning metrics for the local
        features and for the host's features.

        When the labels are not binary, a warning is logged and the input is
        returned right after the split points have been fitted.

        :param data_instances: table of instances carrying a ``label``.
        :return: ``data_instances`` unchanged for non-binary labels,
            otherwise ``self.data_output`` after ``self.transform`` has run.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        self.binning_obj.fit_split_points(data_instances)
        LOGGER.debug("After fit, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        if not data_overview.is_binary_labels(data_instances):
            LOGGER.warning("Iv is not supported for Multiple-label data.")
            return data_instances

        # Step 1: let the host know our public key.
        self.__synchronize_encryption()

        # Step 2: normalise labels to {0, 1} and pull them into their own table.
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda inst: inst.label)

        # Step 3: ship the encrypted (label, 1 - label) pairs to the host.
        encrypt_pair = functools.partial(self.encrypt, encryptor=self.encryptor)
        self.transfer_variable.encrypted_label.remote(
            label_table.mapValues(encrypt_pair), role=consts.HOST, idx=0)
        LOGGER.info("Sent encrypted_label_table to host")

        # Step 4: do the local binning while the host works on its side.
        data_instances = self.fit_local(data_instances, label_table)

        # Step 5: collect the host's encrypted sums and turn them into IV/WOE.
        host_bin_sum = self.transfer_variable.encrypted_bin_sum.get(idx=0)
        LOGGER.info("Get encrypted_bin_sum from host")
        decrypted_counts = self.__decrypt_bin_sum(host_bin_sum)
        # Only a single host is handled; results are stored under consts.HOST.
        self.host_results[consts.HOST] = self.binning_obj.cal_iv_woe(
            decrypted_counts, self.model_param.adjustment_factor)

        self.set_schema(data_instances)
        LOGGER.debug("Before transform, binning_obj split_points: {}".format(
            self.binning_obj.split_points))
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output

    @staticmethod
    def encrypt(x, encryptor):
        """Return the pair (Enc(x), Enc(1 - x)) produced by ``encryptor``."""
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, label_table=None):
        """
        Recompute the local IV using the split points stored in
        ``self.binning_result`` and keep the outcome in
        ``self.local_transform_result``.

        :return: ``data_instances`` after its schema has been set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        known_split_points = {
            name: attr.split_points
            for name, attr in self.binning_result.items()
        }
        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances,
            split_points=known_split_points,
            label_table=label_table)

        for name, attr in self.local_transform_result.items():
            LOGGER.info("The local feature {} 's iv is {}".format(name, attr.iv))

        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        """Send the public key to host 0 and mark the exchange as done."""
        self.transfer_variable.paillier_pubkey.remote(
            self.encryptor.get_public_key(), role=consts.HOST, idx=0)
        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        """
        Decrypt every (event, non_event) pair of every column.

        The mapping is updated in place and the same object is returned.
        """
        decrypt = self.encryptor.decrypt
        for name, cipher_pairs in encrypted_bin_sum.items():
            encrypted_bin_sum[name] = [
                (decrypt(enc_event), decrypt(enc_non_event))
                for enc_event, enc_non_event in cipher_pairs
            ]
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        """
        Compute the local IV attributes and store them in
        ``self.binning_result``.

        :return: ``data_instances`` after its schema has been set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)
        self.binning_result = self.binning_obj.cal_local_iv(
            data_instances, label_table=label_table)
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        """
        Collapse the label to binary: anything other than 1 becomes 0.

        The instance is modified in place and returned.
        """
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
class HomoLRHost(HomoLRBase):
    """
    Host party of the homogeneous training flow.

    Besides the logistic-regression state set up before the key exchange,
    ``fit`` trains a Keras model on a shard of Fashion-MNIST: every batch's
    loss and gradients are Paillier-encrypted, sent to the arbiter, and the
    aggregated values that come back are decrypted and applied locally.
    """

    def __init__(self):
        super(HomoLRHost, self).__init__()
        self.gradient_operator = None
        self.loss_history = []
        self.is_converged = False
        self.role = consts.HOST
        self.aggregator = aggregator.Host()
        self.model_weights = None
        self.cipher = paillier_cipher.Host()
        # Holds the key pair received in __synchronize_encryption.
        self.zcl_encrypt_operator = PaillierEncrypt()

    def _init_model(self, params):
        """Select the gradient operator according to the encryption method."""
        super()._init_model(params)
        self.cipher.register_paillier_cipher(self.transfer_variable)
        if params.encrypt_param.method in [consts.PAILLIER]:
            self.use_encrypt = True
            self.gradient_operator = TaylorLogisticGradient()
            self.re_encrypt_batches = params.re_encrypt_batches
        else:
            self.use_encrypt = False
            self.gradient_operator = LogisticGradient()

    def fit(self, data_instances, validate_data=None):
        """
        Run the federated training loop.

        :param data_instances: input table; used for schema setup, weight
            initialisation and the batch count. The training batches
            themselves come from ``self.zcl_dataset``.
        :param validate_data: forwarded to ``init_validation_strategy``.
        :return: None. Returns early, without setting ``self.n_iter_`` for the
            current epoch, once ``self.zcl_early_stop_batch`` batches of an
            epoch have been processed.
        """
        LOGGER.debug("Start data count: {}".format(data_instances.count()))

        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)
        # The returned strategy is not used in this flow; the call is kept
        # in case it has side effects.
        self.init_validation_strategy(data_instances, validate_data)

        pubkey = self.cipher.gen_paillier_pubkey(enable=self.use_encrypt, suffix=('fit',))
        if self.use_encrypt:
            self.cipher_operator.set_public_key(pubkey)

        self.model_weights = self._init_model_variables(data_instances)
        w = self.cipher_operator.encrypt_list(self.model_weights.unboxed)
        self.model_weights = LogisticRegressionWeights(w, self.model_weights.fit_intercept)
        LOGGER.debug("After init, model_weights: {}".format(self.model_weights.unboxed))

        mini_batch_obj = MiniBatch(data_inst=data_instances, batch_size=self.batch_size)
        total_batch_num = mini_batch_obj.batch_nums

        if self.use_encrypt:
            re_encrypt_times = total_batch_num // self.re_encrypt_batches + 1
            LOGGER.debug("re_encrypt_times is :{}, batch_size: {}, total_batch_num: {}, re_encrypt_batches: {}".format(
                re_encrypt_times, self.batch_size, total_batch_num, self.re_encrypt_batches))
            self.cipher.set_re_cipher_time(re_encrypt_times)

        total_data_num = data_instances.count()
        LOGGER.debug("Current data count: {}".format(total_data_num))

        self.__synchronize_encryption()

        self.zcl_idx, self.zcl_num_party = self.transfer_variable.num_party.get(idx=0, suffix=('train',))
        LOGGER.debug("party num:" + str(self.zcl_num_party))

        self.__init_model()

        self.train_loss_results = []
        self.train_accuracy_results = []
        self.test_loss_results = []
        self.test_accuracy_results = []

        for iter_num in range(self.max_iter):
            # mini-batch
            LOGGER.debug("In iter: {}".format(iter_num))
            batch_num = 0

            for train_x, train_y in self.zcl_dataset:
                LOGGER.info("Staring batch {}".format(batch_num))
                start_t = time.time()
                loss_value, grads = self.__grad(self.zcl_model, train_x, train_y)
                loss_value = loss_value.numpy()
                grads = [x.numpy() for x in grads]

                LOGGER.info("Start encrypting")
                # Fetch the key once per batch instead of once per tensor.
                public_key = self.zcl_encrypt_operator.get_public_key()
                loss_value = batch_encryption.encrypt(public_key, loss_value)
                grads = [batch_encryption.encrypt_matrix(public_key, x) for x in grads]
                LOGGER.info("Finish encrypting")
                grads = Gradients(grads)

                batch_suffix = (iter_num, batch_num)
                self.transfer_variable.host_grad.remote(obj=grads.for_remote(), role=consts.ARBITER, idx=0,
                                                        suffix=batch_suffix)
                LOGGER.info("Sent grads")
                self.transfer_variable.host_loss.remote(obj=loss_value, role=consts.ARBITER, idx=0,
                                                        suffix=batch_suffix)
                LOGGER.info("Sent loss")

                sum_grads = self.transfer_variable.aggregated_grad.get(idx=0, suffix=batch_suffix)
                LOGGER.info("Got grads")
                sum_loss = self.transfer_variable.aggregated_loss.get(idx=0, suffix=batch_suffix)
                LOGGER.info("Got loss")

                private_key = self.zcl_encrypt_operator.get_privacy_key()
                sum_loss = batch_encryption.decrypt(private_key, sum_loss)
                sum_grads = [
                    batch_encryption.decrypt_matrix(private_key, x).astype(np.float32)
                    for x in sum_grads.unboxed]
                LOGGER.info("Finish decrypting")

                # NOTE(review): the aggregated gradients are applied as
                # received; they are not divided by self.zcl_num_party here.
                self.zcl_optimizer.apply_gradients(zip(sum_grads, self.zcl_model.trainable_variables),
                                                   self.zcl_global_step)

                elapsed_time = time.time() - start_t

                self.train_loss_results.append(sum_loss)
                train_accuracy_v = accuracy_score(train_y,
                                                  tf.argmax(self.zcl_model(train_x), axis=1, output_type=tf.int32))
                self.train_accuracy_results.append(train_accuracy_v)

                # The whole test set is evaluated after every batch.
                test_loss_v = self.__loss(self.zcl_model, self.zcl_x_test, self.zcl_y_test)
                self.test_loss_results.append(test_loss_v)
                test_accuracy_v = accuracy_score(self.zcl_y_test,
                                                 tf.argmax(self.zcl_model(self.zcl_x_test), axis=1,
                                                           output_type=tf.int32))
                self.test_accuracy_results.append(test_accuracy_v)

                LOGGER.info(
                    "Epoch {:03d}, iteration {:03d}: train_loss: {:.3f}, train_accuracy: {:.3%}, test_loss: {:.3f}, "
                    "test_accuracy: {:.3%}, elapsed_time: {:.4f}".format(
                        iter_num, batch_num, sum_loss, train_accuracy_v, test_loss_v, test_accuracy_v, elapsed_time)
                )

                batch_num += 1

                if batch_num >= self.zcl_early_stop_batch:
                    return

            # NOTE(review): placement inferred from a whitespace-collapsed
            # source; assumed to run once per completed epoch - confirm.
            self.n_iter_ = iter_num

    def __synchronize_encryption(self, mode='train'):
        """
        Receive the shared Paillier key pair and install it locally.

        Both the public and the private key are fetched from party idx=0 of
        the respective transfer variables (presumably the arbiter - confirm
        against the transfer-variable definition), using ``(mode,)`` as
        suffix.
        """
        pub_key = self.transfer_variable.paillier_pubkey.get(idx=0, suffix=(mode,))
        LOGGER.debug("Received pubkey")
        self.zcl_encrypt_operator.set_public_key(pub_key)

        pri_key = self.transfer_variable.paillier_prikey.get(idx=0, suffix=(mode,))
        LOGGER.debug("Received prikey")
        self.zcl_encrypt_operator.set_privacy_key(pri_key)

    def __init_model(self):
        """
        Load the Keras model and prepare this party's data shard.

        The architecture is read from ``MODEL_JSON_DIR`` and the weights from
        ``MODEL_WEIGHT_DIR``. Fashion-MNIST pixels are scaled by 1/255, the
        training set is cut into ``self.zcl_num_party`` consecutive pieces
        and piece ``self.zcl_idx`` becomes the shuffled, batched training
        dataset. The full test set is kept for evaluation.
        """
        # Context manager so the handle is closed even if read() raises.
        with open(MODEL_JSON_DIR, 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights(MODEL_WEIGHT_DIR)
        self.zcl_model = loaded_model
        LOGGER.info("Initialed model")

        # The data, split between train and test sets:
        (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255.0
        x_test /= 255.0
        y_train = y_train.squeeze().astype(np.int32)
        y_test = y_test.squeeze().astype(np.int32)

        # Cut points at multiples of the average shard length; any remainder
        # ends up in the last shard.
        avg_length = int(len(x_train) / self.zcl_num_party)
        split_idx = [_ * avg_length for _ in range(1, self.zcl_num_party)]
        x_train = np.split(x_train, split_idx)[self.zcl_idx]
        y_train = np.split(y_train, split_idx)[self.zcl_idx]

        train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        BATCH_SIZE = 128
        SHUFFLE_BUFFER_SIZE = 1000
        train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=True).batch(BATCH_SIZE)
        self.zcl_dataset = train_dataset
        self.zcl_x_test = x_test
        self.zcl_y_test = y_test

        self.zcl_cce = tf.keras.losses.SparseCategoricalCrossentropy()
        self.zcl_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        self.zcl_global_step = tf.Variable(0)

    def __loss(self, model, x, y):
        """Return the sparse categorical cross-entropy of ``model(x)`` vs ``y``."""
        y_ = model(x)
        return self.zcl_cce(y_true=y, y_pred=y_)

    def __grad(self, model, inputs, targets):
        """Return (loss, gradients w.r.t. ``model.trainable_variables``)."""
        with tf.GradientTape() as tape:
            loss_value = self.__loss(model, inputs, targets)
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    def __clip_gradients(self, grads, min_v, max_v):
        """Clip every gradient tensor to [min_v, max_v]; returns numpy arrays."""
        results = [tf.clip_by_value(t, min_v, max_v).numpy() for t in grads]
        return results

    def predict(self, data_instances):
        """
        Predict with the logistic-regression weights.

        With encryption enabled, the aggregated model is fetched, ``wx`` is
        sent to the arbiter and the returned result is joined with the
        labels. Otherwise the local weights are used and classification
        happens locally.

        :return: table whose values are
            ``[label, predicted class, probability, per-class scores]``;
            in the encrypted branch the probability and scores are ``None``.
        """
        LOGGER.info('Start predict task')
        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)
        suffix = ('predict',)
        pubkey = self.cipher.gen_paillier_pubkey(enable=self.use_encrypt, suffix=suffix)

        if self.use_encrypt:
            self.cipher_operator.set_public_key(pubkey)
            final_model = self.transfer_variable.aggregated_model.get(idx=0, suffix=suffix)
            model_weights = LogisticRegressionWeights(final_model.unboxed, self.fit_intercept)
            wx = self.compute_wx(data_instances, model_weights.coef_, model_weights.intercept_)
            self.transfer_variable.predict_wx.remote(wx, consts.ARBITER, 0, suffix=suffix)
            predict_result = self.transfer_variable.predict_result.get(idx=0, suffix=suffix)
            predict_result = predict_result.join(data_instances, lambda p, d: [d.label, p, None,
                                                                               {"0": None, "1": None}])
        else:
            predict_wx = self.compute_wx(data_instances, self.model_weights.coef_, self.model_weights.intercept_)
            pred_table = self.classify(predict_wx, self.model_param.predict_param.threshold)
            predict_result = data_instances.mapValues(lambda x: x.label)
            predict_result = pred_table.join(predict_result, lambda x, y: [y, x[1], x[0],
                                                                           {"1": x[0], "0": 1 - x[0]}])
        return predict_result

    def _get_param(self):
        """
        Build the ``LRModelParam`` protobuf for this model.

        Weights and intercept are exported only when encryption is off;
        otherwise the weight map stays empty and the intercept is 0.
        """
        header = self.header

        weight_dict = {}
        intercept = 0
        if not self.use_encrypt:
            lr_vars = self.model_weights.coef_
            for idx, header_name in enumerate(header):
                weight_dict[header_name] = lr_vars[idx]
            intercept = self.model_weights.intercept_

        param_protobuf_obj = lr_model_param_pb2.LRModelParam(iters=self.n_iter_,
                                                             loss_history=self.loss_history,
                                                             is_converged=self.is_converged,
                                                             weight=weight_dict,
                                                             intercept=intercept,
                                                             header=header)
        from google.protobuf import json_format
        json_result = json_format.MessageToJson(param_protobuf_obj)
        LOGGER.debug("json_result: {}".format(json_result))
        return param_protobuf_obj
class HomoLRGuest(HomoLRBase):
    """
    Guest party of the homogeneous training flow.

    ``fit`` trains a Keras model on a shard of Fashion-MNIST: every batch's
    loss and gradients are Paillier-encrypted, sent to the arbiter, and the
    aggregated values that come back are decrypted and applied locally.
    """

    def __init__(self):
        super(HomoLRGuest, self).__init__()
        self.gradient_operator = LogisticGradient()
        self.loss_history = []
        self.role = consts.GUEST
        self.aggregator = aggregator.Guest()
        # Holds the key pair received in __synchronize_encryption.
        self.zcl_encrypt_operator = PaillierEncrypt()

    def _init_model(self, params):
        """Delegate to the base-class initialisation."""
        super()._init_model(params)

    def fit(self, data_instances, validate_data=None):
        """
        Run the federated training loop.

        :param data_instances: input table; used for schema setup and weight
            initialisation. The training batches themselves come from
            ``self.zcl_dataset``.
        :param validate_data: forwarded to ``init_validation_strategy``.
        :return: None. Returns early, without setting ``self.n_iter_`` for the
            current epoch, once ``self.zcl_early_stop_batch`` batches of an
            epoch have been processed.
        """
        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)

        # The returned strategy is not used in this flow; the call is kept
        # in case it has side effects.
        self.init_validation_strategy(data_instances, validate_data)

        self.model_weights = self._init_model_variables(data_instances)

        # NOTE(review): neither result is used below. The calls are kept
        # because count() and MiniBatch construction may do work on the
        # table - confirm and drop if they are pure.
        total_data_num = data_instances.count()
        mini_batch_obj = MiniBatch(data_inst=data_instances,
                                   batch_size=self.batch_size)

        self.__synchronize_encryption()

        self.zcl_idx, self.zcl_num_party = self.transfer_variable.num_party.get(
            idx=0, suffix=('train', ))
        LOGGER.debug("party num:" + str(self.zcl_num_party))

        self.__init_model()

        self.train_loss_results = []
        self.train_accuracy_results = []
        self.test_loss_results = []
        self.test_accuracy_results = []

        for iter_num in range(self.max_iter):
            batch_num = 0

            for train_x, train_y in self.zcl_dataset:
                LOGGER.info("Staring batch {}".format(batch_num))
                start_t = time.time()
                loss_value, grads = self.__grad(self.zcl_model, train_x,
                                                train_y)
                loss_value = loss_value.numpy()
                grads = [x.numpy() for x in grads]

                LOGGER.info("Start encrypting")
                # Fetch the key once per batch instead of once per tensor.
                public_key = self.zcl_encrypt_operator.get_public_key()
                loss_value = batch_encryption.encrypt(public_key, loss_value)
                grads = [
                    batch_encryption.encrypt_matrix(public_key, x)
                    for x in grads
                ]
                grads = Gradients(grads)
                LOGGER.info("Finish encrypting")

                batch_suffix = (iter_num, batch_num)
                self.transfer_variable.guest_grad.remote(
                    obj=grads.for_remote(),
                    role=consts.ARBITER,
                    idx=0,
                    suffix=batch_suffix)
                LOGGER.info("Sent grads")
                self.transfer_variable.guest_loss.remote(obj=loss_value,
                                                         role=consts.ARBITER,
                                                         idx=0,
                                                         suffix=batch_suffix)
                LOGGER.info("Sent loss")

                sum_grads = self.transfer_variable.aggregated_grad.get(
                    idx=0, suffix=batch_suffix)
                LOGGER.info("Got grads")
                sum_loss = self.transfer_variable.aggregated_loss.get(
                    idx=0, suffix=batch_suffix)
                LOGGER.info("Got loss")

                private_key = self.zcl_encrypt_operator.get_privacy_key()
                sum_loss = batch_encryption.decrypt(private_key, sum_loss)
                sum_grads = [
                    batch_encryption.decrypt_matrix(private_key,
                                                    x).astype(np.float32)
                    for x in sum_grads.unboxed
                ]
                LOGGER.info("Finish decrypting")

                # NOTE(review): the aggregated gradients are applied as
                # received; they are not divided by self.zcl_num_party here.
                self.zcl_optimizer.apply_gradients(
                    zip(sum_grads, self.zcl_model.trainable_variables),
                    self.zcl_global_step)

                elapsed_time = time.time() - start_t

                self.train_loss_results.append(sum_loss)
                train_accuracy_v = accuracy_score(
                    train_y,
                    tf.argmax(self.zcl_model(train_x),
                              axis=1,
                              output_type=tf.int32))
                self.train_accuracy_results.append(train_accuracy_v)

                # The whole test set is evaluated after every batch.
                test_loss_v = self.__loss(self.zcl_model, self.zcl_x_test,
                                          self.zcl_y_test)
                self.test_loss_results.append(test_loss_v)
                test_accuracy_v = accuracy_score(
                    self.zcl_y_test,
                    tf.argmax(self.zcl_model(self.zcl_x_test),
                              axis=1,
                              output_type=tf.int32))
                self.test_accuracy_results.append(test_accuracy_v)

                LOGGER.info(
                    "Epoch {:03d}, iteration {:03d}: train_loss: {:.3f}, train_accuracy: {:.3%}, test_loss: {:.3f}, "
                    "test_accuracy: {:.3%}, elapsed_time: {:.4f}".format(
                        iter_num, batch_num, sum_loss, train_accuracy_v,
                        test_loss_v, test_accuracy_v, elapsed_time))

                batch_num += 1

                if batch_num >= self.zcl_early_stop_batch:
                    return

            # NOTE(review): placement inferred from a whitespace-collapsed
            # source; assumed to run once per completed epoch - confirm.
            self.n_iter_ = iter_num

    def __synchronize_encryption(self, mode='train'):
        """
        Receive the shared Paillier key pair and install it locally.

        Both the public and the private key are fetched from party idx=0 of
        the respective transfer variables (presumably the arbiter - confirm
        against the transfer-variable definition), using ``(mode, )`` as
        suffix.
        """
        pub_key = self.transfer_variable.paillier_pubkey.get(idx=0,
                                                             suffix=(mode, ))
        LOGGER.debug("Received pubkey")
        self.zcl_encrypt_operator.set_public_key(pub_key)

        pri_key = self.transfer_variable.paillier_prikey.get(idx=0,
                                                             suffix=(mode, ))
        LOGGER.debug("Received prikey")
        self.zcl_encrypt_operator.set_privacy_key(pri_key)

    def __init_model(self):
        """
        Load the Keras model and prepare this party's data shard.

        The architecture is read from ``MODEL_JSON_DIR`` and the weights from
        ``MODEL_WEIGHT_DIR``. Fashion-MNIST pixels are scaled by 1/255, the
        training set is cut into ``self.zcl_num_party`` consecutive pieces
        and piece ``self.zcl_idx`` becomes the shuffled, batched training
        dataset. The full test set is kept for evaluation.
        """
        # Context manager so the handle is closed even if read() raises.
        with open(MODEL_JSON_DIR, 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights(MODEL_WEIGHT_DIR)
        self.zcl_model = loaded_model
        LOGGER.info("Initialed model")

        # The data, split between train and test sets:
        (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255.0
        x_test /= 255.0
        y_train = y_train.squeeze().astype(np.int32)
        y_test = y_test.squeeze().astype(np.int32)

        # Cut points at multiples of the average shard length; any remainder
        # ends up in the last shard.
        avg_length = int(len(x_train) / self.zcl_num_party)
        split_idx = [_ * avg_length for _ in range(1, self.zcl_num_party)]
        x_train = np.split(x_train, split_idx)[self.zcl_idx]
        y_train = np.split(y_train, split_idx)[self.zcl_idx]

        train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        BATCH_SIZE = 128
        SHUFFLE_BUFFER_SIZE = 1000
        train_dataset = train_dataset.shuffle(
            SHUFFLE_BUFFER_SIZE,
            reshuffle_each_iteration=True).batch(BATCH_SIZE)
        self.zcl_dataset = train_dataset
        self.zcl_x_test = x_test
        self.zcl_y_test = y_test

        self.zcl_cce = tf.keras.losses.SparseCategoricalCrossentropy()
        self.zcl_optimizer = tf.train.AdamOptimizer(
            learning_rate=LEARNING_RATE)
        self.zcl_global_step = tf.Variable(0)

    def __loss(self, model, x, y):
        """Return the sparse categorical cross-entropy of ``model(x)`` vs ``y``."""
        y_ = model(x)
        return self.zcl_cce(y_true=y, y_pred=y_)

    def __grad(self, model, inputs, targets):
        """Return (loss, gradients w.r.t. ``model.trainable_variables``)."""
        with tf.GradientTape() as tape:
            loss_value = self.__loss(model, inputs, targets)
        return loss_value, tape.gradient(loss_value,
                                         model.trainable_variables)

    def __clip_gradients(self, grads, min_v, max_v):
        """Clip every gradient tensor to [min_v, max_v]; returns numpy arrays."""
        results = [tf.clip_by_value(t, min_v, max_v).numpy() for t in grads]
        return results

    def predict(self, data_instances):
        """
        Predict locally with ``self.model_weights``.

        :return: table whose values are
            ``[label, predicted class, probability, {"1": p, "0": 1 - p}]``.
        """
        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)
        predict_wx = self.compute_wx(data_instances, self.model_weights.coef_,
                                     self.model_weights.intercept_)

        pred_table = self.classify(predict_wx,
                                   self.model_param.predict_param.threshold)

        predict_result = data_instances.mapValues(lambda x: x.label)
        predict_result = pred_table.join(
            predict_result, lambda x, y: [y, x[1], x[0], {
                "1": x[0],
                "0": 1 - x[0]
            }])
        return predict_result
def setUp(self):
    """Generate a Paillier key pair and store both keys on the test case."""
    cipher = PaillierEncrypt()
    cipher.generate_key()
    self.publickey, self.privatekey = (cipher.get_public_key(),
                                       cipher.get_privacy_key())
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    """
    Guest-side feature binning over the ``federation`` transfer API.

    The guest owns the labels: it encrypts them with its own Paillier key,
    sends them to the host, and decrypts the per-bin sums the host returns.
    The protocol steps shared by ``fit`` and ``transform`` live in private
    helpers.
    """

    def __init__(self, params: FeatureBinningParam):
        super(HeteroFeatureBinningGuest, self).__init__(params)
        # Key pair is created eagerly; only the public half is ever sent out.
        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.local_transform_result = None
        self.party_name = consts.GUEST
        self._init_binning_obj()

    def fit(self, data_instances):
        """
        Apply binning method for both data instances in local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns.

        :param data_instances: table of instances carrying a ``label``.
        :return: the label-normalised ``data_instances`` with its schema set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculates self's binning. In case the other party need time to compute its data,
        #  do binning calculation at this point.
        data_instances = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__get_encrypted_bin_sum()
        LOGGER.info("Get encrypted_bin_sum from host")
        host_iv_attrs = self.__host_iv_from_bin_sum(encrypted_bin_sum)

        # Support one host only in this version. Multiple host will be supported in the future.
        self.host_results[consts.HOST] = host_iv_attrs

        for cols_name, iv_attr in host_iv_attrs.items():
            display_result = iv_attr.display_result(
                self.bin_param.display_result)
            LOGGER.info(
                "[Result][FeatureBinning][Host] feature {} 's result is : {}".
                format(cols_name, display_result))

        self.set_schema(data_instances)
        return data_instances

    def transform(self, data_instances):
        """
        Recompute local and host IV values using the already fitted split
        points.

        :param data_instances: table of instances carrying a ``label``.
        :return: the label-normalised ``data_instances`` with its schema set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)
        self.set_schema(data_instances)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally (result is not persisted here)
        self.transform_local(data_instances,
                             label_table=label_table,
                             save_result=False)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__get_encrypted_bin_sum()
        host_iv_attrs = self.__host_iv_from_bin_sum(encrypted_bin_sum)

        for col_name, iv_attr in host_iv_attrs.items():
            LOGGER.info("The remote feature {} 's iv is {}".format(
                col_name, iv_attr.iv))

        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        """Return the pair (Enc(x), Enc(1 - x)) produced by ``encryptor``."""
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self,
                        data_instances,
                        label_table=None,
                        save_result=True):
        """
        Recompute the local IV using the split points stored in
        ``self.binning_result``.

        :param save_result: when true, persist the result via
            ``self.save_model`` with an empty host-result mapping.
        :return: ``data_instances`` after its schema has been set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        split_points = {
            col_name: iv_attr.split_points
            for col_name, iv_attr in self.binning_result.items()
        }

        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances, split_points=split_points, label_table=label_table)

        if save_result:
            self.save_model(name=self.bin_param.transform_table,
                            namespace=self.bin_param.result_namespace,
                            binning_result=self.local_transform_result,
                            host_results={})

        for col_name, iv_attr in self.local_transform_result.items():
            LOGGER.info("The local feature {} 's iv is {}".format(
                col_name, iv_attr.iv))

        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        """Send the public key to host 0 and mark the exchange as done."""
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)
        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __send_encrypted_labels(self, label_table):
        """Encrypt every label as (Enc(y), Enc(1 - y)) and send the table to host 0."""
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)
        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)

    def __get_encrypted_bin_sum(self):
        """Fetch the encrypted per-bin sums sent by host 0."""
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        return federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

    def __host_iv_from_bin_sum(self, encrypted_bin_sum):
        """Decrypt the host's bin sums and turn them into IV/WOE attributes."""
        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        return self.binning_obj.cal_iv_woe(result_counts,
                                           self.bin_param.adjustment_factor)

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        """
        Decrypt every (event, non_event) pair of every column.

        The mapping is updated in place and the same object is returned.
        """
        decrypt = self.encryptor.decrypt
        for col_name, count_list in encrypted_bin_sum.items():
            encrypted_bin_sum[col_name] = [
                (decrypt(encrypted_event), decrypt(encrypted_non_event))
                for encrypted_event, encrypted_non_event in count_list
            ]
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        """
        Compute the local IV attributes, log them and store them in
        ``self.binning_result``.

        :return: ``data_instances`` after its schema has been set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 label_table=label_table)
        for col_name, iv_attr in iv_attrs.items():
            display_result = iv_attr.display_result(
                self.bin_param.display_result)
            LOGGER.info(
                "[Result][FeatureBinning][Guest] feature {} 's result is : {}".
                format(col_name, display_result))

        self.binning_result = iv_attrs
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        """
        Collapse the label to binary: anything other than 1 becomes 0.

        The instance is modified in place and returned.
        """
        # Here suppose this is a binary question and the event label is 1
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    """
    Guest-side feature binning over the ``federation`` transfer API, with
    list-based (per-column index) results.

    The guest owns the labels: it encrypts them with its own Paillier key,
    sends them to the host, and decrypts the per-bin sums the host returns.
    The protocol steps shared by ``fit`` and ``transform`` live in private
    helpers.
    """

    def __init__(self, params: FeatureBinningParam):
        super(HeteroFeatureBinningGuest, self).__init__(params)
        # Key pair is created eagerly; only the public half is ever sent out.
        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        self.iv_attrs = None
        self.host_iv_attrs = None

    def fit(self, data_instances):
        """
        Apply binning method for both data instances in local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns.

        :param data_instances: table of instances carrying a ``label``.
        :return: ``{'local': <local iv attrs>, 'remote': <host iv attrs>}``.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculates self's binning. In case the other party need time to compute its data,
        #  do binning calculation at this point.
        local_iv = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__get_encrypted_bin_sum()
        LOGGER.info("Get encrypted_bin_sum from host")
        host_iv_attrs = self.__host_iv_from_bin_sum(encrypted_bin_sum)
        self.host_iv_attrs = host_iv_attrs

        self.__log_remote_iv(host_iv_attrs)

        iv_result = {'local': local_iv, 'remote': host_iv_attrs}
        return iv_result

    def transform(self, data_instances):
        """
        Recompute local and host IV values using the already fitted split
        points.

        The table header is read before processing and written back to
        ``data_instances.schema['header']`` of the returned table.

        :param data_instances: table of instances carrying a ``label``.
        :return: the label-normalised ``data_instances``.
        """
        self._abnormal_detection(data_instances)

        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally (labels were already reformatted in step 2)
        self.transform_local(data_instances, reformated=True)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__get_encrypted_bin_sum()
        host_iv_attrs = self.__host_iv_from_bin_sum(encrypted_bin_sum)
        self.host_iv_attrs = host_iv_attrs

        self.__log_remote_iv(host_iv_attrs)

        data_instances.schema['header'] = self.header
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        """Return the pair (Enc(x), Enc(1 - x)) produced by ``encryptor``."""
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, reformated=False):
        """
        Recompute ``self.iv_attrs`` using the split points of the current
        ``self.iv_attrs``.

        ``self.iv_attrs`` must already hold the attributes produced by
        ``fit_local``; it is iterated to collect the split points.

        :param reformated: pass True when the labels have already been
            normalised with ``load_data``; otherwise that is done here.
        :return: None.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        if not reformated:  # Reformat the label type
            data_instances = data_instances.mapValues(self.load_data)

        split_points = [list(iv_attr.split_points) for iv_attr in self.iv_attrs]

        self.iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                      self.cols, split_points)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(
                col, self.iv_attrs[idx].iv))

    def __synchronize_encryption(self):
        """Send the public key to host 0 and mark the exchange as done."""
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)
        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __send_encrypted_labels(self, label_table):
        """Encrypt every label as (Enc(y), Enc(1 - y)) and send the table to host 0."""
        f = functools.partial(self.encrypt, encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(f)
        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)

    def __get_encrypted_bin_sum(self):
        """Fetch the encrypted per-bin sums sent by host 0."""
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        return federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

    def __host_iv_from_bin_sum(self, encrypted_bin_sum):
        """Decrypt the host's bin sums and turn them into IV/WOE attributes."""
        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        return self.binning_obj.cal_iv_woe(result_counts,
                                           self.bin_param.adjustment_factor)

    @staticmethod
    def __log_remote_iv(host_iv_attrs):
        """Log the IV of every host feature, identified by its position."""
        for idx, iv_attr in enumerate(host_iv_attrs):
            LOGGER.info("The remote iv of {}th measured feature is {}".format(
                idx, iv_attr.iv))

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        """
        Decrypt every (event, non_event) pair of every feature.

        Each per-feature sequence is overwritten element by element, and the
        outer object is returned.
        """
        for feature_sum in encrypted_bin_sum:
            for idx, (encrypted_event,
                      encrypted_non_event) in enumerate(feature_sum):
                event_count = self.encryptor.decrypt(encrypted_event)
                non_event_count = self.encryptor.decrypt(encrypted_non_event)
                feature_sum[idx] = (event_count, non_event_count)
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        """
        Compute the local IV attributes for ``self.cols``, log them and
        store them in ``self.iv_attrs``.

        :return: the computed IV attributes.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 self.cols,
                                                 label_table=label_table)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(
                col, iv_attrs[idx].iv))
        self.iv_attrs = iv_attrs
        return iv_attrs

    @staticmethod
    def load_data(data_instance):
        """
        Collapse the label to binary: anything other than 1 becomes 0.

        The instance is modified in place and returned.
        """
        # Here suppose this is a binary question and the event label is 1
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance