def sync_predict_finish_tag(self, finish_tag, send_times):
    """
    Send the predict-finish flag for one sending round to the host.

    :param finish_tag: object transferred as the payload of the remote call.
    :param send_times: round counter; passed to ``generate_transferid`` as the
        suffix of the transfer tag.
    """
    # send_times fills the ordinal ("{}-th") slot and finish_tag the flag slot;
    # passing them the other way round produced a misleading log line.
    LOGGER.info("send the {}-th predict finish tag {} to host".format(send_times, finish_tag))
    federation.remote(obj=finish_tag,
                      name=self.transfer_inst.predict_finish_tag.name,
                      tag=self.transfer_inst.generate_transferid(self.transfer_inst.predict_finish_tag, send_times),
                      role=consts.HOST,
                      idx=0)
def sync_dispatch_node_host(self, dispatch_guest_data, dep=-1):
    """Send ``dispatch_guest_data`` to the host, tagging the transfer with depth ``dep``."""
    LOGGER.info("send node to host to dispath, depth is {}".format(dep))
    variable = self.transfer_inst.dispatch_node_host
    federation.remote(
        obj=dispatch_guest_data,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, dep),
        role=consts.HOST,
        idx=0,
    )
def sync_predict_data(self, predict_data, send_times):
    """Send ``predict_data`` to the host, tagging the transfer with ``send_times``."""
    LOGGER.info("send predict data to host, sending times is {}".format(send_times))
    variable = self.transfer_inst.predict_data
    federation.remote(
        obj=predict_data,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, send_times),
        role=consts.HOST,
        idx=0,
    )
def sync_node_positions(self, dep):
    """Send ``self.node_dispatch`` to the host, tagging the transfer with depth ``dep``."""
    LOGGER.info("send node positions of depth {}".format(dep))
    variable = self.transfer_inst.node_positions
    federation.remote(
        obj=self.node_dispatch,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, dep),
        role=consts.HOST,
        idx=0,
    )
def __synchronize_encryption(self, mode='train'):
    """
    Report this party's encryption switch to the arbiter and, when encryption
    is on, fetch the public key and install it on ``self.encrypt_operator``.

    ``mode`` is used as the suffix of both transfer ids.
    """
    variables = self.transfer_variable

    # Step 1: announce whether encryption is used.
    use_encrypt_tag = variables.generate_transferid(variables.use_encrypt, mode)
    LOGGER.debug("Start to remote use_encrypt: {}, transfer_id: {}".format(
        self.use_encrypt, use_encrypt_tag))
    federation.remote(self.use_encrypt,
                      name=variables.use_encrypt.name,
                      tag=use_encrypt_tag,
                      role=consts.ARBITER,
                      idx=0)

    # Step 2: a key is only fetched when encryption is enabled.
    if self.use_encrypt:
        pubkey_tag = variables.generate_transferid(variables.paillier_pubkey, mode)
        received_key = federation.get(name=variables.paillier_pubkey.name,
                                      tag=pubkey_tag,
                                      idx=0)
        LOGGER.debug("Received pubkey")
        self.encrypt_operator.set_public_key(received_key)

    LOGGER.info("Finish synchronized ecryption")
    self.has_sychronized_encryption = True
def __synchronize_encryption(self):
    """
    Collect every host's encryption switch and hand a public key to each host
    that asked for encryption.

    One encrypter per host is appended to ``self.host_encrypter``: a
    ``PaillierEncrypt`` with a freshly generated key for encrypting hosts and
    a ``FakeEncrypt`` for the rest.
    """
    variables = self.transfer_variable

    # 1. Fetch the per-host "use encryption" flags.
    flags_tag = variables.generate_transferid(variables.use_encrypt)
    self.host_use_encryption = federation.get(name=variables.use_encrypt.name,
                                              tag=flags_tag,
                                              idx=-1)
    LOGGER.info("host use encryption: {}".format(self.host_use_encryption))

    # 2. Build an encrypter per host; only encrypting hosts are sent a key.
    for host_idx, wants_encryption in enumerate(self.host_use_encryption):
        if wants_encryption:
            encrypter = PaillierEncrypt()
            encrypter.generate_key(self.encrypt_param.key_length)
            pub_key = encrypter.get_public_key()
            pubkey_tag = variables.generate_transferid(variables.paillier_pubkey)
            federation.remote(pub_key,
                              name=variables.paillier_pubkey.name,
                              tag=pubkey_tag,
                              role=consts.HOST,
                              idx=host_idx)
        else:
            encrypter = FakeEncrypt()
        self.host_encrypter.append(encrypter)

    self.has_sychronized_encryption = True
def sync_host_sum_to_guest(self, host_sum):
    """Send ``host_sum`` to the guest under the ``host_sum`` transfer variable."""
    variable = self.transfer_inst.host_sum
    federation.remote(
        obj=host_sum,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable),
        role="guest",
        idx=0,
    )
def _synchronous_data(self, data_instance, flowid, data_application=None):
    """
    Align the sample ids of ``data_instance`` between guest and host.

    The guest sends its ids (every value mapped to 1) and returns None. The
    host fetches those ids and returns its own instances joined on them.
    Returns None, after a warning, when ``data_application`` is missing or is
    neither the train nor the test constant.
    """
    if data_application is None:
        LOGGER.warning("not data_application!")
        return

    transfer_variable = HeteroWorkFlowTransferVariable()
    if data_application == consts.TRAIN_DATA:
        variable = transfer_variable.train_data
    elif data_application == consts.TEST_DATA:
        variable = transfer_variable.test_data
    else:
        LOGGER.warning("data_application error!")
        return

    if self.role == consts.GUEST:
        guest_sid = data_instance.mapValues(lambda v: 1)
        federation.remote(guest_sid,
                          name=variable.name,
                          tag=transfer_variable.generate_transferid(variable, flowid),
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("remote {} to host".format(data_application))
        return None

    if self.role == consts.HOST:
        received_sid = federation.get(name=variable.name,
                                      tag=transfer_variable.generate_transferid(variable, flowid),
                                      idx=0)
        LOGGER.info("get {} from guest".format(data_application))
        # Keep only local instances whose id was received.
        return received_sid.join(data_instance, lambda s, d: d)

    return None
def sync_share_to_guest(self):
    """Send ``self.y1`` to the guest under the ``host_share`` transfer variable."""
    variable = self.transfer_inst.host_share
    federation.remote(
        obj=self.y1,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable),
        role="guest",
        idx=0,
    )
def predict(self, data_instances, predict_param):
    """
    Predict on ``data_instances`` and return a table joined on instance id.

    With encryption on, wx is sent to the arbiter, the ``predict_result``
    transfer variable is fetched, and rows are ``(label, None, result)``.
    Without encryption, sigmoid and classification run locally and rows are
    ``(label, probability, predicted)`` when ``predict_param.with_proba`` is
    set, otherwise ``(label, None, predicted)``.
    """
    if self.has_sychronized_encryption:
        LOGGER.info("in predict, has synchronize encryption information")
    else:
        self.__synchronize_encryption()
        self.__load_arbiter_model()

    wx = self.compute_wx(data_instances, self.coef_, self.intercept_)

    if self.use_encrypt:
        variables = self.transfer_variable
        wx_tag = variables.generate_transferid(variables.predict_wx)
        federation.remote(wx,
                          name=variables.predict_wx.name,
                          tag=wx_tag,
                          role=consts.ARBITER,
                          idx=0)
        result_tag = variables.generate_transferid(variables.predict_result)
        fetched_result = federation.get(name=variables.predict_result.name,
                                        tag=result_tag,
                                        idx=0)
        return fetched_result.join(data_instances, lambda p, d: (d.label, None, p))

    # Plain path: score and classify locally.
    pred_prob = wx.mapValues(lambda x: activation.sigmoid(x))
    pred_label = self.classified(pred_prob, predict_param.threshold)
    if predict_param.with_proba:
        labels_only = data_instances.mapValues(lambda x: x.label)
        label_and_prob = labels_only.join(pred_prob, lambda x, y: (x, y))
    else:
        label_and_prob = data_instances.mapValues(lambda x: (x.label, None))
    return label_and_prob.join(pred_label, lambda x, y: (x[0], x[1], y))
def transform(self, data_instances):
    """
    Bin ``data_instances`` with the stored split points, fetch the encrypted
    label table, compute the encrypted per-bin label sums and send them to the
    guest. Returns nothing.
    """
    self._abnormal_detection(data_instances)
    self._parse_cols(data_instances)

    # 1. Synchronize encryption information
    self.__synchronize_encryption()

    split_points = [list(iv_attr.split_points) for iv_attr in self.iv_attrs]
    data_bin_table = self.binning_obj.transform(data_instances, split_points, self.cols)

    variables = self.transfer_variable
    label_tag = variables.generate_transferid(variables.encrypted_label)
    encrypted_label_table = federation.get(name=variables.encrypted_label.name,
                                           tag=label_tag,
                                           idx=0)
    LOGGER.info("Get encrypted_label_table from guest")

    encrypted_bin_sum = self.__static_encrypted_bin_label(data_bin_table,
                                                          encrypted_label_table,
                                                          self.cols)
    bin_sum_tag = variables.generate_transferid(variables.encrypted_bin_sum)
    federation.remote(encrypted_bin_sum,
                      name=variables.encrypted_bin_sum.name,
                      tag=bin_sum_tag,
                      role=consts.GUEST,
                      idx=0)
    LOGGER.info("Sent encrypted_bin_sum to guest")
def sync_final_splitinfo_host(self, splitinfo_host, federated_best_splitinfo_host, dep=-1, batch=-1):
    """
    Build one final split info per node and send the list to the guest.

    For node ``i`` the pair ``federated_best_splitinfo_host[i]`` is
    ``(best_idx, best_gain)``. When ``best_idx`` is -1 a placeholder
    ``SplitInfo`` with fid/bid -1 is used. Otherwise the chosen candidate has
    its feature id and bin id encoded and its gain overwritten with
    ``best_gain``. The chosen candidate is modified in place.
    """
    LOGGER.info("send host final splitinfo of depth {}, batch {}".format(dep, batch))

    final_splitinfos = []
    for node_pos, candidates in enumerate(splitinfo_host):
        best_idx, best_gain = federated_best_splitinfo_host[node_pos]
        if best_idx == -1:
            chosen = SplitInfo(sitename=consts.HOST, best_fid=-1, best_bid=-1, gain=best_gain)
        else:
            assert candidates[best_idx].sitename == consts.HOST
            chosen = candidates[best_idx]
            chosen.best_fid = self.encode("feature_idx", chosen.best_fid)
            assert chosen.best_fid is not None
            chosen.best_bid = self.encode("feature_val", chosen.best_bid,
                                          self.cur_split_nodes[node_pos].id)
            chosen.gain = best_gain
        final_splitinfos.append(chosen)

    variable = self.transfer_inst.final_splitinfo_host
    federation.remote(obj=final_splitinfos,
                      name=variable.name,
                      tag=self.transfer_inst.generate_transferid(variable, dep, batch),
                      role=consts.GUEST,
                      idx=0)
def __init_parameters(self, data_instances):
    """
    Send the party weight to the arbiter, synchronize encryption, build the
    mini-batch generator and, when encryption is on, send the number of
    re-encryption rounds (batch count floor-divided by ``re_encrypt_batches``).
    """
    variables = self.transfer_variable

    weight_tag = variables.generate_transferid(variables.host_party_weight)
    federation.remote(self.party_weight,
                      name=variables.host_party_weight.name,
                      tag=weight_tag,
                      role=consts.ARBITER,
                      idx=0)

    self.__synchronize_encryption()

    self.mini_batch_obj = MiniBatch(data_inst=data_instances, batch_size=self.batch_size)

    # Send re-encrypt times
    if self.use_encrypt:
        re_encrypt_times = self.mini_batch_obj.batch_nums // self.re_encrypt_batches
        times_tag = variables.generate_transferid(variables.re_encrypt_times)
        federation.remote(re_encrypt_times,
                          name=variables.re_encrypt_times.name,
                          tag=times_tag,
                          role=consts.ARBITER,
                          idx=0)
        LOGGER.info("sent re_encrypt_times: {}".format(re_encrypt_times))
def sync_dispatch_node_host_result(self, dispatch_node_host_result, dep=-1):
    """Send the host dispatch result to the guest role (``idx=-1``), tagged with depth ``dep``."""
    LOGGER.info("send host dispatch result, depth is {}".format(dep))
    variable = self.transfer_inst.dispatch_node_host_result
    federation.remote(
        obj=dispatch_node_host_result,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, dep),
        role=consts.GUEST,
        idx=-1,
    )
def __synchronize_classes_list(self):
    """
    Guest will get classes from host data, and aggregate classes it has.
    After that, send the aggregate classes to host and arbiter as binary
    classification times.

    In homo mode the guest adds every class fetched under the
    ``host_classes`` transfer variable to ``self.classes``, and the host
    sends its own ``self.classes`` to the guest. The aggregate is then
    synchronized via ``__synchronize_aggregate_classed_list``.
    """
    # The constant was masked in the source text as ``consts.H**O``. Python
    # parses that as ``consts.H ** O``, which fails at runtime.
    if self.mode == consts.HOMO:
        if self.role == consts.GUEST:
            host_classes_list = federation.get(
                name=self.transfer_variable.host_classes.name,
                tag=self.transfer_variable.generate_transferid(
                    self.transfer_variable.host_classes),
                idx=0)
            for host_class in host_classes_list:
                self.classes.add(host_class)

        elif self.role == consts.HOST:
            federation.remote(
                self.classes,
                name=self.transfer_variable.host_classes.name,
                tag=self.transfer_variable.generate_transferid(
                    self.transfer_variable.host_classes),
                role=consts.GUEST,
                idx=0)

    # NOTE(review): this call is placed at function level (outside the mode
    # check) when re-indenting the flattened source - confirm against upstream.
    self.__synchronize_aggregate_classed_list()
def sync_sample_ids(self, sample_ids):
    """Send ``sample_ids`` to the host role, tagging the transfer with ``self.flowid``."""
    transfer_inst = SampleTransferVariable()
    variable = transfer_inst.sample_ids
    federation.remote(
        obj=sample_ids,
        name=variable.name,
        tag=transfer_inst.generate_transferid(variable, self.flowid),
        role="host",
    )
def __re_encrypt(self, iter_num):
    """
    Serve re-encryption requests for iteration ``iter_num``.

    A working copy of ``self.re_encrypt_times`` holds the remaining rounds per
    host. Each pass advances the batch counter by ``self.re_encrypt_batches``.
    For every host with rounds left, the pass fetches its model, decrypts and
    re-encrypts it with that host's encrypter, and sends it back. The loop
    ends once the remaining rounds sum to zero.
    """
    # If use encrypt, model weight need to be re-encrypt every several batches.
    self.curt_re_encrypt_times = self.re_encrypt_times.copy()
    variables = self.transfer_variable

    batch_num = 0
    while True:
        batch_num += self.re_encrypt_batches
        inbound_tag = variables.generate_transferid(
            variables.to_encrypt_model, iter_num, batch_num)
        outbound_tag = variables.generate_transferid(
            variables.re_encrypted_model, iter_num, batch_num)

        for host_idx, remaining in enumerate(self.curt_re_encrypt_times):
            if remaining <= 0:
                continue

            host_model = federation.get(name=variables.to_encrypt_model.name,
                                        tag=inbound_tag,
                                        idx=host_idx)
            encrypter = self.host_encrypter[host_idx]
            plain_model = encrypter.decrypt_list(host_model)
            fresh_model = encrypter.encrypt_list(plain_model)
            federation.remote(fresh_model,
                              name=variables.re_encrypted_model.name,
                              tag=outbound_tag,
                              role=consts.HOST,
                              idx=host_idx)

            self.curt_re_encrypt_times[host_idx] = remaining - 1

        if sum(self.curt_re_encrypt_times) == 0:
            break
def sync_stop_flag(self, stop_flag, num_round):
    """Send ``stop_flag`` to the host role (``idx=-1``), tagged with boosting round ``num_round``."""
    LOGGER.info("sync stop flag to host, boosting round is {}".format(num_round))
    variable = self.transfer_inst.stop_flag
    federation.remote(
        obj=stop_flag,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, num_round),
        role=consts.HOST,
        idx=-1,
    )
def sync_tree_dim(self):
    """Send ``self.tree_dim`` to the host role (``idx=-1``)."""
    LOGGER.info("sync tree dim to host")
    variable = self.transfer_inst.tree_dim
    federation.remote(
        obj=self.tree_dim,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable),
        role=consts.HOST,
        idx=-1,
    )
def intersect_join_id(self, data_instances):
    """
    Intersect the local sample ids with the ids fetched from the other side.

    When encoding is enabled the local ids are encoded first and the result is
    mapped back to the original ids before returning. If
    ``self.send_intersect_id_flag`` is set, the intersection is also sent to
    the opposite role.

    Returns a table whose values are the string 'intersect_id'.
    Raises ValueError for an unsupported encode method or an unknown role.
    """
    LOGGER.info("Join id role is {}".format(self.role))

    sid_encode_pair = None
    encoding_enabled = self.with_encode and self.encode_params.encode_method != "none"
    if encoding_enabled:
        if not Encode.is_support(self.encode_params.encode_method):
            raise ValueError("Unknown encode_method, please check the configure of encode_param")
        encode_operator = Encode(self.encode_params.encode_method, self.encode_params.base64)
        # Table of (encoded id -> original id).
        sid_encode_pair = data_instances.map(
            lambda k, v: (encode_operator.compute(k, postfit_salt=self.encode_params.salt), k))
        data_sid = sid_encode_pair.mapValues(lambda v: 1)
    else:
        data_sid = data_instances.mapValues(lambda v: 1)

    variables = self.transfer_variable

    # Pick the transfer variable used to fetch ids for this role.
    if self.role == consts.HOST:
        inbound = variables.send_ids_guest
    elif self.role == consts.GUEST:
        inbound = variables.send_ids_host
    else:
        raise ValueError("Unknown intersect role, please check the code")

    recv_ids = get(name=inbound.name,
                   tag=variables.generate_transferid(inbound),
                   idx=0)
    LOGGER.info("Get intersect_host_ids from role-send")

    send_intersect_ids = recv_ids.join(data_sid, lambda i, d: "intersect_id")
    LOGGER.info("Finish intersect_ids computing")

    if self.send_intersect_id_flag:
        if self.role == consts.GUEST:
            outbound = variables.intersect_ids_guest
            recv_role = consts.HOST
        elif self.role == consts.HOST:
            outbound = variables.intersect_ids_host
            recv_role = consts.GUEST
        else:
            raise ValueError("Unknown intersect role, please check the code")

        remote(send_intersect_ids,
               name=outbound.name,
               tag=variables.generate_transferid(outbound),
               role=recv_role,
               idx=0)
        LOGGER.info("Remote intersect ids to role-send")

    if sid_encode_pair:
        # Map encoded ids back to the original ids.
        encode_intersect_ids = send_intersect_ids.join(sid_encode_pair, lambda r, s: s)
        return encode_intersect_ids.map(lambda k, v: (v, 'intersect_id'))
    return send_intersect_ids
def predict(self, data=None, predict_param=None):
    """
    Serve prediction for every host flagged in ``self.host_use_encryption``.

    For each such host: fetch its wx, decrypt it with that host's encrypter,
    apply sigmoid, classify with ``predict_param.threshold`` and send the
    labels back. Returns None.
    """
    # synchronize encryption information
    if not self.has_sychronized_encryption:
        self.__synchronize_encryption()
        self.__send_host_mode()

    for host_idx, use_encrypt in enumerate(self.host_use_encryption):
        if not use_encrypt:
            continue

        encrypter = self.host_encrypter[host_idx]
        variables = self.transfer_variable

        wx_tag = variables.generate_transferid(variables.predict_wx)
        encrypted_wx = federation.get(name=variables.predict_wx.name,
                                      tag=wx_tag,
                                      idx=host_idx)
        decrypted_wx = encrypter.distribute_decrypt(encrypted_wx)
        pred_prob = decrypted_wx.mapValues(lambda x: activation.sigmoid(x))
        pred_label = self.classified(pred_prob, predict_param.threshold)

        result_tag = variables.generate_transferid(variables.predict_result)
        federation.remote(pred_label,
                          name=variables.predict_result.name,
                          tag=result_tag,
                          role=consts.HOST,
                          idx=host_idx)

    LOGGER.info("Finish predicting, result has been sent back")
    return
def __synchronize_aggregate_classed_list(self):
    """
    Share the aggregated class set from the guest with the other roles.

    The guest sends ``self.classes`` to the host and, if ``self.has_arbiter``
    is set, to the arbiter. Host and arbiter overwrite ``self.classes`` with
    the fetched value. Any other role raises ValueError.
    """
    variables = self.transfer_variable
    role = self.role

    if role == consts.GUEST:
        federation.remote(self.classes,
                          name=variables.aggregate_classes.name,
                          tag=variables.generate_transferid(variables.aggregate_classes),
                          role=consts.HOST,
                          idx=0)
        if self.has_arbiter:
            federation.remote(self.classes,
                              name=variables.aggregate_classes.name,
                              tag=variables.generate_transferid(variables.aggregate_classes),
                              role=consts.ARBITER,
                              idx=0)
        return

    if role == consts.HOST or role == consts.ARBITER:
        self.classes = federation.get(name=variables.aggregate_classes.name,
                                      tag=variables.generate_transferid(variables.aggregate_classes),
                                      idx=0)
        return

    raise ValueError("Unknown role:{}".format(self.role))
def _distributed_negative_sampling_src(self, positive_instances, src=consts.HOST, dst=consts.GUEST):
    """
    Expand each positive instance into ``self.nega_samp_num`` negative entries
    keyed ``<key>_negative_<i>``, send the generated keys to ``dst`` and return
    the generated table.

    Raises NameError unless (src, dst) is (host, guest) or (guest, host).
    """
    if src == consts.HOST:
        if dst != consts.GUEST:
            raise NameError("if src is host, then dst should be guest!!!")
        nega_ids_transfer = self.transfer_variable.host_neg_samp_ids
    elif src == consts.GUEST:
        if dst != consts.HOST:
            raise NameError("if src is guest, then dst should be host!!!")
        nega_ids_transfer = self.transfer_variable.guest_neg_samp_ids
    else:
        raise NameError("src should be choose from {host, guest}")

    def gen_neg_ids(k, v, neg_sum):
        # One (negative key, original value) pair per requested negative sample.
        return (k, [(k + '_negative_' + str(i), v) for i in range(neg_sum)])

    expanded = positive_instances.map(lambda k, v: gen_neg_ids(k, v, self.nega_samp_num))
    distributed_negative_instances_src = expanded.flatMap(lambda k, v: v)

    total = distributed_negative_instances_src.count()
    distributed_negative_ids = distributed_negative_instances_src.take(total, keysOnly=True)

    federation.remote(distributed_negative_ids,
                      name=nega_ids_transfer.name,
                      tag=self.transfer_variable.generate_transferid(nega_ids_transfer),
                      role=dst,
                      idx=0)
    LOGGER.info("Remote the distributed negative instances id to {} from {}".format(dst, src))
    LOGGER.info("Distributed negative instances count: {}".format(len(distributed_negative_ids)))
    logDtableInstances(LOGGER, distributed_negative_instances_src, topk=10, isInstance=False)

    return distributed_negative_instances_src
def fit(self, data_instances):
    """
    Run binning locally and together with the host, then report IV results.

    The encrypted labels are sent to the host, local binning is computed,
    and the host's encrypted per-bin sums are fetched and decrypted to derive
    the host-side IV attributes.

    Returns ``{'local': <local iv>, 'remote': <host iv attrs>}``.
    """
    self._abnormal_detection(data_instances)
    self._parse_cols(data_instances)

    # 1. Synchronize encryption information
    self.__synchronize_encryption()

    # 2. Prepare labels
    data_instances = data_instances.mapValues(self.load_data)
    label_table = data_instances.mapValues(lambda x: x.label)

    # 3. Transfer encrypted label
    variables = self.transfer_variable
    encrypt_label = functools.partial(self.encrypt, encryptor=self.encryptor)
    encrypted_label_table = label_table.mapValues(encrypt_label)
    label_tag = variables.generate_transferid(variables.encrypted_label)
    federation.remote(encrypted_label_table,
                      name=variables.encrypted_label.name,
                      tag=label_tag,
                      role=consts.HOST,
                      idx=0)
    LOGGER.info("Sent encrypted_label_table to host")

    # 4. Compute the local binning now, while the other party may still be
    #    working on its own data.
    local_iv = self.fit_local(data_instances, label_table)

    # 5. Receive the host result and calculate its iv values
    bin_sum_tag = variables.generate_transferid(variables.encrypted_bin_sum)
    encrypted_bin_sum = federation.get(name=variables.encrypted_bin_sum.name,
                                       tag=bin_sum_tag,
                                       idx=0)
    LOGGER.info("Get encrypted_bin_sum from host")

    result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
    host_iv_attrs = self.binning_obj.cal_iv_woe(result_counts,
                                                self.bin_param.adjustment_factor)
    self.host_iv_attrs = host_iv_attrs

    for position, iv_attr in enumerate(host_iv_attrs):
        LOGGER.info("The remote iv of {}th measured feature is {}".format(position, iv_attr.iv))

    return {'local': local_iv, 'remote': host_iv_attrs}
def fit(self, data_instances):
    """
    Fit split points, exchange encrypted label statistics with the host and
    transform the data.

    IV is only computed for binary-labelled data. For any other data a
    warning is logged and ``data_instances`` is returned unchanged after the
    split points are fitted. Otherwise the host's IV attributes are stored in
    ``self.host_results[consts.HOST]`` and ``self.data_output`` is returned.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)
    self._parse_cols(data_instances)
    self.binning_obj.fit_split_points(data_instances)

    if not data_overview.is_binary_labels(data_instances):
        LOGGER.warning("Iv is not supported for Multiple-label data.")
        return data_instances

    # 1. Synchronize encryption information
    self.__synchronize_encryption()

    # 2. Prepare labels
    data_instances = data_instances.mapValues(self.load_data)
    self.set_schema(data_instances)
    label_table = data_instances.mapValues(lambda x: x.label)

    # 3. Transfer encrypted label
    variables = self.transfer_variable
    encrypt_label = functools.partial(self.encrypt, encryptor=self.encryptor)
    encrypted_label_table = label_table.mapValues(encrypt_label)
    label_tag = variables.generate_transferid(variables.encrypted_label)
    federation.remote(encrypted_label_table,
                      name=variables.encrypted_label.name,
                      tag=label_tag,
                      role=consts.HOST,
                      idx=0)
    LOGGER.info("Sent encrypted_label_table to host")

    # 4. Compute the local binning now, while the other party may still be
    #    working on its own data.
    data_instances = self.fit_local(data_instances, label_table)

    # 5. Receive the host result and calculate its iv values
    bin_sum_tag = variables.generate_transferid(variables.encrypted_bin_sum)
    encrypted_bin_sum = federation.get(name=variables.encrypted_bin_sum.name,
                                       tag=bin_sum_tag,
                                       idx=0)
    LOGGER.info("Get encrypted_bin_sum from host")

    result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
    host_iv_attrs = self.binning_obj.cal_iv_woe(result_counts,
                                                self.model_param.adjustment_factor)

    # Support one host only in this version. Multiple host will be supported in the future.
    self.host_results[consts.HOST] = host_iv_attrs

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    return self.data_output
def sync_data_predicted_by_host(self, predict_data, send_times):
    """Send ``predict_data`` to the guest, tagging the transfer with ``send_times``."""
    LOGGER.info("send predicted data by host, send times is {}".format(send_times))
    variable = self.transfer_inst.predict_data_by_host
    federation.remote(
        obj=predict_data,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, send_times),
        role=consts.GUEST,
        idx=0,
    )
def sync_encrypted_splitinfo_host(self, encrypted_splitinfo_host, dep=-1, batch=-1):
    """Send the encrypted split info to the guest role (``idx=-1``), tagged with ``dep`` and ``batch``."""
    LOGGER.info("send encrypted splitinfo of depth {}, batch {}".format(dep, batch))
    variable = self.transfer_inst.encrypted_splitinfo_host
    federation.remote(
        obj=encrypted_splitinfo_host,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable, dep, batch),
        role=consts.GUEST,
        idx=-1,
    )
def sync_encrypted_grad_and_hess(self):
    """Send the result of ``self.encrypt_grad_and_hess()`` to the host."""
    LOGGER.info("send encrypted grad and hess to host")
    payload = self.encrypt_grad_and_hess()
    variable = self.transfer_inst.encrypted_grad_and_hess
    federation.remote(
        obj=payload,
        name=variable.name,
        tag=self.transfer_inst.generate_transferid(variable),
        role=consts.HOST,
        idx=0,
    )
def _send_host_result_cols(self, filter_name):
    """Send ``self.host_left_cols`` to the host, tagging the transfer with ``filter_name``."""
    variable = self.transfer_variable.result_left_cols
    transfer_tag = self.transfer_variable.generate_transferid(variable, filter_name)
    federation.remote(
        self.host_left_cols,
        name=variable.name,
        tag=transfer_tag,
        role=consts.HOST,
        idx=0,
    )
    LOGGER.info("Sent result cols from guest to host, result cols are: {}".format(self.host_left_cols))
def _send_select_cols(self, filter_name):
    """Send ``self.left_col_names`` to the guest, tagging the transfer with ``filter_name``."""
    variable = self.transfer_variable.host_select_cols
    transfer_tag = self.transfer_variable.generate_transferid(variable, filter_name)
    federation.remote(
        self.left_col_names,
        name=variable.name,
        tag=transfer_tag,
        role=consts.GUEST,
        idx=0,
    )
    LOGGER.info("Sent select cols to guest")