def setUp(self):
    from fate_arch.session import computing_session as session
    session.init("test_encrypt_mode_calculator")
    self.list_data = []
    self.tuple_data = []
    self.numpy_data = []
    for i in range(30):
        list_value = [100 * i + j for j in range(20)]
        tuple_value = tuple(list_value)
        numpy_value = np.array(list_value, dtype="int")
        self.list_data.append(list_value)
        self.tuple_data.append(tuple_value)
        self.numpy_data.append(numpy_value)
    self.data_list = session.parallelize(self.list_data, include_key=False, partition=10)
    self.data_tuple = session.parallelize(self.tuple_data, include_key=False, partition=10)
    self.data_numpy = session.parallelize(self.numpy_data, include_key=False, partition=10)
def setUp(self):
    self.paillier_encrypt = PaillierEncrypt()
    self.paillier_encrypt.generate_key()
    self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

    size = 10
    self.en_wx = session.parallelize(
        [self.paillier_encrypt.encrypt(i) for i in range(size)],
        partition=48, include_key=False)
    self.en_sum_wx_square = session.parallelize(
        [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)],
        partition=48, include_key=False)
    self.wx = np.array([i for i in range(size)])
    self.w = self.wx / np.array([1 for _ in range(size)])
    self.data_inst = session.parallelize(
        [Instance(features=np.array([1 for _ in range(size)]), label=pow(-1, i % 2))
         for i in range(size)],
        partition=48, include_key=False)

    # expected fore_gradient values
    self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
    # expected gradient values, without and with a fitted intercept
    self.gradient = [1.125] * 10
    self.gradient_fit_intercept = [1.125] * 11
    self.loss = 4.505647
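# Hedged aside, not part of the original tests: the fixtures above rely on the
# additive homomorphism of Paillier, i.e. ciphertexts can be summed before a
# single decryption. A minimal sketch, assuming FATE's PaillierEncrypt API:
def _paillier_additivity_sketch():
    cipher = PaillierEncrypt()
    cipher.generate_key()
    en_a, en_b = cipher.encrypt(3), cipher.encrypt(4)
    # ciphertext addition matches plaintext addition
    assert cipher.decrypt(en_a + en_b) == 7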
def setUp(self): session.init("test_cross_entropy") self.softmax_loss = SoftmaxCrossEntropyLoss() self.y_list = [i % 5 for i in range(100)] self.predict_list = [np.array([random.random() for i in range(5)]) for j in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def setUp(self): session.init("test_cross_entropy") self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def _gen_table_data(self):
    # return cached tables if they have already been generated
    if self._dense_table is not None:
        return self._dense_table, self._dense_not_inst_table, self._original_data
    headers = ['x' + str(i) for i in range(self.feature_num)]
    dense_inst = []
    dense_not_inst = []
    original_data = 100 * np.random.random((self.count, self.feature_num))
    for i in range(self.count):
        features = original_data[i, :]
        inst = Instance(features=features)
        dense_inst.append((i, inst))
        dense_not_inst.append((i, features))
    dense_table = session.parallelize(dense_inst, include_key=True, partition=16)
    dense_not_inst_table = session.parallelize(dense_not_inst, include_key=True, partition=16)
    dense_table.schema = {'header': headers}
    dense_not_inst_table.schema = {'header': headers}
    self._dense_table, self._dense_not_inst_table, self._original_data = \
        dense_table, dense_not_inst_table, original_data
    return dense_table, dense_not_inst_table, original_data
def gen_data(self):
    dense_inst = []
    headers = ['x' + str(i) for i in range(20)]
    for i in range(100):
        inst = Instance(features=(i % 16 * np.ones(20)))
        dense_inst.append((i, inst))
    self.dense_table = session.parallelize(dense_inst, include_key=True, partition=2)
    self.dense_table.schema = {'header': headers}

    self.sparse_inst = []
    for i in range(100):
        seen = {}  # renamed from `dict`, which shadowed the builtin
        indices = []
        data = []
        for j in range(20):
            idx = random.randint(0, 29)
            if idx in seen:
                continue
            seen[idx] = 1
            val = random.random()
            indices.append(idx)
            data.append(val)
        sparse_vec = SparseVector(indices, data, 30)
        self.sparse_inst.append((i, Instance(features=sparse_vec)))
    self.sparse_table = session.parallelize(self.sparse_inst, include_key=True, partition=48)
    self.sparse_table.schema = {"header": ["fid" + str(i) for i in range(30)]}
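# Hedged illustration, not part of the original file: the SparseVector built
# above stores parallel index/value lists inside a fixed shape, and unset
# positions read back as the supplied default. A minimal sketch of that contract:
def _sparse_vector_sketch():
    vec = SparseVector([0, 3], [1.5, 2.5], 10)
    assert vec.get_data(3) == 2.5
    assert vec.get_data(1, 0) == 0  # absent index falls back to the default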
def setUp(self):
    self.feature_histogram = FeatureHistogram()
    session.init("test_feature_histogram")
    data_insts = []
    for i in range(1000):
        indices = []
        data = []
        for j in range(10):
            x = random.randint(0, 5)
            if x != 0:
                data.append(x)
                indices.append(j)
        sparse_vec = SparseVector(indices, data, shape=10)
        data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
    self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
    self.data_insts = data_insts
    self.data_bin = session.parallelize(data_insts, include_key=False, partition=16)
    self.grad_and_hess_list = [(random.random(), random.random()) for _ in range(1000)]
    self.grad_and_hess = session.parallelize(self.grad_and_hess_list, include_key=False, partition=16)
    bin_split_points = []
    for _ in range(10):
        bin_split_points.append(np.arange(6))
    self.bin_split_points = np.array(bin_split_points)
    self.bin_sparse = [0 for _ in range(10)]
def setUp(self): session.init("test_dataio_" + str(random.random())) self.data = [] self.data_with_value = [] for i in range(100): row = [] row_with_value = [] for j in range(100): if random.randint(1, 100) > 30: continue str_r = ''.join( random.sample(string.ascii_letters + string.digits, 10)) row.append(str_r) row_with_value.append(str_r + ':' + str(random.random())) self.data.append((i, ' '.join(row))) self.data_with_value.append((i, ' '.join(row_with_value))) self.table1 = session.parallelize(self.data, include_key=True, partition=16) self.table2 = session.parallelize(self.data_with_value, include_key=True, partition=16) self.args1 = {"data": {"data_io_0": {"data": self.table1}}} self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
def setUp(self): session.init("test_random_sampler") self.data = [(i * 10 + 5, i * i) for i in range(100)] self.table = session.parallelize(self.data, include_key=True, partition=16) self.data_to_trans = [(i * 10 + 5, i * i * i) for i in range(100)] self.table_trans = session.parallelize(self.data_to_trans, include_key=True, partition=16)
def setUp(self): session.init("test_least_abs_error_loss") self.lae_loss = LeastAbsoluteErrorLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def setUp(self): session.init("test_fair_loss") self.log_cosh_loss = LogCoshLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def setUp(self): session.init("test_fair_loss") self.rho = 0.5 self.tweedie_loss = TweedieLoss(self.rho) self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def setUp(self): session.init("test_huber_loss") self.delta = 1 self.huber_loss = HuberLoss(self.delta) self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def _generate_batch_data_with_batch_ids(data_insts, batch_ids, masked_ids=None):
    # Turn the batch ids into a table, then join it against the full data so
    # only this batch's rows (with their original values) survive.
    batch_index_table = session.parallelize(batch_ids, include_key=True,
                                            partition=data_insts.partitions)
    batch_data_table = batch_index_table.join(data_insts, lambda x, y: y)
    if masked_ids:
        masked_index_table = session.parallelize(masked_ids, include_key=True,
                                                 partition=data_insts.partitions)
        return masked_index_table, batch_data_table
    return batch_index_table, batch_data_table
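# Hedged usage sketch (hypothetical data, assuming an initialized computing
# session): joining three batch ids against ten rows keeps exactly those keys.
def _batch_select_sketch():
    data_insts = session.parallelize([(i, i * i) for i in range(10)],
                                     include_key=True, partition=2)
    batch_ids = [(1, None), (3, None), (5, None)]
    _, batch_data = _generate_batch_data_with_batch_ids(data_insts, batch_ids)
    assert sorted(batch_data.collect()) == [(1, 1), (3, 9), (5, 25)]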
def setUp(self): session.init("test_stratified_sampler") self.data = [] self.data_to_trans = [] for i in range(1000): self.data.append((i, Instance(label=i % 4, features=i * i))) self.data_to_trans.append((i, Instance(features=i**3))) self.table = session.parallelize(self.data, include_key=True, partition=16) self.table_trans = session.parallelize(self.data_to_trans, include_key=True, partition=16)
def setUp(self): session.init("test_label_checker") self.small_label_set = [Instance(label=i % 5) for i in range(100)] self.classify_inst = session.parallelize(self.small_label_set, include_key=False, partition=16) self.regression_label = [ Instance(label=random.random()) for i in range(100) ] self.regression_inst = session.parallelize(self.regression_label, partition=16, include_key=False) self.classify_checker = ClassifyLabelChecker() self.regression_checker = RegressionLabelChecker()
def client_predict(self, data_inst):
    self.align_data_header(data_instances=data_inst, pre_header=self._header)
    data = self.data_converter.convert(
        data_inst,
        batch_size=self.batch_size,
        encode_label=self.encode_label,
        label_mapping=self._label_align_mapping,
    )
    predict = self.nn_model.predict(data)
    num_output_units = predict.shape[1]
    # a single output unit flattens to one scalar score per key
    if num_output_units == 1:
        kv = zip(data.get_keys(), map(lambda x: x.tolist()[0], predict))
    else:
        kv = zip(data.get_keys(), predict.tolist())
    pred_tbl = computing_session.parallelize(kv, include_key=True, partition=data_inst.partitions)
    classes = [0, 1] if num_output_units == 1 else list(range(num_output_units))
    return self.predict_score_to_output(
        data_inst,
        pred_tbl,
        classes=classes,
        threshold=self.param.predict_param.threshold,
    )
def setUp(self): session.init("test_dataio_" + str(random.random())) self.data = [] self.max_feature = -1 for i in range(100): row = [] label = i % 2 row.append(str(label)) dict = {} for j in range(20): x = random.randint(0, 1000) val = random.random() if x in dict: continue self.max_feature = max(self.max_feature, x) dict[x] = True row.append(":".join(map(str, [x, val]))) self.data.append((i, " ".join(row))) self.table = session.parallelize(self.data, include_key=True, partition=16) self.args = {"data": {"data_io_0": {"data": self.table}}}
def construct_table(histograms_dict, bin_split_points, valid_features,
                    partition, use_missing, n_final):
    get_obj = functools.partial(FastFeatureHistogram.get_obj, n_final=n_final)
    buf = []
    for nid in histograms_dict:
        valid_fid = 0
        for fid in range(len(valid_features)):
            if not valid_features[fid]:
                continue
            # one extra bin at the end holds the missing-value statistics
            feature_bin_num = len(bin_split_points[fid]) + int(use_missing)
            histogram = [[] for _ in range(feature_bin_num)]
            for bid in range(len(bin_split_points[fid])):
                grad = histograms_dict[nid][0][bid, valid_fid, 0]
                hess = histograms_dict[nid][0][bid, valid_fid, 1]
                cnt = histograms_dict[nid][1][bid, valid_fid]
                histogram[bid].extend([get_obj(grad), get_obj(hess), cnt])
            if use_missing:
                grad = histograms_dict[nid][0][-1, valid_fid, 0]
                hess = histograms_dict[nid][0][-1, valid_fid, 1]
                cnt = histograms_dict[nid][1][-1, valid_fid]
                histogram[-1].extend([get_obj(grad), get_obj(hess), cnt])
            # table is keyed by (node id, feature id)
            buf.append(((nid, fid), (fid, histogram)))
            valid_fid += 1
    return session.parallelize(buf, include_key=True, partition=partition)
def mapReducePartitions(self, mapper, reducer, **kwargs):
    def _mapper_wrapper(it):
        import uuid
        # Pre-combine within the partition, then key each partial result by a
        # per-partition uuid so results from different partitions do not collide.
        puid = str(uuid.uuid1())
        ret = {}
        for _k, _v in mapper(it):
            if _k not in ret:
                ret[_k] = _v
            else:
                ret[_k] = reducer(ret[_k], _v)
        return [((_k, puid), _v) for _k, _v in ret.items()]

    partitions = self._rp.get_partitions()
    mapped = self._rp.map_partitions(_mapper_wrapper)
    # Fold the per-partition partials into the final value for each key.
    reduced = {}
    for (k, _), v in mapped.get_all():
        if k not in reduced:
            reduced[k] = v
        else:
            reduced[k] = reducer(reduced[k], v)
    from fate_arch.session import computing_session
    return computing_session.parallelize(reduced.items(),
                                         partition=partitions,
                                         include_key=True)
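# Hedged usage sketch (hypothetical table contents, assuming an initialized
# computing session): count values per parity class over auto-keyed integers.
def _map_reduce_sketch():
    from fate_arch.session import computing_session
    table = computing_session.parallelize(range(10), include_key=False, partition=4)
    counts = table.mapReducePartitions(
        lambda it: ((v % 2, 1) for _, v in it),  # emit (parity, 1) per row
        lambda a, b: a + b)                      # sum within and across partitions
    assert dict(counts.collect()) == {0: 5, 1: 5}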
def predict(self, dataset, batch_size):
    if batch_size < 0:
        batch_size = len(dataset)
    dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, num_workers=1)
    results = []
    for x, _ in dataloader:  # labels are not needed for prediction
        results.append(self.pl_model(x).detach().numpy())
    predict = numpy.vstack(results)
    num_output_units = predict.shape[1]
    if num_output_units == 1:
        kv = zip(dataset.get_keys(), (x.tolist()[0] for x in predict))
    else:
        kv = zip(dataset.get_keys(), predict.tolist())
    partitions = getattr(dataset, "partitions", 10)
    pred_tbl = computing_session.parallelize(kv, include_key=True, partition=partitions)
    classes = [0, 1] if num_output_units == 1 else list(range(num_output_units))
    return pred_tbl, classes
def setUp(self):
    self.data_num = 1000
    self.feature_num = 200
    self.bin_num = 10
    final_result = []
    numpy_array = []
    for i in range(self.data_num):
        # skip ids in (100, 500) so the keys are non-contiguous
        if 100 < i < 500:
            continue
        tmp = i * np.ones(self.feature_num)
        inst = Instance(inst_id=i, features=tmp, label=i % 2)
        final_result.append((str(i), inst))
        numpy_array.append(tmp)
    table = session.parallelize(final_result, include_key=True, partition=10)
    header = ['x' + str(i) for i in range(self.feature_num)]
    self.table = table
    self.table.schema = {'header': header}
    self.numpy_table = np.array(numpy_array)
    self.cols = [1, 2]
def generate_mask_table(self):
    _mask_table = session.parallelize(self._mask, include_key=False, partition=self._partition)
    self._mask_table = _mask_table
    return _mask_table
def merge_splitinfo(self, splitinfo_guest, splitinfo_host, merge_host_split_only=False):
    LOGGER.info("merge splitinfo, merge_host_split_only is {}".format(merge_host_split_only))
    if merge_host_split_only:
        splitinfo_guest = [None for _ in range(len(splitinfo_host[0]))]
    # zip guest and host split infos so each row holds every party's candidate
    merge_infos = []
    for i in range(len(splitinfo_guest)):
        splitinfo = [splitinfo_guest[i]]
        for j in range(len(splitinfo_host)):
            splitinfo.append(splitinfo_host[j][i])
        merge_infos.append(splitinfo)
    splitinfo_guest_host_table = session.parallelize(merge_infos, include_key=False,
                                                     partition=self.data_bin.partitions)
    best_splitinfo_table = splitinfo_guest_host_table.mapValues(self.find_best_split_guest_and_host)
    best_splitinfos = [None for _ in range(len(merge_infos))]
    for idx, best_splitinfo in best_splitinfo_table.collect():
        best_splitinfos[idx] = best_splitinfo
    return best_splitinfos
def secure_aggregate_table(self, send_func, table, enable_secure_aggregate=True):
    """
    Secure-aggregate tables. The degree argument is unused here; it exists for extension.
    """
    LOGGER.debug(f"In secure aggregate, enable_secure_aggregate: {enable_secure_aggregate}")
    if enable_secure_aggregate:
        # Collect and sort all keys so every party generates its padding in the
        # same order.
        key_table = table.mapValues(lambda v: None)
        list_key = sorted(x[0] for x in key_table.collect())
        # Encrypting a zero vector yields pure random padding; the pads are
        # constructed so they cancel when the parties' tables are summed.
        zeros_table = np.zeros(len(list_key))
        rand_table = self._random_padding_cipher.encrypt(zeros_table)
        LOGGER.debug(f"rand_table: {rand_table}")
        rand_table = computing_session.parallelize(tuple(zip(list_key, rand_table)),
                                                   include_key=True,
                                                   partition=table.partitions)
        table = table.join(rand_table, lambda x, y: x + y)
        LOGGER.debug("Finish adding random numbers")
    send_func(table)
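# Hedged sketch in plain numpy of why the padding cancels (hypothetical
# two-party case; the real _random_padding_cipher derives pairwise pads):
def _padding_cancellation_sketch():
    rng = np.random.default_rng(0)
    pad = rng.random(4)
    party_a = np.array([1.0, 2.0, 3.0, 4.0]) + pad      # party A adds +pad
    party_b = np.array([10.0, 20.0, 30.0, 40.0]) - pad  # party B adds -pad
    # the sum the aggregator sees is exactly the sum of the plain tables
    assert np.allclose(party_a + party_b, [11.0, 22.0, 33.0, 44.0])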
def gen_data(self, data_num, feature_num, partition):
    data = []
    header = [str(i) for i in range(feature_num)]
    col_data = []
    for _ in range(feature_num - 1):
        # resample until the column mean is non-zero, so the coefficient of
        # variation below is well defined
        while True:
            col_1 = np.random.rand(data_num)
            if np.mean(col_1) != 0:
                break
        col_data.append(col_1)
    col_data.append(10 * np.ones(data_num))
    for key in range(data_num):
        data.append((key, Instance(features=np.array([col[key] for col in col_data]))))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    self.header = header
    self.coe_list = []
    for col in col_data:
        self.coe_list.append(np.std(col) / np.mean(col))
    return result
def init_query_points(self, partitions, split_num, error_rank=1, need_first=True):
    query_points = []
    for idx, col_name in enumerate(self.bin_inner_param.bin_names):
        max_value = self.max_values[idx]
        min_value = self.min_values[idx]
        # evenly spaced initial candidates between the column's min and max
        sps = np.linspace(min_value, max_value, split_num)
        if not need_first:
            sps = sps[1:]
        split_point_array = [SplitPointNode(sps[i], min_value, max_value,
                                            allow_error_rank=error_rank)
                             for i in range(len(sps))]
        query_points.append((col_name, split_point_array))
    query_points_table = session.parallelize(query_points, include_key=True,
                                             partition=partitions)
    return query_points_table
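# Hedged illustration of the candidate layout (hypothetical numbers): with
# split_num=5 over a column spanning [0, 8], the initial query points are
# evenly spaced, and need_first=False simply drops the minimum endpoint.
#   np.linspace(0, 8, 5)      -> [0., 2., 4., 6., 8.]
#   np.linspace(0, 8, 5)[1:]  -> [2., 4., 6., 8.]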
def predict(self, data_inst):
    LOGGER.debug('guest start to predict')
    data_loader_key = self.get_dataset_key(data_inst)
    data_inst_ = data_overview.header_alignment(data_inst, self.store_header)
    if data_loader_key in self.cache_dataloader:
        data_loader = self.cache_dataloader[data_loader_key]
    else:
        data_loader, _, _, _ = self.prepare_data(self.init_intersect_obj(), data_inst_,
                                                 guest_side=True)
        self.cache_dataloader[data_loader_key] = data_loader
    LOGGER.debug('try to get predict u from host, suffix is {}'.format((0, 'host_u')))
    host_predicts = self.transfer_variable.predict_host_u.get(idx=0, suffix=(0, 'host_u'))
    predict_score = np.matmul(host_predicts, self.phi.transpose())
    predicts = self.sigmoid(predict_score)  # convert to predict scores
    predicts = list(map(float, predicts))
    predict_tb = session.parallelize(zip(data_loader.get_overlap_keys(), predicts),
                                     include_key=True,
                                     partition=data_inst.partitions)
    threshold = self.predict_param.threshold
    predict_result = self.predict_score_to_output(data_inst_, predict_tb, classes=[0, 1],
                                                  threshold=threshold)
    LOGGER.debug('ftl guest prediction done')
    return predict_result
def gen_data(self, data_num, feature_num, partition, is_sparse=False, use_random=False):
    data = []
    shift_iter = 0
    header = [str(i) for i in range(feature_num)]
    for data_key in range(data_num):
        value = data_key % bin_num
        if value == 0:
            if shift_iter % bin_num == 0:
                value = bin_num - 1
            shift_iter += 1
        # feature generation is identical for the dense and sparse cases
        if not use_random:
            features = value * np.ones(feature_num)
        else:
            features = np.random.random(feature_num)
        if not is_sparse:
            inst = Instance(inst_id=data_key, features=features, label=data_key % 2)
        else:
            data_index = [x for x in range(feature_num)]
            sparse_inst = SparseVector(data_index, data=features, shape=10 * feature_num)
            inst = Instance(inst_id=data_key, features=sparse_inst, label=data_key % 2)
            header = [str(i) for i in range(feature_num * 10)]
        data.append((data_key, inst))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    return result
def setUp(self):
    self.data_num = 1000
    self.feature_num = 3
    self.cols = [0, 1, 2, 3]
    self.header = ['x' + str(i) for i in range(self.feature_num)]
    final_result = []
    for i in range(self.data_num):
        tmp = []
        for _ in range(self.feature_num):
            tmp.append(np.random.choice([1, 2, 3, 'test_str']))
        tmp = np.array(tmp)
        inst = Instance(inst_id=i, features=tmp, label=0)
        final_result.append((str(i), inst))
    table = session.parallelize(final_result, include_key=True, partition=10)
    table.schema = {"header": self.header}
    self.model_name = 'OneHotEncoder'
    self.table = table
    self.args = {"data": {self.model_name: {"data": table}}}