def setUp(self): session.init("test_cross_entropy") self.softmax_loss = SoftmaxCrossEntropyLoss() self.y_list = [i % 5 for i in range(100)] self.predict_list = [np.array([random.random() for i in range(5)]) for j in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def setUp(self):
    self.paillier_encrypt = PaillierEncrypt()
    self.paillier_encrypt.generate_key()
    # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
    self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()
    size = 10
    self.wx = session.parallelize(
        [self.paillier_encrypt.encrypt(i) for i in range(size)])
    self.en_sum_wx_square = session.parallelize(
        [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
    self.w = [i for i in range(size)]
    self.data_inst = session.parallelize([
        Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
        for i in range(size)
    ], partition=1)

    # test fore_gradient
    self.fore_gradient_local = [
        -0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75
    ]
    # test gradient
    self.gradient = [
        1.125, 1.125, 1.125, 1.125, 1.125,
        1.125, 1.125, 1.125, 1.125, 1.125
    ]
    self.gradient_fit_intercept = [
        1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
        1.125, 1.125, 1.125, 1.125, 1.125
    ]
    self.loss = 4.505647
def setUp(self): session.init("test_random_sampler") self.data = [(i * 10 + 5, i * i) for i in range(100)] self.table = session.parallelize(self.data, include_key=True) self.data_to_trans = [(i * 10 + 5, i * i * i) for i in range(100)] self.table_trans = session.parallelize(self.data_to_trans, include_key=True)
def setUp(self): session.init("test_cross_entropy") self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False, partition=16) self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
def setUp(self):
    self.data = []
    self.data_with_value = []
    for i in range(100):
        row = []
        row_with_value = []
        for j in range(100):
            if random.randint(1, 100) > 30:
                continue
            str_r = ''.join(
                random.sample(string.ascii_letters + string.digits, 10))
            row.append(str_r)
            row_with_value.append(str_r + ':' + str(random.random()))
        self.data.append((i, ' '.join(row)))
        self.data_with_value.append((i, ' '.join(row_with_value)))
    self.table1 = session.parallelize(self.data, include_key=True, partition=16)
    self.table2 = session.parallelize(self.data_with_value, include_key=True, partition=16)
    self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
    self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def setUp(self): session.init("test_instance") dense_inst = [] headers = ['x' + str(i) for i in range(20)] for i in range(100): inst = Instance(features=(i % 16 * np.ones(20))) dense_inst.append((i, inst)) self.dense_table = session.parallelize(dense_inst, include_key=True, partition=2) self.dense_table.schema = {'header': headers} self.sparse_inst = [] for i in range(100): dict = {} indices = [] data = [] for j in range(20): idx = random.randint(0, 29) if idx in dict: continue dict[idx] = 1 val = random.random() indices.append(idx) data.append(val) sparse_vec = SparseVector(indices, data, 30) self.sparse_inst.append((i, Instance(features=sparse_vec))) self.sparse_table = session.parallelize(self.sparse_inst, include_key=True) self.sparse_table.schema = {"header": ["fid" + str(i) for i in range(30)]}
def setUp(self):
    self.feature_histogram = FeatureHistogram()
    session.init("test_feature_histogram")
    data_insts = []
    for i in range(1000):
        indices = []
        data = []
        for j in range(10):
            x = random.randint(0, 5)
            if x != 0:
                data.append(x)
                indices.append(j)
        sparse_vec = SparseVector(indices, data, shape=10)
        data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
    self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
    self.data_insts = data_insts
    self.data_bin = session.parallelize(data_insts, include_key=False, partition=16)

    self.grad_and_hess_list = [(random.random(), random.random()) for i in range(1000)]
    self.grad_and_hess = session.parallelize(self.grad_and_hess_list,
                                             include_key=False, partition=16)

    bin_split_points = []
    for i in range(10):
        bin_split_points.append(np.array([i for i in range(5)]))
    self.bin_split_points = np.array(bin_split_points)
    self.bin_sparse = [0 for i in range(10)]
def setUp(self): session.init("test_least_abs_error_loss") self.lae_loss = LeastAbsoluteErrorLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False) self.predict = session.parallelize(self.predict_list, include_key=False)
def setUp(self): session.init("test_fair_loss") self.log_cosh_loss = LogCoshLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False) self.predict = session.parallelize(self.predict_list, include_key=False)
def predict(self, data_inst):
    data = self.data_converter.convert(data_inst,
                                       batch_size=self.batch_size,
                                       task_type=self.task_type)  # change
    predict = self.nn_model.predict(data)
    num_output_units = predict.shape[1]
    threshold = self.param.predict_param.threshold
    if num_output_units == 1:
        kv = [(x[0], (0 if x[1][0] <= threshold else 1, x[1][0].item()))
              for x in zip(data.get_keys(), predict)]
        pred_tbl = session.parallelize(kv, include_key=True)
        return data_inst.join(
            pred_tbl,
            lambda d, pred: [d.label, pred[0], pred[1], {"label": pred[0]}])
    else:
        kv = [(x[0], (x[1].argmax(), [float(e) for e in x[1]]))
              for x in zip(data.get_keys(), predict)]
        pred_tbl = session.parallelize(kv, include_key=True)
        return data_inst.join(
            pred_tbl,
            lambda d, pred: [
                d.label, pred[0].item(), pred[1][pred[0]] / sum(pred[1]),
                {"raw_predict": pred[1]}
            ])
def setUp(self): session.init("test_encrypt_mode_calculator") self.list_data = [] self.tuple_data = [] self.numpy_data = [] for i in range(30): list_value = [100 * i + j for j in range(20)] tuple_value = tuple(list_value) numpy_value = np.array(list_value, dtype="int") self.list_data.append(list_value) self.tuple_data.append(tuple_value) self.numpy_data.append(numpy_value) self.data_list = session.parallelize(self.list_data, include_key=False, partition=10) self.data_tuple = session.parallelize(self.tuple_data, include_key=False, partition=10) self.data_numpy = session.parallelize(self.numpy_data, include_key=False, partition=10)
def predict(self, data_inst):
    data = self.data_converter.convert(data_inst,
                                       batch_size=self.batch_size,
                                       encode_label=self.encode_label)
    predict = self.nn_model.predict(data)
    num_output_units = predict.shape[1]
    threshold = self.param.predict_param.threshold
    if num_output_units == 1:
        kv = [(x[0], (0 if x[1][0] <= threshold else 1, x[1][0].item()))
              for x in zip(data.get_keys(), predict)]
        pred_tbl = session.parallelize(
            kv, include_key=True, partition=data_inst.get_partitions())
        return data_inst.join(
            pred_tbl,
            lambda d, pred: [d.label, pred[0], pred[1],
                             {"0": 1 - pred[1], "1": pred[1]}])
    else:
        kv = [(x[0], (x[1].argmax(), [float(e) for e in x[1]]))
              for x in zip(data.get_keys(), predict)]
        pred_tbl = session.parallelize(
            kv, include_key=True, partition=data_inst.get_partitions())
        return data_inst.join(
            pred_tbl,
            lambda d, pred: [
                d.label, pred[0].item(), pred[1][pred[0]],
                {str(v): pred[1][v] for v in range(len(pred[1]))}
            ])
def setUp(self): session.init("test_huber_loss") self.delta = 1 self.huber_loss = HuberLoss(self.delta) self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False) self.predict = session.parallelize(self.predict_list, include_key=False)
def setUp(self): session.init("test_fair_loss") self.rho = 0.5 self.tweedie_loss = TweedieLoss(self.rho) self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = session.parallelize(self.y_list, include_key=False) self.predict = session.parallelize(self.predict_list, include_key=False)
def save_eval_result(self, eval_data):
    session.parallelize(
        [eval_data],
        include_key=False,
        name=self.workflow_param.evaluation_output_table,
        namespace=self.workflow_param.evaluation_output_namespace,
        error_if_exist=False,
        persistent=True)
def setUp(self): session.init("test_label_checker") self.small_label_set = [Instance(label=i % 5) for i in range(100)] self.classify_inst = session.parallelize(self.small_label_set, include_key=False) self.regression_label = [Instance(label=random.random()) for i in range(100)] self.regression_inst = session.parallelize(self.regression_label) self.classify_checker = ClassifyLabelChecker() self.regression_checker = RegressionLabelChecker()
def save_eval_result(self, eval_data):
    LOGGER.info("@ save evaluation result to table with namespace: {0} and name: {1}".format(
        self.workflow_param.evaluation_output_namespace,
        self.workflow_param.evaluation_output_table))
    session.parallelize([eval_data],
                        include_key=False,
                        name=self.workflow_param.evaluation_output_table,
                        namespace=self.workflow_param.evaluation_output_namespace,
                        error_if_exist=False,
                        persistent=True)
def setUp(self): session.init("test_stratified_sampler") self.data = [] self.data_to_trans = [] for i in range(1000): self.data.append((i, Instance(label=i % 4, features=i * i))) self.data_to_trans.append((i, Instance(features=i**3))) self.table = session.parallelize(self.data, include_key=True) self.table_trans = session.parallelize(self.data_to_trans, include_key=True)
def predict(self, data_inst): """ predicton function. Note that: GMF model use different DataConverter in evaluation and prediction procedure. :param data_inst: data instance :return: the prediction results """ LOGGER.info( f"data_inst type: {type(data_inst)}, size: {data_inst.count()}, table name: {data_inst.get_name()}" ) LOGGER.info(f"current flowid: {self.flowid}") if self.flowid == 'validate': # use GMFSequenceData in evaluation procedure (after training procedure) data = self.data_converter.convert( data_inst, batch_size=self.batch_size, neg_count=self.model_param.neg_count, training=True, flow_id=self.flowid) keys = data.get_keys() labels = data.get_validate_labels() label_data = fate_session.parallelize( zip(keys, labels), include_key=True, partition=data_inst._partitions) else: # use GMFSequencePredictData in prediction procedure data = self.data_converter.convert(data_inst, batch_size=self.batch_size, training=False) label_data = data_inst.map(lambda k, v: (k, v.features.astype(int).tolist()[2])) LOGGER.info(f"label_data example: {label_data.take(10)}") LOGGER.info( f"data example: {data_inst.first()[1].features.astype(int)}") LOGGER.info(f"converted data, size :{data.size}") predict = self._model.predict(data) LOGGER.info(f"predict shape: {predict.shape}") threshold = self.params.predict_param.threshold kv = [(x[0], (0 if x[1] <= threshold else 1, x[1].item())) for x in zip(data.get_keys(), predict)] pred_tbl = fate_session.parallelize(kv, include_key=True, partition=data_inst._partitions) pred_data = label_data.join( pred_tbl, lambda d, pred: [d, pred[0], pred[1], { "label": pred[0] }]) LOGGER.info(f"pred_data sample: {pred_data.take(20)}") return pred_data
def gen_data(self, data_num, feature_num, partition):
    data = []
    header = [str(i) for i in range(feature_num)]
    # col_2 = np.random.rand(data_num)
    col_data = []
    for _ in range(feature_num - 1):
        while True:
            col_1 = np.random.rand(data_num)
            if np.mean(col_1) != 0:
                break
        col_data.append(col_1)
    col_data.append(10 * np.ones(data_num))
    for key in range(data_num):
        data.append((key, np.array([col[key] for col in col_data])))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    self.header = header
    self.coe_list = []
    for col in col_data:
        self.coe_list.append(np.std(col) / np.mean(col))
    return result
def setUp(self):
    self.data_num = 100
    self.feature_num = 3
    self.cols = [0, 1, 2]
    self.header = ['x' + str(i) for i in range(self.feature_num)]
    final_result = []
    for i in range(self.data_num):
        tmp = []
        for _ in range(self.feature_num):
            tmp.append(np.random.choice([1, 2, 3]))
        tmp = np.array(tmp)
        inst = Instance(inst_id=i, features=tmp, label=0)
        tmp_pair = (str(i), inst)
        final_result.append(tmp_pair)
    table = session.parallelize(final_result, include_key=True, partition=10)
    table.schema = {"header": self.header}
    self.model_name = 'OneHotEncoder'
    self.table = table
    self.args = {"data": {self.model_name: {"data": table}}}
def setUp(self): # eggroll.init("123") self.data_num = 1000 self.feature_num = 200 self.bin_num = 10 final_result = [] numpy_array = [] for i in range(self.data_num): if 100 < i < 500: continue tmp = i * np.ones(self.feature_num) inst = Instance(inst_id=i, features=tmp, label=i % 2) tmp_pair = (str(i), inst) final_result.append(tmp_pair) numpy_array.append(tmp) table = session.parallelize(final_result, include_key=True, partition=10) header = ['x' + str(i) for i in range(self.feature_num)] self.table = table self.table.schema = {'header': header} self.numpy_table = np.array(numpy_array) self.cols = [1, 2]
def create_shared_gradient_table(gradients, index_list):
    indexed_instances = []
    for idx, grad in zip(index_list, gradients):
        indexed_instances.append((idx, grad))
    dtable = session.parallelize(indexed_instances, include_key=True)
    return dtable
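# Usage sketch for create_shared_gradient_table (hypothetical values, not from
# the source; assumes a FATE session is already initialized, as in the
# surrounding snippets). The helper zips gradients with their sample indices
# and parallelizes the pairs into a keyed dtable:
#
#     gradients = [np.array([0.1, -0.2]), np.array([0.05, 0.4])]
#     index_list = [3, 7]
#     grad_table = create_shared_gradient_table(gradients, index_list)
#     # grad_table.collect() -> [(3, array([0.1, -0.2])), (7, array([0.05, 0.4]))]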
def save_model_parameters(model_parameters, model_table_name, model_namespace):
    dtable = parallelize(model_parameters.items(),
                         include_key=True,
                         name=model_table_name,
                         namespace=model_namespace,
                         error_if_exist=True,
                         persistent=True)
    return dtable
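# Usage sketch for save_model_parameters (illustrative names, not from the
# source). The dict's items become the (key, value) rows of a persistent,
# named dtable; with error_if_exist=True, saving twice under the same
# name/namespace is expected to raise rather than overwrite:
#
#     model_parameters = {"weight": np.zeros(10), "intercept": 0.0}
#     save_model_parameters(model_parameters, "lr_model_table", "example_namespace")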
def predict(self, data_inst):
    keys, test_x, test_y = self._load_data(data_inst)
    self.set_partition(data_inst)
    preds = self.model.predict(test_x)
    predict_tb = session.parallelize(zip(keys, preds), include_key=True)

    if self.task_type == "regression":
        result = data_inst.join(
            predict_tb,
            lambda inst, predict: [inst.label, float(predict[0]), float(predict[0]),
                                   {"label": float(predict[0])}])
    else:
        if self.num_label > 2:
            result = data_inst.join(
                predict_tb,
                lambda inst, predict: [inst.label, int(np.argmax(predict)), float(np.max(predict)),
                                       dict([(str(idx), float(predict[idx]))
                                             for idx in range(predict.shape[0])])])
        else:
            threshold = self.predict_param.threshold
            result = data_inst.join(
                predict_tb,
                lambda inst, predict: [inst.label, 1 if predict[0] > threshold else 0,
                                       float(predict[0]),
                                       {"0": 1 - float(predict[0]), "1": float(predict[0])}])
    return result
def test_sparse_abnormal_data(self):
    final_result = []
    numpy_array = []
    sparse_inst_shape = self.feature_num + 15
    indices = [x for x in range(self.feature_num + 10)]
    for i in range(self.data_num):
        tmp = 100 * np.random.rand(self.feature_num)
        tmp = [ik for ik in range(self.feature_num)]
        tmp[i % self.feature_num] = 'nan'
        # data_index = np.random.choice(indices, self.feature_num, replace=False)
        # data_index = sorted(data_index)
        data_index = [idx for idx in range(self.feature_num)]
        sparse_inst = SparseVector(data_index, tmp, shape=sparse_inst_shape)
        if i == 0:
            aa = sparse_inst.get_data(0, 'a')
            print('in for loop: {}, type: {}'.format(aa, type(aa)))
        inst = Instance(inst_id=i, features=sparse_inst, label=0)
        tmp_pair = (str(i), inst)
        final_result.append(tmp_pair)
        n = 0
        pointer = 0
        tmp_array = []
        while n < sparse_inst_shape:
            if n in data_index:
                tmp_array.append(tmp[pointer])
                pointer += 1
            else:
                tmp_array.append(0)
            n += 1
        numpy_array.append(tmp_array)
    abnormal_value = final_result[0][1].features.get_data(0, 'a')
    print('abnormal_value: {}, type: {}'.format(abnormal_value, type(abnormal_value)))
    table = session.parallelize(final_result, include_key=True, partition=1)
    header = ['x' + str(i) for i in range(sparse_inst_shape)]
    numpy_table = np.array(numpy_array)
    table.schema = {'header': header}
    self.used_data_set.append(table)
    bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
    split_points = bin_obj.fit_split_points(table)
    print('split_points: {}'.format(split_points))
    print(numpy_table)
    trans_result = bin_obj.transform(table, transform_cols_idx=-1, transform_type='bin_num')
    trans_result = trans_result.collect()
    print('transform result: ')
    for k, v in trans_result:
        value = v.features.get_all_data()
        value_list = []
        for value_k, value_v in value:
            value_list.append((value_k, value_v))
        print(k, value_list)
def setUp(self):
    self.data = []
    self.max_feature = -1
    for i in range(100):
        row = []
        label = i % 2
        row.append(str(label))
        used_idx = {}  # renamed from `dict` to avoid shadowing the built-in
        for j in range(20):
            x = random.randint(0, 1000)
            val = random.random()
            if x in used_idx:
                continue
            self.max_feature = max(self.max_feature, x)
            used_idx[x] = True
            row.append(":".join(map(str, [x, val])))
        self.data.append((i, " ".join(row)))
    self.table = session.parallelize(self.data, include_key=True, partition=16)
    self.args = {"data": {"data_io_0": {"data": self.table}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def setUp(self): # eggroll.init("123") self.data_num = 10 self.feature_num = 5 final_result = [] numpy_array = [] for i in range(self.data_num): tmp = np.random.rand(self.feature_num) inst = Instance(inst_id=i, features=tmp, label=0) tmp_pair = (str(i), inst) final_result.append(tmp_pair) numpy_array.append(tmp) table = session.parallelize(final_result, include_key=True, partition=10) header = ['x' + str(i) for i in range(self.feature_num)] self.col_dict = {} for idx, h in enumerate(header): self.col_dict[h] = idx self.table = table self.table.schema = {'header': header} self.numpy_table = np.array(numpy_array) self.cols = [1, 2] self.used_data_set = []
def transform(self, instance_table):
    """
    Transform instances into features.

    :param instance_table: dtable with a collection of (index, instance) pairs
    :return: a (dtable, index_tracking_list) tuple
    """
    start = time.time()
    LOGGER.debug("@ extract representative features from raw input")
    index_tracking_list = []
    indexed_instances = instance_table.collect()
    features_list = []
    instances_list = []
    for idx, inst in indexed_instances:
        index_tracking_list.append(idx)
        features_list.append(inst.features)
        instances_list.append(inst)
    raw_features = np.array(features_list)
    trans_features = self.model.transform(raw_features)
    indexed_instances = []
    for idx, inst, feat in zip(index_tracking_list, instances_list, trans_features):
        inst.set_feature(feat)
        indexed_instances.append((idx, inst))
    dtable = session.parallelize(indexed_instances,
                                 include_key=True,
                                 partition=instance_table._partitions)
    end = time.time()
    LOGGER.debug("@ transform time: " + str(end - start))
    return dtable, index_tracking_list
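# Usage sketch for transform (illustrative; assumes `self.model` exposes a
# scikit-learn style transform(), e.g. a fitted PCA, and that instance_table
# holds (index, Instance) pairs like those built in the setUp snippets above):
#
#     trans_table, order = extractor.transform(instance_table)
#
# `order` records the key order used when the features were stacked into a
# numpy array, so downstream code can align array results back to dtable keys.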
def federated_find_split(self, dep=-1, batch=-1):
    LOGGER.info("federated find split of depth {}, batch {}".format(dep, batch))
    encrypted_splitinfo_host = self.sync_encrypted_splitinfo_host(dep, batch)

    for i in range(len(encrypted_splitinfo_host)):
        init_gain = self.min_impurity_split - consts.FLOAT_ZERO
        encrypted_init_gain = self.encrypter.encrypt(init_gain)
        best_splitinfo_host = [[-1, encrypted_init_gain]
                               for j in range(len(self.cur_split_nodes))]
        best_gains = [init_gain for j in range(len(self.cur_split_nodes))]
        max_nodes = max(len(encrypted_splitinfo_host[i][j])
                        for j in range(len(self.cur_split_nodes)))
        for k in range(0, max_nodes, consts.MAX_FEDERATED_NODES):
            batch_splitinfo_host = [encrypted_splitinfo[k: k + consts.MAX_FEDERATED_NODES]
                                    for encrypted_splitinfo in encrypted_splitinfo_host[i]]
            encrypted_splitinfo_host_table = session.parallelize(
                zip(self.cur_split_nodes, batch_splitinfo_host),
                include_key=False,
                partition=self.data_bin._partitions)
            splitinfos = encrypted_splitinfo_host_table.mapValues(self.find_host_split).collect()
            # renamed loop key from `_` to `node_idx`: it is a real index, not a throwaway
            for node_idx, splitinfo in splitinfos:
                if best_splitinfo_host[node_idx][0] == -1:
                    best_splitinfo_host[node_idx] = list(splitinfo[:2])
                    best_gains[node_idx] = splitinfo[2]
                elif splitinfo[0] != -1 and splitinfo[2] > best_gains[node_idx]:
                    best_splitinfo_host[node_idx][0] = k + splitinfo[0]
                    best_splitinfo_host[node_idx][1] = splitinfo[1]
                    best_gains[node_idx] = splitinfo[2]

        self.sync_federated_best_splitinfo_host(best_splitinfo_host, dep, batch, i)