def split(self, data_inst, shuffle=True):
    """Yield (train_data, test_data) DTable pairs, one per KFold fold.

    Sample ids are collected from the input table, optionally shuffled,
    split with sklearn's KFold, and each id subset is joined back against
    the input to materialise the fold tables.
    """
    header = data_inst.schema.get('header')
    sid_iter, _data_size = collect_index(data_inst)
    sids = np.array([sid for sid, _ in sid_iter])
    if shuffle:
        np.random.shuffle(sids)
    folds = sk_KFold(n_splits=self.n_splits)
    for train_idx, test_idx in folds.split(sids):
        fold_tables = []
        for idx in (train_idx, test_idx):
            # Keys must be strings to match the input table's keys.
            id_pairs = [(str(sid), 1) for sid in sids[idx]]
            id_table = eggroll.parallelize(id_pairs,
                                           include_key=True,
                                           partition=data_inst._partitions)
            subset = data_inst.join(id_table, lambda x, y: x)
            subset.schema['header'] = header
            fold_tables.append(subset)
        yield fold_tables[0], fold_tables[1]
def setUp(self):
    """Build list/tuple/ndarray variants of the same rows and parallelize each."""
    eggroll.init("test_encrypt_mode_calculator")
    self.list_data = []
    self.tuple_data = []
    self.numpy_data = []
    for row in range(30):
        values = [100 * row + col for col in range(20)]
        self.list_data.append(values)
        self.tuple_data.append(tuple(values))
        self.numpy_data.append(np.array(values, dtype="int"))
    self.data_list = eggroll.parallelize(self.list_data, include_key=False, partition=10)
    self.data_tuple = eggroll.parallelize(self.tuple_data, include_key=False, partition=10)
    self.data_numpy = eggroll.parallelize(self.numpy_data, include_key=False, partition=10)
def setUp(self):
    """Create one dense table and one sparse table of Instance rows."""
    eggroll.init("test_instance")
    dense_pairs = [(i, Instance(features=(i % 16 * np.ones(20)))) for i in range(100)]
    self.dense_table = eggroll.parallelize(dense_pairs, include_key=True, partition=2)
    sparse_pairs = []
    first_col = []
    for i in range(100):
        idx_list, val_list = [], []
        for j in range(20):
            val = ((i + 5) ** 3 + (j + 1) ** 4) % 16
            if val > 0:
                # Only non-zero entries are stored in the sparse vector.
                idx_list.append(j)
                val_list.append(val)
            if j == 0:
                first_col.append(val)
        sparse_pairs.append((i, Instance(features=SparseVector(idx_list, val_list, 20))))
    self.sparse_inst = sparse_pairs
    self.sparse_table = eggroll.parallelize(sparse_pairs, include_key=True, partition=1)
def setUp(self):
    """Build dense tables (Instance and raw ndarray flavours) plus summary objects."""
    eggroll.init("test_instance")
    headers = ['x' + str(i) for i in range(20)]
    self.header = headers
    self.eps = 1e-5
    self.count = 100
    inst_pairs = []
    raw_pairs = []
    rows = []
    for i in range(self.count):
        features = i % 16 * np.ones(20)
        inst_pairs.append((i, Instance(features=features)))
        rows.append(features)
        raw_pairs.append((i, features))
    self.dense_inst = inst_pairs
    self.dense_not_inst = raw_pairs
    # Column-major copy of the data for per-feature reference checks.
    self.dense_data_transpose = np.array(rows).transpose()
    self.dense_table = eggroll.parallelize(inst_pairs, include_key=True, partition=5)
    self.dense_not_inst_table = eggroll.parallelize(raw_pairs, include_key=True, partition=5)
    self.dense_table.schema = {'header': headers}
    self.dense_not_inst_table.schema = {'header': headers}
    self.col_index = [1, 2, 3]
    self.summary_obj = MultivariateStatisticalSummary(self.dense_table,
                                                      self.col_index,
                                                      abnormal_list=[None])
    self.summary_obj_not_inst = MultivariateStatisticalSummary(self.dense_not_inst_table,
                                                               self.col_index,
                                                               abnormal_list=[None])
def setUp(self):
    """Encrypted fixture inputs plus hand-computed expected gradients/loss."""
    self.paillier_encrypt = PaillierEncrypt()
    self.paillier_encrypt.generate_key()
    self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
    size = 10
    encrypt = self.paillier_encrypt.encrypt
    self.wx = eggroll.parallelize([encrypt(v) for v in range(size)])
    self.en_sum_wx_square = eggroll.parallelize([encrypt(np.square(v)) for v in range(size)])
    self.w = list(range(size))
    instances = [Instance(features=[1 for _ in range(size)], label=pow(-1, v % 2))
                 for v in range(size)]
    self.data_inst = eggroll.parallelize(instances, partition=1)
    # Expected fore_gradient, one value per sample.
    self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
    # Expected gradient is uniform; the intercept variant has one extra entry.
    self.gradient = [1.125] * 10
    self.gradient_fit_intercept = [1.125] * 11
    self.loss = 4.505647
def setUp(self):
    """Random sparse instances with grad/hess pairs for histogram tests."""
    self.feature_histogram = FeatureHistogram()
    eggroll.init("test_feature_histogram")
    data_insts = []
    for _ in range(1000):
        indices, data = [], []
        for feat_idx in range(10):
            feat_val = random.randint(0, 5)
            if feat_val != 0:
                # Keep only non-zero entries, as a sparse vector would.
                data.append(feat_val)
                indices.append(feat_idx)
        vec = SparseVector(indices, data, shape=10)
        data_insts.append((Instance(features=vec), (1, random.randint(0, 3))))
    self.node_map = {node: node for node in range(4)}
    self.data_insts = data_insts
    self.data_bin = eggroll.parallelize(data_insts, include_key=False)
    self.grad_and_hess_list = [(random.random(), random.random()) for _ in range(1000)]
    self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)
    # Identical split points [0..4] for each of the 10 features.
    self.bin_split_points = np.array([np.array(list(range(5))) for _ in range(10)])
    self.bin_sparse = [0] * 10
def setUp(self):
    """Two keyed tables sharing keys i*10+5, with values i^2 and i^3."""
    eggroll.init("test_random_sampler")
    self.data = [(10 * idx + 5, idx ** 2) for idx in range(100)]
    self.table = eggroll.parallelize(self.data, include_key=True)
    self.data_to_trans = [(10 * idx + 5, idx ** 3) for idx in range(100)]
    self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
def setUp(self):
    """Dense table of 100 Instances plus a random sparse table with schemas.

    Fix: the original used a local variable named ``dict`` (shadowing the
    builtin) as a de-duplication lookup; replaced with a proper set.
    """
    eggroll.init("test_instance")
    dense_inst = []
    headers = ['x' + str(i) for i in range(20)]
    for i in range(100):
        inst = Instance(features=(i % 16 * np.ones(20)))
        dense_inst.append((i, inst))
    self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=2)
    self.dense_table.schema = {'header': headers}
    self.sparse_inst = []
    for i in range(100):
        seen_idx = set()  # indices already drawn for this row
        indices = []
        data = []
        for j in range(20):
            idx = random.randint(0, 29)
            if idx in seen_idx:
                # Duplicate draw: skip so each feature index appears once.
                continue
            seen_idx.add(idx)
            val = random.random()
            indices.append(idx)
            data.append(val)
        sparse_vec = SparseVector(indices, data, 30)
        self.sparse_inst.append((i, Instance(features=sparse_vec)))
    self.sparse_table = eggroll.parallelize(self.sparse_inst, include_key=True)
    self.sparse_table.schema = {
        "header": ["fid" + str(i) for i in range(30)]
    }
def setUp(self):
    """Alternating labels and random predictions for LAE loss tests."""
    eggroll.init("test_least_abs_error_loss")
    self.lae_loss = LeastAbsoluteErrorLoss()
    self.y_list = [label % 2 for label in range(100)]
    self.predict_list = [random.random() for _ in range(100)]
    self.y = eggroll.parallelize(self.y_list, include_key=False)
    self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self):
    """Alternating labels and random predictions for log-cosh loss tests."""
    eggroll.init("test_fair_loss")
    self.log_cosh_loss = LogCoshLoss()
    self.y_list = [label % 2 for label in range(100)]
    self.predict_list = [random.random() for _ in range(100)]
    self.y = eggroll.parallelize(self.y_list, include_key=False)
    self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self):
    """Alternating labels and random predictions for sigmoid BCE loss tests."""
    eggroll.init("test_cross_entropy")
    self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss()
    self.y_list = [label % 2 for label in range(100)]
    self.predict_list = [random.random() for _ in range(100)]
    self.y = eggroll.parallelize(self.y_list, include_key=False)
    self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def save_eval_result(self, eval_data):
    """Persist the evaluation result as a one-row table in the configured store."""
    wf_param = self.workflow_param
    eggroll.parallelize([eval_data],
                        include_key=False,
                        name=wf_param.evaluation_output_table,
                        namespace=wf_param.evaluation_output_namespace,
                        error_if_exist=False,
                        persistent=True)
def setUp(self):
    """Huber loss fixture with delta=1 plus labels/predictions tables."""
    eggroll.init("test_huber_loss")
    self.delta = 1
    self.huber_loss = HuberLoss(self.delta)
    self.y_list = [label % 2 for label in range(100)]
    self.predict_list = [random.random() for _ in range(100)]
    self.y = eggroll.parallelize(self.y_list, include_key=False)
    self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self):
    """Wrap two raw csv-like tables in the runtime `args` dict layout."""
    rows_full = [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")]
    self.table1 = eggroll.parallelize(rows_full, include_key=True)
    rows_missing = [("a", '-1,,na,null,null,2')]
    self.table2 = eggroll.parallelize(rows_missing, include_key=True)
    self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
    self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def setUp(self):
    """Classification and regression label tables plus their checkers.

    Fix: the regression parallelize call omitted ``include_key=False``;
    it is now explicit, consistent with the classify_y call above.
    (include_key=False is assumed to be eggroll's default here, so this
    only removes the inconsistency, not a behavior change.)
    """
    eggroll.init("test_label_checker")
    self.small_label_set = [i % 5 for i in range(100)]
    self.classify_y = eggroll.parallelize(self.small_label_set, include_key=False)
    self.regression_label = [random.random() for i in range(100)]
    self.regression_y = eggroll.parallelize(self.regression_label, include_key=False)
    self.classify_checker = ClassifyLabelChecker()
    self.regression_checker = RegressionLabelChecker()
def setUp(self):
    """Tweedie loss fixture with rho=0.5 plus labels/predictions tables."""
    eggroll.init("test_fair_loss")
    self.rho = 0.5
    self.tweedie_loss = TweedieLoss(self.rho)
    self.y_list = [label % 2 for label in range(100)]
    self.predict_list = [random.random() for _ in range(100)]
    self.y = eggroll.parallelize(self.y_list, include_key=False)
    self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self):
    """Five-class labels and random 5-way score vectors for softmax CE tests."""
    eggroll.init("test_cross_entropy")
    self.softmax_loss = SoftmaxCrossEntropyLoss()
    self.y_list = [label % 5 for label in range(100)]
    self.predict_list = [np.array([random.random() for _ in range(5)])
                         for _ in range(100)]
    self.y = eggroll.parallelize(self.y_list, include_key=False)
    self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self):
    """1000 labelled instances plus a parallel table used for transform tests."""
    eggroll.init("test_stratified_sampler")
    self.data = [(idx, Instance(label=idx % 4, features=idx * idx))
                 for idx in range(1000)]
    self.data_to_trans = [(idx, Instance(features=idx ** 3))
                          for idx in range(1000)]
    self.table = eggroll.parallelize(self.data, include_key=True)
    self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
def setUp(self):
    """Save two small raw-text tables for DataIO to read back."""
    eggroll.init("test_dataio" + str(int(time.time())))
    self.table = "dataio_table_test"
    self.namespace = "dataio_test"
    rows = [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")]
    eggroll.parallelize(rows, include_key=True).save_as(self.table, self.namespace)
    self.table2 = "dataio_table_test2"
    self.namespace2 = "dataio_test2"
    rows2 = [("a", '-1,,NA,NULL,null,2')]
    eggroll.parallelize(rows2, include_key=True).save_as(self.table2, self.namespace2)
def save_eval_result(self, eval_data):
    """Log the destination table, then persist the evaluation result there."""
    wf_param = self.workflow_param
    LOGGER.info(
        "@ save evaluation result to table with namespace: {0} and name: {1}"
        .format(wf_param.evaluation_output_namespace,
                wf_param.evaluation_output_table))
    eggroll.parallelize([eval_data],
                        include_key=False,
                        name=wf_param.evaluation_output_table,
                        namespace=wf_param.evaluation_output_namespace,
                        error_if_exist=False,
                        persistent=True)
def test_sparse_abnormal_data(self):
    """Binning on sparse instances containing an abnormal ('nan') value.

    Builds one sparse Instance per row where exactly one feature is the
    string 'nan', fits split points with 'nan' declared abnormal, then
    prints the bin-number transform of every row for manual inspection.

    Fix: removed a dead ``tmp = 100 * np.random.rand(...)`` assignment
    (immediately overwritten) and the unused ``indices`` list.
    """
    final_result = []
    numpy_array = []
    # Sparse shape is wider than the populated prefix, so trailing
    # columns are implicit zeros.
    sparse_inst_shape = self.feature_num + 15
    for i in range(self.data_num):
        # Row values are 0..feature_num-1 with one cell forced to 'nan'.
        tmp = [ik for ik in range(self.feature_num)]
        tmp[i % self.feature_num] = 'nan'
        data_index = [idx for idx in range(self.feature_num)]
        sparse_inst = SparseVector(data_index, tmp, shape=sparse_inst_shape)
        if i == 0:
            aa = sparse_inst.get_data(0, 'a')
            print('in for loop: {}, type: {}'.format(aa, type(aa)))
        inst = Instance(inst_id=i, features=sparse_inst, label=0)
        final_result.append((str(i), inst))
        # Densify the row (zeros where the sparse vector has no entry)
        # to build the reference numpy table printed below.
        pointer = 0
        tmp_array = []
        for n in range(sparse_inst_shape):
            if n in data_index:
                tmp_array.append(tmp[pointer])
                pointer += 1
            else:
                tmp_array.append(0)
        numpy_array.append(tmp_array)
    abnormal_value = final_result[0][1].features.get_data(0, 'a')
    print('abnormal_value: {}, type: {}'.format(abnormal_value, type(abnormal_value)))
    table = eggroll.parallelize(final_result, include_key=True, partition=1)
    header = ['x' + str(i) for i in range(sparse_inst_shape)]
    numpy_table = np.array(numpy_array)
    table.schema = {'header': header}
    self.used_data_set.append(table)
    bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
    split_points = bin_obj.fit_split_points(table)
    print('split_points: {}'.format(split_points))
    print(numpy_table)
    trans_result = bin_obj.transform(table, transform_cols_idx=-1, transform_type='bin_num')
    trans_result = trans_result.collect()
    print('transform result: ')
    for k, v in trans_result:
        value = v.features.get_all_data()
        value_list = []
        for value_k, value_v in value:
            value_list.append((value_k, value_v))
        print(k, value_list)
def save_model_parameters(model_parameters, model_table_name, model_namespace):
    """Persist the model-parameter mapping as a keyed, persistent table.

    :param model_parameters: dict of parameter name -> value
    :param model_table_name: destination table name
    :param model_namespace: destination namespace
    :return: the created DTable
    """
    return parallelize(model_parameters.items(),
                       include_key=True,
                       name=model_table_name,
                       namespace=model_namespace,
                       error_if_exist=True,
                       persistent=True)
def sample_data(data_instance, bin_sample_num=DEFAULT_BIN_SAMPLE_NUM):
    """
    Sample data from a dtable.

    Parameters
    ----------
    data_instance : DTable
        The input data

    bin_sample_num : int, max number of data to be sampled to generate bin
        split points

    Returns
    -------
    sample_data: list, element is a (id, instance) tuple

    Fixes: removed the no-op ``[(key, _) for (key, _) in ...]`` rebuild of
    an already-materialised list, and corrected the "fsample" log typo.
    """
    LOGGER.info("sample data set")
    # Keep only the keys (values dropped) to sample ids cheaply.
    data_key_none_value = data_instance.mapValues(lambda value: None)
    keyed_nones = list(data_key_none_value.collect())
    data_num = len(keyed_nones)
    if data_num <= bin_sample_num:
        # Small enough: take everything.
        data_keys = keyed_nones
    else:
        sample_idxs = np.random.choice(data_num, bin_sample_num, replace=False)
        data_keys = [keyed_nones[idx] for idx in sample_idxs]
    data_key_table = eggroll.parallelize(data_keys, include_key=True)
    # Join back to recover the full instances for the sampled ids.
    sample_data = list(
        data_key_table.join(data_instance, lambda x, y: y).collect())
    return sample_data
def setUp(self):
    """1000 random dense instances keyed by stringified row index."""
    self.data_num = 1000
    self.feature_num = 20
    final_result = []
    numpy_array = []
    for row in range(self.data_num):
        features = np.random.rand(self.feature_num)
        numpy_array.append(features)
        final_result.append((str(row), Instance(inst_id=row, features=features, label=0)))
    table = eggroll.parallelize(final_result, include_key=True, partition=10)
    header = ['x' + str(i) for i in range(self.feature_num)]
    self.col_dict = {name: pos for pos, name in enumerate(header)}
    self.table = table
    self.table.schema = {'header': header}
    self.numpy_table = np.array(numpy_array)
    self.cols = [1, 2]
    self.used_data_set = []
def transform(self, instance_table):
    """
    Transform each instance's features with self.model.

    Parameters
    ----------
    :param instance_table: dtable with a collection of (index, instance) pairs
    :return: (dtable of transformed instances, list of indexes in collection order)
    """
    LOGGER.debug("@ extract representative features from raw input")
    index_tracking_list = []
    instances_list = []
    features_list = []
    for idx, inst in instance_table.collect():
        index_tracking_list.append(idx)
        instances_list.append(inst)
        features_list.append(inst.features)
    # One batched model call over the stacked raw features.
    trans_features = self.model.transform(np.array(features_list))
    transformed = []
    for idx, inst, feat in zip(index_tracking_list, instances_list, trans_features):
        # Instances are updated in place with their transformed features.
        inst.set_feature(feat)
        transformed.append((idx, inst))
    dtable = eggroll.parallelize(transformed,
                                 include_key=True,
                                 partition=instance_table._partitions)
    return dtable, index_tracking_list
def create_shared_gradient_table(gradients, index_list):
    """Pair each gradient with its sample index and parallelize into a DTable.

    :param gradients: iterable of gradient values
    :param index_list: iterable of sample indexes, aligned with gradients
    :return: keyed DTable of (index, gradient) pairs

    Fix: the manual zip/append loop is replaced by ``list(zip(...))``,
    which produces the identical (idx, grad) pair list.
    """
    return eggroll.parallelize(list(zip(index_list, gradients)), include_key=True)
def setUp(self):
    """100 instances with three categorical features drawn from {1, 2, 3}."""
    self.data_num = 100
    self.feature_num = 3
    self.cols = [0, 1, 2]
    self.header = ['x' + str(i) for i in range(self.feature_num)]
    final_result = []
    for row in range(self.data_num):
        features = np.array([np.random.choice([1, 2, 3])
                             for _ in range(self.feature_num)])
        inst = Instance(inst_id=row, features=features, label=0)
        final_result.append((str(row), inst))
    table = eggroll.parallelize(final_result, include_key=True, partition=10)
    table.schema = {"header": self.header}
    self.model_name = 'OneHotEncoder'
    self.table = table
    self.args = {"data": {self.model_name: {"data": table}}}
def create_table(data, indexes=None, model_table_name=None, model_namespace=None, persistent=False):
    """Create a DTable from data, optionally keyed by an index list.

    :param data: sequence of values (indexable when indexes is given)
    :param indexes: optional keys; data[i] is stored under indexes[i]
    :param model_table_name: destination table name
    :param model_namespace: destination namespace
    :param persistent: whether the table outlives the session
    :return: the created DTable

    Fix: the manual enumerate loop building an intermediate dict is
    replaced by a dict comprehension (same pairing, same duplicate-key
    overwrite semantics).
    """
    if indexes is None:
        return parallelize(data,
                           include_key=False,
                           name=model_table_name,
                           namespace=model_namespace,
                           error_if_exist=True,
                           persistent=persistent)
    keyed_data = {index: data[i] for i, index in enumerate(indexes)}
    return parallelize(keyed_data.items(),
                       include_key=True,
                       name=model_table_name,
                       namespace=model_namespace,
                       error_if_exist=True,
                       persistent=persistent)
def merge_splitinfo(self, splitinfo_guest, splitinfo_host):
    """Pick the best split from each paired guest/host candidate."""
    LOGGER.info("merge splitinfo")
    paired = zip(splitinfo_guest, splitinfo_host)
    paired_table = eggroll.parallelize(paired,
                                       include_key=False,
                                       partition=self.data_bin._partitions)
    best_table = paired_table.mapValues(self.find_best_split_guest_and_host)
    return [value for _, value in best_table.collect()]
def mini_batch_index_generator(self, data_inst=None, batch_size=320):
    """Yield one DTable of batch sample ids per mini-batch.

    Batch ids are (re)computed only when a data_inst is supplied or the
    requested batch_size differs from the cached one; otherwise the
    previously cached self.batch_data_sids is reused.

    NOTE(review): when called with data_inst=None and an unchanged
    batch_size, the loop below still reads data_inst._partitions, which
    would raise AttributeError — confirm callers always pass data_inst.
    """
    # Regenerate the cached batch ids if the input or batch size changed.
    if data_inst is not None or batch_size != self.batch_size:
        self.batch_data_sids = self.__mini_batch_data_seperator(data_inst, batch_size)
        self.batch_size = batch_size
    batch_data_sids = self.batch_data_sids
    for bid in range(len(batch_data_sids)):
        index_data = batch_data_sids[bid]
        # Materialise this batch's ids with the same partitioning as the
        # input table so downstream joins stay partition-aligned.
        index_table = eggroll.parallelize(index_data, include_key=True,
                                          partition=data_inst._partitions)
        yield index_table