def get_table_count(name, namespace):
    from arch.api import eggroll
    eggroll.init("get_intersect_output", mode=1)
    table = eggroll.table(name, namespace)
    count = table.count()
    print("table count: {}".format(count))
    return count
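# A minimal usage sketch (assumed, not from the source): the table name and
# namespace below are hypothetical placeholders for an existing eggroll table.
if __name__ == "__main__":
    n = get_table_count("guest_intersect_output_demo", "fdp_output_namespace")
    print("rows: {}".format(n))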
def setUp(self): eggroll.init("test_instance") dense_inst = [] for i in range(100): inst = Instance(features=(i % 16 * np.ones(20))) dense_inst.append((i, inst)) self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=2) sparse_inst = [] col_zero = [] for i in range(100): indices = [] data = [] for j in range(20): val = ((i + 5)**3 + (j + 1)**4) % 16 if val > 0: indices.append(j) data.append(val) if j == 0: col_zero.append(val) sparse_vec = SparseVector(indices, data, 20) inst = Instance(features=sparse_vec) sparse_inst.append((i, inst)) self.sparse_inst = sparse_inst self.sparse_table = eggroll.parallelize(sparse_inst, include_key=True, partition=1)
def _init_argument(self):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True, type=str,
                        help="Specify a config json file path")
    parser.add_argument('-j', '--job_id', type=str, required=True,
                        help="Specify the job id")
    # parser.add_argument('-p', '--party_id', type=str, required=True, help="Specify the party id")
    # parser.add_argument('-l', '--LOGGER_path', type=str, required=True, help="Specify the LOGGER path")
    args = parser.parse_args()
    config_path = args.config
    self.config_path = config_path
    if not args.config:
        LOGGER.error("Config File should be provided")
        exit(-100)
    self.job_id = args.job_id

    all_checker = AllChecker(config_path)
    all_checker.check_all()

    self._initialize(config_path)
    with open(config_path) as conf_f:
        runtime_json = json.load(conf_f)
    eggroll.init(self.job_id, self.workflow_param.work_mode)
    LOGGER.debug("The job id is {}".format(self.job_id))
    federation.init(self.job_id, runtime_json)
    LOGGER.debug("Finish eggroll and federation init")
    self._init_pipeline()
def _init_argument(self):
    self._init_LOGGER(LOGGER_path)
    self._initialize(config_path)
    with open(config_path) as conf_f:
        runtime_json = json.load(conf_f)
    eggroll.init(job_id)
    federation.init(job_id, runtime_json)
def do_export_file(job_id, _data):
    try:
        work_mode = _data.get("work_mode")
        name = _data.get("name")
        namespace = _data.get("namespace")
        delimitor = _data.get("delimitor", ",")
        output_path = _data.get("output_path")
        eggroll.init(job_id, work_mode)
        with open(os.path.abspath(output_path), "w") as fout:
            data_table = storage.get_data_table(name=name, namespace=namespace)
            print('===== begin to export data =====')
            lines = 0
            for key, value in data_table.collect():
                if not value:
                    fout.write(key + "\n")
                else:
                    fout.write(key + delimitor + value + "\n")
                lines += 1
                if lines % 2000 == 0:
                    print("===== export {} lines =====".format(lines))
            print("===== export {} lines totally =====".format(lines))
            print('===== export data finish =====')
    except Exception:
        raise ValueError("cannot export data, please check json file")
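# A hedged invocation sketch: every value below (job id, table name, namespace,
# output path) is a hypothetical placeholder, not a value from the source.
do_export_file("export_job_demo", {
    "work_mode": 0,
    "name": "demo_table",
    "namespace": "demo_namespace",
    "delimitor": ",",
    "output_path": "/tmp/demo_table_export.csv",
})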
def setUp(self): eggroll.init("test_encrypt_mode_calculator") self.list_data = [] self.tuple_data = [] self.numpy_data = [] for i in range(30): list_value = [100 * i + j for j in range(20)] tuple_value = tuple(list_value) numpy_value = np.array(list_value, dtype="int") self.list_data.append(list_value) self.tuple_data.append(tuple_value) self.numpy_data.append(numpy_value) self.data_list = eggroll.parallelize(self.list_data, include_key=False, partition=10) self.data_tuple = eggroll.parallelize(self.tuple_data, include_key=False, partition=10) self.data_numpy = eggroll.parallelize(self.numpy_data, include_key=False, partition=10)
def data_to_eggroll_table(data, namespace, table_name, partition=1, work_mode=0):
    eggroll.init(mode=work_mode)
    data_table = eggroll.table(table_name, namespace, partition=partition,
                               create_if_missing=True, error_if_exist=False)
    data_table.put_all(data)
    print("------------load data finish!-----------------")
    print("total data_count: " + str(data_table.count()))
    print("namespace: %s, table_name: %s" % (namespace, table_name))
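# A minimal loading sketch (namespace and table name are assumed placeholders):
# put_all expects an iterable of (key, value) pairs, as used above.
rows = [(str(i), "%d,%d" % (i, i * i)) for i in range(10)]
data_to_eggroll_table(rows, "demo_namespace", "demo_table", partition=1, work_mode=0)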
def get_cross_size(gpid, guid, g_table, hpid, huid, h_table):
    job_id = gen_job_id(hpid, gpid)
    roles = {consts.HOST: [hpid], consts.GUEST: [gpid]}
    args = ((huid, consts.HOST, job_id, h_table, roles),
            (guid, consts.GUEST, job_id, g_table, roles))
    # ps = [Process(target=role_jobs, args=arg) for arg in args]
    # for p in ps:
    #     p.start()
    # for p in ps:
    #     p.join()
    # print('processes done')
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = {executor.submit(role_jobs, *arg): arg[1] for arg in args}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print(str(exc))
    intersect_table = consts.GUEST + '_intersect_output_' + job_id
    intersect_namespace = 'fdp_output_namespace'
    eggroll.init(job_id, WORK_MODE)
    table = eggroll.table(
        name=intersect_table,
        namespace=intersect_namespace,
    )
    table_size = table.count()
    return table_size
def setUp(self): eggroll.init("test_random_sampler") self.data = [(i * 10 + 5, i * i) for i in range(100)] self.table = eggroll.parallelize(self.data, include_key=True) self.data_to_trans = [(i * 10 + 5, i * i * i) for i in range(100)] self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
def setUp(self):
    # use default setting
    eggroll.init("123")
    logistic_param = LogisticParam()
    self.model = BaseLogisticRegression(logistic_param)
    self.model.header = []
    self.data_instance = self.__prepare_data()
def test_plain_lr():
    from sklearn.datasets import make_moons
    import functools

    # Change the flow_id, otherwise the in-memory table may be overwritten.
    eggroll.init(mode=0)
    ns = str(uuid.uuid1())
    X = eggroll.table('testX7', ns, partition=2)
    Y = eggroll.table('testY7', ns, partition=2)

    b = np.array([0])
    eta = 1.2
    max_iter = 10

    total_num = 500
    _x, _y = make_moons(total_num, noise=0.25, random_state=12345)
    for i in range(np.shape(_y)[0]):
        X.put(i, _x[i])
        Y.put(i, _y[i])
    print(len([y for y in Y.collect()]))

    current_milli_time = lambda: int(round(time.time() * 1000))
    start = current_milli_time()

    # shape_w = [1, np.shape(_x)[1]]
    shape_w = [np.shape(_x)[1]]
    w = np.ones(shape_w)
    print(w)

    X = TensorInEgg(None, None, X)
    Y = TensorInEgg(None, None, Y)
    w = TensorInPy(None, None, w)
    b = TensorInPy(None, None, b)

    # lr = LR(shape_w)
    # lr.train(X, Y)
    itr = 0
    while itr < max_iter:
        H = 1.0 / (1 + ((X @ w + b) * -1).map(np.exp))
        R = H - Y
        gradient_w = (R * X).sum() / total_num
        gradient_b = R.sum() / total_num
        w = w - eta * gradient_w
        b = b - eta * gradient_b
        print("aaa", w, b)
        # self.plot(itr)
        itr += 1
    print("train total time: {}".format(current_milli_time() - start))

    _x_test, _y_test = make_moons(50, random_state=12345)
    _x_test = TensorInPy(None, None, _x_test)
    y_pred = 1.0 / (1 + ((_x_test @ w + b) * -1).map(np.exp))
    from sklearn import metrics
    auc = metrics.roc_auc_score(_y_test, y_pred.store.reshape(50))
    print("auc: {}".format(auc))
def setUp(self): eggroll.init("test_instance") dense_inst = [] dense_not_inst = [] headers = ['x' + str(i) for i in range(20)] self.header = headers self.eps = 1e-5 self.count = 100 self.dense_data_transpose = [] for i in range(self.count): features = i % 16 * np.ones(20) inst = Instance(features=features) dense_inst.append((i, inst)) self.dense_data_transpose.append(features) dense_not_inst.append((i, features)) self.dense_inst = dense_inst self.dense_not_inst = dense_not_inst self.dense_data_transpose = np.array(self.dense_data_transpose) self.dense_data_transpose = self.dense_data_transpose.transpose() self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=5) self.dense_not_inst_table = eggroll.parallelize(dense_not_inst, include_key=True, partition=5) self.dense_table.schema = {'header': headers} self.dense_not_inst_table.schema = {'header': headers} col_index = [1, 2, 3] self.col_index = col_index self.summary_obj = MultivariateStatisticalSummary(self.dense_table, col_index, abnormal_list=[None]) self.summary_obj_not_inst = MultivariateStatisticalSummary(self.dense_not_inst_table, col_index, abnormal_list=[None])
def setUp(self):
    self.feature_histogram = FeatureHistogram()
    eggroll.init("test_feature_histogram")
    data_insts = []
    for i in range(1000):
        indices = []
        data = []
        for j in range(10):
            x = random.randint(0, 5)
            if x != 0:
                data.append(x)
                indices.append(j)
        sparse_vec = SparseVector(indices, data, shape=10)
        data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
    self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
    self.data_insts = data_insts
    self.data_bin = eggroll.parallelize(data_insts, include_key=False)

    self.grad_and_hess_list = [(random.random(), random.random()) for i in range(1000)]
    self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

    bin_split_points = []
    for i in range(10):
        bin_split_points.append(np.array([i for i in range(5)]))
    self.bin_split_points = np.array(bin_split_points)
    self.bin_sparse = [0 for i in range(10)]
def import_offline_feature():
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    try:
        if not request_data.get("jobId"):
            return get_json_result(status=2, msg="no job id")
        job_id = request_data.get("jobId")
        job_data = query_job_by_id(job_id=job_id)
        if not job_data:
            return get_json_result(status=3,
                                   msg="cannot find this job id: %s" % request_data.get("jobId", ""))
        response = GetFeature.import_data(request_data, json.loads(job_data[0]["config"]))
        if response.get("status", 1) == 0:
            update_job_by_id(job_id=job_id,
                             update_data={"status": "success",
                                          "end_date": datetime.datetime.now()})
            return get_json_result()
        else:
            return get_json_result(status=1,
                                   msg="request offline feature error: %s" % response.get("msg", ""))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="request offline feature error: %s" % e)
def setUp(self): eggroll.init("test_least_abs_error_loss") self.lae_loss = LeastAbsoluteErrorLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = eggroll.parallelize(self.y_list, include_key=False) self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self): eggroll.init("test_fair_loss") self.log_cosh_loss = LogCoshLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = eggroll.parallelize(self.y_list, include_key=False) self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self): eggroll.init("test_cross_entropy") self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss() self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = eggroll.parallelize(self.y_list, include_key=False) self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self): eggroll.init("test_huber_loss") self.delta = 1 self.huber_loss = HuberLoss(self.delta) self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = eggroll.parallelize(self.y_list, include_key=False) self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self): eggroll.init("test_fair_loss") self.rho = 0.5 self.tweedie_loss = TweedieLoss(self.rho) self.y_list = [i % 2 for i in range(100)] self.predict_list = [random.random() for i in range(100)] self.y = eggroll.parallelize(self.y_list, include_key=False) self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def setUp(self): eggroll.init("test_label_checker") self.small_label_set = [i % 5 for i in range(100)] self.classify_y = eggroll.parallelize(self.small_label_set, include_key=False) self.regression_label = [random.random() for i in range(100)] self.regression_y = eggroll.parallelize(self.regression_label) self.classify_checker = ClassifyLabelChecker() self.regression_checker = RegressionLabelChecker()
def predict(gid, g_table, gy_id, hid, h_table, hy_id, model_name):
    job_id = gen_job_id(hid, gid)
    run_jobs(gid, g_table, gy_id, hid, h_table, hy_id, job_id, 'predict', model_name)
    eggroll.init(job_id, WORK_MODE)
    predict_output = consts.GUEST + '_predict_table_' + job_id
    table = eggroll.table(predict_output, PREDICT_NAMESPACE)
    result = list(table.collect())
    items = [(a, b, c) for a, (b, c, d) in result]
    return pd.DataFrame.from_records(items, columns=['id', 'label', 'prob'])
def import_id():
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info", table_name_space,
                                        partition=10,
                                        create_if_missing=True,
                                        error_if_exist=False)
        if request_data.get("rangeStart") == 0:
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id, table_name_space,
                                   partition=50,
                                   create_if_missing=True,
                                   error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")
        if request_data.get("rangeEnd") and request_data.get("total") \
                and (request_data.get("total") - request_data.get("rangeEnd") == 1):
            # end
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(
                    data_id,
                    json.dumps({"salt": request_data.get("salt"),
                                "saltMethod": request_data.get("saltMethod")}))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                logger.info("import id success, dtable name is {}, namespace is {}".format(
                    data_id, table_name_space))
                # TODO: destroy DTable, should use a lock
                old_data_table = eggroll.table(old_data_id, table_name_space,
                                               partition=50,
                                               create_if_missing=True,
                                               error_if_exist=False)
                old_data_table.destroy()
                id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
def setUp(self): eggroll.init("test_stratified_sampler") self.data = [] self.data_to_trans = [] for i in range(1000): self.data.append((i, Instance(label=i % 4, features=i * i))) self.data_to_trans.append((i, Instance(features = i ** 3))) self.table = eggroll.parallelize(self.data, include_key=True) self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
def setUp(self): eggroll.init("test_cross_entropy") self.softmax_loss = SoftmaxCrossEntropyLoss() self.y_list = [i % 5 for i in range(100)] self.predict_list = [ np.array([random.random() for i in range(5)]) for j in range(100) ] self.y = eggroll.parallelize(self.y_list, include_key=False) self.predict = eggroll.parallelize(self.predict_list, include_key=False)
def query_model_version_history():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        eggroll.init(mode=WORK_MODE)
        history = version_history(data_table_namespace=config.get("namespace"))
        return get_json_result(msg=json.dumps(history))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="load model error: %s" % e)
def _init_argument(self):
    self._initialize(config_path)
    with open(config_path) as conf_f:
        runtime_json = json.load(conf_f)
    LOGGER.debug("The Guest job id is {}".format(job_id))
    LOGGER.debug("The Guest work mode id is {}".format(self.workflow_param.work_mode))
    eggroll.init(job_id, self.workflow_param.work_mode)
    federation.init(job_id, runtime_json)
    LOGGER.debug("Finish eggroll and federation init")
def setUp(self): eggroll.init("test_dataio" + str(int(time.time()))) self.table = "dataio_table_test" self.namespace = "dataio_test" table = eggroll.parallelize([("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")], include_key=True) table.save_as(self.table, self.namespace) self.table2 = "dataio_table_test2" self.namespace2 = "dataio_test2" table2 = eggroll.parallelize([("a", '-1,,NA,NULL,null,2')], include_key=True) table2.save_as(self.table2, self.namespace2)
def init(job_id, runtime_conf, mode, server_conf_path="arch/conf/server_conf.json"):
    # Validate mode before using it to initialize eggroll, so the guard can actually fire.
    if mode is None:
        raise EnvironmentError("eggroll should be initialized before fate_script")
    eggroll.init(job_id, mode)
    print("runtime_conf: {}".format(runtime_conf))
    all_checker = AllChecker(runtime_conf)
    all_checker.check_all()
    with open(runtime_conf) as conf_p:
        runtime_json = json.load(conf_p)
    if mode == WorkMode.STANDALONE:
        RuntimeInstance.FEDERATION = standalone_fate_script.init(job_id=job_id,
                                                                 runtime_conf=runtime_json)
    else:
        RuntimeInstance.FEDERATION = cluster_fate_script.init(job_id=job_id,
                                                              runtime_conf=runtime_json,
                                                              server_conf_path=server_conf_path)
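# A hedged example call, assuming standalone mode; the job id and runtime
# config path below are hypothetical placeholders.
init("fate_script_job_demo", "example_runtime_conf.json", WorkMode.STANDALONE)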
def setUp(self): eggroll.init("123") self.data_num = 1000 self.feature_num = 200 final_result = [] for i in range(self.data_num): tmp = i * np.ones(self.feature_num) inst = Instance(inst_id=i, features=tmp, label=0) tmp = (str(i), inst) final_result.append(tmp) table = eggroll.parallelize(final_result, include_key=True, partition=3) self.table = table
def setUp(self):
    # for test_aggregate_add
    eggroll.init("test_hetero_federated_aggregator")
    self.size = 10
    self.table_a = eggroll.parallelize(range(self.size))
    self.table_b = eggroll.parallelize(list(range(self.size)))
    self.add_a_b = [i * 2 for i in range(self.size)]

    # for test_aggregate_mean
    self.table_d_tuple = eggroll.parallelize([(i, i + 1) for i in range(self.size)])
    self.reduce_a = np.sum(list(range(self.size))) / self.size * 1.0
    self.reduce_d_tuple = [np.sum(list(range(self.size))) / self.size * 1.0,
                           np.sum(list(range(self.size + 1))) / self.size * 1.0]

    # for test_separate
    self.separate_data = list(range(self.size))
    self.separate_size_list = [int(0.1 * self.size), int(0.2 * self.size),
                               int(0.3 * self.size), int(0.4 * self.size)]
    self.separate_result = []
    cur_index = 0
    for i in range(len(self.separate_size_list)):
        self.separate_result.append(
            self.separate_data[cur_index: cur_index + self.separate_size_list[i]])
        cur_index += self.separate_size_list[i]

    # for test_aggregate_add_square
    this_size = 10000
    list_a = [random.randint(0, 1000) for _ in range(this_size)]
    list_b = [random.randint(0, 1000) for _ in range(this_size)]
    self.table_list_a = eggroll.parallelize(list_a)
    self.table_list_b = eggroll.parallelize(list_b)
    self.table_list_a_square = eggroll.parallelize([np.square(i) for i in list_a])
    self.table_list_b_square = eggroll.parallelize([np.square(i) for i in list_b])
    self.list_add_square_result = list(
        np.sort(np.array([np.square(i + j) for (i, j) in zip(list_a, list_b)])))