def test_plain_lr():
    """Train a plain logistic regression on two-moons data and print its AUC.

    Loads 500 generated samples into two eggroll tables, runs 10 gradient
    steps with learning rate 1.2 on tensor wrappers around those tables,
    prints the elapsed training time, then scores 50 freshly generated
    samples and prints the ROC AUC.
    """
    from sklearn import metrics
    from sklearn.datasets import make_moons

    def current_milli_time():
        # Wall-clock time in whole milliseconds.
        return int(round(time.time() * 1000))

    # Use a unique namespace per run (i.e. change the flow id); otherwise the
    # in-memory tables may be overwritten.
    eggroll.init(mode=0)
    ns = str(uuid.uuid1())

    X = eggroll.table('testX7', ns, partition=2)
    Y = eggroll.table('testY7', ns, partition=2)

    b = np.array([0])
    eta = 1.2
    max_iter = 10
    total_num = 500
    test_num = 50

    _x, _y = make_moons(total_num, noise=0.25, random_state=12345)
    for i in range(np.shape(_y)[0]):
        X.put(i, _x[i])
        Y.put(i, _y[i])

    # Sanity check: number of labels actually stored.
    print(sum(1 for _ in Y.collect()))

    start = current_milli_time()
    shape_w = [np.shape(_x)[1]]
    w = np.ones(shape_w)
    print(w)

    X = TensorInEgg(None, None, X)
    Y = TensorInEgg(None, None, Y)
    w = TensorInPy(None, None, w)
    b = TensorInPy(None, None, b)

    for _ in range(max_iter):
        # Sigmoid of the linear score. The former `H = 1 / X` statement was
        # overwritten immediately and only cost an extra pass over the table.
        H = 1.0 / (1 + ((X @ w + b) * -1).map(np.exp))
        R = H - Y

        gradient_w = (R * X).sum() / total_num
        gradient_b = R.sum() / total_num
        w = w - eta * gradient_w
        b = b - eta * gradient_b
        print("aaa", w, b)

    print("train total time: {}".format(current_milli_time() - start))

    _x_test, _y_test = make_moons(test_num, random_state=12345)
    _x_test = TensorInPy(None, None, _x_test)
    y_pred = 1.0 / (1 + ((_x_test @ w + b) * -1).map(np.exp))

    auc = metrics.roc_auc_score(_y_test, y_pred.store.reshape(test_num))
    print("auc: {}".format(auc))
def split_into_guest_host_dtable(X, y, overlap_ratio=0.2, guest_split_ratio=0.5, guest_feature_num=16,
                                 tables_name=None, partition=1):
    """Split samples between a guest table and a host table with a shared overlap.

    The first ``int(len * overlap_ratio)`` rows go to both parties. Of the
    remaining rows, the first ``guest_split_ratio`` share goes to the guest
    and the rest to the host. The guest keeps feature columns
    ``[:guest_feature_num]`` and the host keeps ``[guest_feature_num:]``.

    :param X: 2D feature array, indexed as ``X[row, columns]``
    :param y: 2D label array; column 0 is used as the label
    :param overlap_ratio: fraction of rows shared by both parties
    :param guest_split_ratio: fraction of the non-overlapping rows given to the guest
    :param guest_feature_num: column index at which features are split
    :param tables_name: optional dict with keys ``guest_table_ns``,
        ``guest_table_name``, ``host_table_ns`` and ``host_table_name``
    :param partition: partition count passed to ``table``
    :return: ``(guest_data, host_data, overlap_indexes)``
    """
    data_size = X.shape[0]
    overlap_size = int(data_size * overlap_ratio)
    overlap_indexes = np.array(range(overlap_size))
    guest_size = int((data_size - overlap_size) * guest_split_ratio)
    guest_end = overlap_size + guest_size

    # Defaults; each key is overridden when the caller supplies tables_name.
    names = {
        "guest_table_ns": "guest_table_ns",
        "guest_table_name": "guest_table_name",
        "host_table_ns": "host_table_ns",
        "host_table_name": "host_table_name",
    }
    if tables_name is not None:
        for key in names:
            names[key] = tables_name[key]

    def _row(index, columns):
        # One (key, Instance) pair holding the selected feature columns.
        features = X[index, columns].reshape(1, -1)
        return index, Instance(inst_id=None, weight=1.0, features=features, label=y[index, 0])

    guest_columns = slice(None, guest_feature_num)
    host_columns = slice(guest_feature_num, None)

    guest_rows = [_row(i, guest_columns) for i in range(0, guest_end)]
    guest_data = table(name=names["guest_table_name"], namespace=names["guest_table_ns"], partition=partition)
    guest_data.put_all(guest_rows)

    # Host gets the overlap rows plus everything after the guest-only rows.
    host_indexes = list(range(0, overlap_size)) + list(range(guest_end, len(X)))
    host_rows = [_row(i, host_columns) for i in host_indexes]
    host_data = table(name=names["host_table_name"], namespace=names["host_table_ns"], partition=partition)
    host_data.put_all(host_rows)

    return guest_data, host_data, overlap_indexes
def test_read_guest_host_eggroll_table(self):
    """Check that tables written by the splitter can be re-opened by name.

    Splits random data into guest/host tables under explicit names, then
    opens the same names again and compares the row counts.
    """
    X = np.random.rand(30, 3)
    y = np.random.rand(30, 1)

    tables_name = {
        "guest_table_ns": "guest_table_ns_01",
        "guest_table_name": "guest_table_name_01",
        "host_table_ns": "host_table_ns_01",
        "host_table_name": "host_table_name_01",
    }

    guest_data, host_data, _ = split_into_guest_host_dtable(
        X, y,
        overlap_ratio=0.2,
        guest_split_ratio=0.5,
        guest_feature_num=16,
        tables_name=tables_name)

    expected_guest_size = guest_data.count()
    expected_host_size = host_data.count()

    # Re-open both tables purely by name/namespace.
    reopened_guest = table(tables_name["guest_table_name"], tables_name["guest_table_ns"])
    reopened_host = table(tables_name["host_table_name"], tables_name["host_table_ns"])
    actual_guest_size = reopened_guest.count()
    actual_host_size = reopened_host.count()

    assert expected_guest_size == actual_guest_size
    assert expected_host_size == actual_host_size
def import_id():
    """Import one chunk of an id library from the JSON request body.

    The payload is read with the keys ``rangeStart``, ``rangeEnd``, ``total``,
    ``ids``, ``salt`` and ``saltMethod``. A chunk with ``rangeStart == 0``
    starts a new data table; later chunks append to the table recorded under
    ``tmp_data_id``. When the chunk is the last one
    (``total - rangeEnd == 1``) the stored row count is checked against
    ``total``. On a match the new table becomes the active one
    (``use_data_id``) and the previous table is destroyed; otherwise the new
    table is destroyed.

    :return: ``get_json_result()`` on success, ``get_json_result(2, ...)`` on a
        count mismatch, ``get_json_result(1, ...)`` on any exception
    """
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info", table_name_space, partition=10,
                                        create_if_missing=True, error_if_exist=False)
        # The payload and the table are read with .get(); the original
        # `.request(...)` calls always fell into the except branch below.
        if request_data.get("rangeStart") == 0:
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")

        data_table = eggroll.table(data_id, table_name_space, partition=50,
                                   create_if_missing=True, error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")

        range_end = request_data.get("rangeEnd")
        total = request_data.get("total")
        if range_end and total and (total - range_end == 1):
            # Last chunk: verify the import is complete before activating it.
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(
                    data_id,
                    json.dumps({
                        "salt": request_data.get("salt"),
                        "saltMethod": request_data.get("saltMethod")
                    }))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                # Format eagerly: the message uses {} placeholders, which lazy
                # %-style logging arguments would not fill in.
                logger.info(
                    "import id success, dtable name is {}, namespace is {}".format(
                        data_id, table_name_space))

                # TODO: destroy DTable, should be use a lock
                # NOTE(review): presumably get() yields None when no library
                # was active before; skip the cleanup in that case - confirm.
                if old_data_id is not None:
                    old_data_table = eggroll.table(old_data_id, table_name_space, partition=50,
                                                   create_if_missing=True, error_if_exist=False)
                    old_data_table.destroy()
                    id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(
                    2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
def test_destroy_table(self):
    """Check that destroying a table through a second handle empties it."""
    table_name = "table_name"
    table_ns = "table_ns"
    row_count = 10
    expect_data = np.random.rand(row_count, 10)

    dtable = create_table(expect_data,
                          model_table_name=table_name,
                          model_namespace=table_ns,
                          persistent=True)

    # A second handle opened by name must see the same rows.
    second_handle = table(name=table_name, namespace=table_ns)
    assert dtable.count() == second_handle.count()

    second_handle.destroy()

    # Re-opening the name after destroy must give an empty table.
    third_handle = table(name=table_name, namespace=table_ns)
    assert third_handle.count() == 0
def show_embedding():
    """Print cosine similarities between host and guest node embeddings.

    Prints the guest embedding count, then the host/guest similarity for the
    first five keys of the ``common_nodes`` table, and finally the similarity
    between host node 8 and guest node 12.
    """
    host_embedding = eggroll.table('host', 'node_embedding', persistent=True)
    guest_embedding = eggroll.table('guest', 'node_embedding', persistent=True)
    print(guest_embedding.count())

    common_table = eggroll.table("common_nodes", "common_nodes", persistent=True)
    node_keys = common_table.take(common_table.count(), keysOnly=True)

    for node in node_keys[0:5]:
        node = int(node)
        sim = cos_sim(host_embedding.get(node), guest_embedding.get(node))
        print("node: {}, sim: {}".format(node, sim))

    # Fixed pair of different nodes, printed for comparison.
    print(cos_sim(host_embedding.get(8), guest_embedding.get(12)))
def gen_data_instance(self, table_name, namespace):
    """Return the guest data table, either read from eggroll or built from file.

    When the data-model parameters say ``is_read_table``, the table named by
    the arguments is opened directly. Otherwise guest/host tables are built
    from ``file_path`` under uniquely suffixed names and only the guest table
    is returned.

    :param table_name: table to open when reading an existing table
    :param namespace: namespace of that table
    :return: the guest data table
    """
    data_model = self._get_data_model_param()
    if data_model.is_read_table:
        return eggroll.table(table_name, namespace)

    file_path = data_model.file_path
    overlap_ratio = data_model.overlap_ratio
    guest_split_ratio = data_model.guest_split_ratio
    guest_feature_num = data_model.n_feature_guest
    num_samples = data_model.num_samples
    balanced = data_model.balanced

    # The arguments are not used on this path; names are derived from the file.
    namespace, table_name = generate_table_namespace_n_name(file_path)
    suffix = "_" + str(uuid.uuid1())
    tables_name = {
        "guest_table_ns": "guest_" + namespace + suffix,
        "guest_table_name": "guest_" + table_name + suffix,
        "host_table_ns": "host_" + namespace + suffix,
        "host_table_name": "host_" + table_name + suffix,
    }

    guest_data, _host_data = load_guest_host_dtable_from_UCI_Credit_Card(
        file_path=file_path,
        num_samples=num_samples,
        tables_name=tables_name,
        overlap_ratio=overlap_ratio,
        guest_split_ratio=guest_split_ratio,
        guest_feature_num=guest_feature_num,
        balanced=balanced)
    return guest_data
def data_to_eggroll_table(data, namespace, table_name,partition=1, work_mode=0):
    """Initialise eggroll, store ``data`` in a table and print a short summary.

    :param data: key/value pairs handed to ``put_all``
    :param namespace: namespace of the target table
    :param table_name: name of the target table
    :param partition: partition count used when creating the table
    :param work_mode: mode passed to ``eggroll.init``
    """
    eggroll.init(mode=work_mode)
    target = eggroll.table(
        table_name,
        namespace,
        partition=partition,
        create_if_missing=True,
        error_if_exist=False,
    )
    target.put_all(data)

    print("------------load data finish!-----------------")
    stored_rows = target.count()
    print("total data_count:"+str(stored_rows))
    print("namespace:%s, table_name:%s" %(namespace, table_name))
def get_table_count(name, namespace):
    """Initialise eggroll, then print and return the row count of a table.

    :param name: table name
    :param namespace: table namespace
    :return: the value of ``count()`` on that table
    """
    from arch.api import eggroll

    eggroll.init("get_intersect_output", mode=1)
    count = eggroll.table(name, namespace).count()
    print("table count:{}".format(count))
    return count
def _distributed_negative_sampling_dst(self, adj_instances, src=consts.HOST, dst=consts.GUEST):
    """Draw a local negative sample for every id received from the other party.

    Receives the list of negative-sample ids from ``src`` through federation,
    builds a negative-sampling distribution from ``adj_instances`` and stores
    one sampled entry per received id in a fresh non-persistent table.

    :param adj_instances: input passed to ``NeighborsSampling.generate_nega_distribution``
    :param src: party that sent the ids, ``consts.HOST`` or ``consts.GUEST``
    :param dst: receiving party; must be the opposite of ``src``
    :return: table mapping each received id to ``(sampled_item, -1)``
    :raises NameError: if ``src``/``dst`` are not a valid host/guest pair
    """
    if src == consts.HOST:
        if dst != consts.GUEST:
            raise NameError("if src is host, then dst should be guest!!!")
        nega_ids_transfer = self.transfer_variable.host_neg_samp_ids
    elif src == consts.GUEST:
        if dst != consts.HOST:
            raise NameError("if src is guest, then dst should be host!!!")
        nega_ids_transfer = self.transfer_variable.guest_neg_samp_ids
    else:
        raise NameError("src should be choose from {host, guest}")

    distributed_negative_ids = federation.get(
        name=nega_ids_transfer.name,
        tag=self.transfer_variable.generate_transferid(nega_ids_transfer),
        idx=0)
    LOGGER.info("Get distributed nagative samples from {}".format(src))
    # Preview at most ten ids; slicing avoids an IndexError on short lists.
    for preview_id in distributed_negative_ids[:10]:
        LOGGER.info("id:{}".format(preview_id))

    # Sample some negative samples.
    distribution = NeighborsSampling.generate_nega_distribution(adj_instances)
    sampler = DiscreteDistributionSampler([data[1] for data in distribution])

    distributed_negative_instances_dst = eggroll.table(
        name=dst + eggroll.generateUniqueId(),
        namespace='neighbors_sampling/distributed_sampling',
        persistent=False)
    for sample_id in distributed_negative_ids:
        index = sampler.sampling()
        distributed_negative_instances_dst.put(sample_id, (distribution[index][0], -1))

    logDtableInstances(LOGGER, distributed_negative_instances_dst, isInstance=False)
    return distributed_negative_instances_dst
def feed_into_dtable(ids, X, y, sample_range, feature_range, tables_name=None, partition=1):
    """
    Build an eggroll table from a row range and a column range of the data.

    parameters
    ----------
    :param ids: 1D numpy array; ``ids[i]`` is both the table key and the instance id
    :param X: 2D numpy array of features
    :param y: 2D numpy array of labels; column 0 is used
    :param sample_range: ``(start, stop)`` row range to store
    :param feature_range: ``(start, stop)`` column range of ``X`` to store
    :param tables_name: optional dict with keys ``table_ns`` and ``table_name``;
        when omitted a default namespace and a timestamp name are used
    :param partition: number of partitions used when creating the dtable
    :return: an eggroll dtable
    """
    table_ns, table_name = "default_table_namespace", get_timestamp()
    if tables_name is not None:
        table_ns = tables_name["table_ns"]
        table_name = tables_name["table_name"]

    first_row, last_row = sample_range[0], sample_range[1]
    sample_list = [
        (ids[row],
         Instance(inst_id=ids[row],
                  features=X[row, feature_range[0]:feature_range[1]],
                  label=y[row, 0]))
        for row in range(first_row, last_row)
    ]

    data_table = table(name=table_name, namespace=table_ns, partition=partition)
    data_table.put_all(sample_list)
    return data_table
def save_data(kv_data: Iterable, name, namespace, partition=1, create_if_missing=True, error_if_exist=False,
              version_log=None):
    """
    Store key/value data in a data table and record a version entry for it.

    :param kv_data: iterable of key/value pairs handed to ``put_all``
    :param name: table name of data table
    :param namespace: table namespace of data table
    :param partition: number of partition
    :param create_if_missing: forwarded to ``eggroll.table``
    :param error_if_exist: forwarded to ``eggroll.table``
    :param version_log: version message; an automatic timestamped message is
        used when empty
    :return: data table instance
    """
    table_options = dict(
        name=name,
        namespace=namespace,
        partition=partition,
        create_if_missing=create_if_missing,
        error_if_exist=error_if_exist,
    )
    data_table = eggroll.table(**table_options)
    data_table.put_all(kv_data)

    if not version_log:
        version_log = "[AUTO] save data at %s." % datetime.datetime.now()
    control.save_version(name=name, namespace=namespace, version_log=version_log)
    return data_table
def table(name: str, namespace: str, partition: int = 1, persistent: bool = True, create_if_missing: bool = True,
          error_if_exist: bool = False, in_place_computing: bool = False):
    """Forward every argument unchanged to ``eggroll.table`` and return its result."""
    return eggroll.table(
        name=name,
        namespace=namespace,
        partition=partition,
        persistent=persistent,
        create_if_missing=create_if_missing,
        error_if_exist=error_if_exist,
        in_place_computing=in_place_computing,
    )
def show_distributed_samples(topk):
    """Print anchor/target sample pairs from the distributed-samples tables.

    Pairs the first ``topk + 10`` anchor rows with the target rows in
    collection order and prints the key and value of each.

    :param topk: base number of rows to show (ten more are included)
    """
    anchor_table = eggroll.table(
        'anchor', "neighbors_samples/distributed_samples/host", persistent=True)
    target_table = eggroll.table(
        'target', "neighbors_samples/distributed_samples/guest", persistent=True)

    anchor_rows = list(anchor_table.collect())
    target_rows = list(target_table.collect())

    line = "sample_id: {}, anchor:{} sample_id: {}, target:{}"
    limit = topk + 10
    for anchor_row, target_row in zip(anchor_rows[:limit], target_rows):
        print(line.format(anchor_row[0], anchor_row[1], target_row[0], target_row[1]))
def save_model(buffer_type, proto_buffer, name, namespace, version_log=None):
    """Store a serialized protobuf in the model table and record a version entry.

    :param buffer_type: key under which the serialized buffer is stored
    :param proto_buffer: object providing ``SerializeToString()``
    :param name: model table name
    :param namespace: model table namespace
    :param version_log: version message; an automatic timestamped message is
        used when empty
    """
    model_table = eggroll.table(
        name=name,
        namespace=namespace,
        partition=get_model_table_partition_count(),
        create_if_missing=True,
        error_if_exist=False,
    )
    # todo: model slice?
    payload = proto_buffer.SerializeToString()
    model_table.put(buffer_type, payload, use_serialize=False)

    if not version_log:
        version_log = "[AUTO] save model at %s." % datetime.datetime.now()
    version_control.save_version(name=name, namespace=namespace, version_log=version_log)
def load_eval_result(self):
    """Open and return the evaluation output table named in the workflow params."""
    params = self.workflow_param
    eval_data = eggroll.table(name=params.evaluation_output_table,
                              namespace=params.evaluation_output_namespace)
    LOGGER.debug("Evaluate result loaded: {}".format(eval_data))
    return eval_data
def get_cross_size(gpid, guid, g_table, hpid, huid, h_table):
    """Run the host and guest role jobs in parallel and return the intersect size.

    Both ``role_jobs`` calls run in separate worker processes. Exceptions
    raised by either job are printed, not propagated. Afterwards the guest's
    intersect output table for this job id is opened and its row count
    returned.

    :return: ``count()`` of the guest intersect output table
    """
    job_id = gen_job_id(hpid, gpid)
    roles = {consts.HOST: [hpid], consts.GUEST: [gpid]}
    host_args = (huid, consts.HOST, job_id, h_table, roles)
    guest_args = (guid, consts.GUEST, job_id, g_table, roles)

    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # Map each future to the role it runs (second element of its args).
        futures = {}
        for job_args in (host_args, guest_args):
            futures[executor.submit(role_jobs, *job_args)] = job_args[1]

        for finished in concurrent.futures.as_completed(futures):
            try:
                finished.result()
            except Exception as exc:
                print(str(exc))

    eggroll.init(job_id, WORK_MODE)
    intersect_output = eggroll.table(
        name=consts.GUEST + '_intersect_output_' + job_id,
        namespace='fdp_output_namespace',
    )
    return intersect_output.count()
def get_commit_tmp_table(data_table_namespace):
    """Open (creating if missing) the table named after a data namespace in ``version_tmp``.

    :param data_table_namespace: used as the table name inside the
        ``version_tmp`` namespace
    :return: the table handle
    """
    return eggroll.table(
        name=data_table_namespace,
        namespace="version_tmp",
        partition=1,
        create_if_missing=True,
        error_if_exist=False,
    )
def get_id_library_table_name():
    """Return the value stored under ``use_data_id`` in the id-library info table."""
    info_table = eggroll.table(
        'info',
        'id_library',
        partition=10,
        create_if_missing=True,
        error_if_exist=False,
    )
    return info_table.get("use_data_id")
def show_local_samples(name, namespace, topk=5):
    """Print the first and the last ``topk`` rows of a persistent table.

    :param name: table name
    :param namespace: table namespace
    :param topk: number of rows taken from each end of the collected list
    """
    rows = list(eggroll.table(name, namespace, persistent=True).collect())

    # Head slice first, then tail slice; rows may repeat when the table is small.
    for row in rows[:topk] + rows[-topk:]:
        print("sample_id: {}, training pairs:{}".format(row[0], row[1]))
def load_model(self, model_table, model_namespace):
    """Copy model metadata from the first row of the model table onto ``self``.

    :param model_table: name of the model table
    :param model_namespace: namespace of the model table
    """
    LOGGER.info("load model")
    rows = list(eggroll.table(model_table, model_namespace).collect())
    # The metadata object is the value of the first collected (key, value) row.
    modelmeta = rows[0][1]

    for attr in ("task_type", "loss_type", "tree_dim", "trees_"):
        setattr(self, attr, getattr(modelmeta, attr))
def save_data_to_eggroll_table(data, namespace, table_name, partition=1):
    """Store ``data`` in a table opened with ``error_if_exist=True`` and return it.

    :param data: key/value pairs handed to ``put_all``
    :param namespace: namespace of the table
    :param table_name: name of the table
    :param partition: partition count used when creating the table
    :return: the table handle
    """
    target = table(
        table_name,
        namespace,
        partition=partition,
        create_if_missing=True,
        error_if_exist=True,
    )
    target.put_all(data)
    return target
def predict(gid, g_table, gy_id, hid, h_table, hy_id, model_name):
    """Run a predict job and return the guest's predictions as a DataFrame.

    :return: DataFrame with columns ``id``, ``label`` and ``prob``, built from
        the key and the first two elements of each three-element value in the
        guest predict table
    """
    job_id = gen_job_id(hid, gid)
    run_jobs(gid, g_table, gy_id, hid, h_table, hy_id, job_id, 'predict', model_name)

    eggroll.init(job_id, WORK_MODE)
    output_name = consts.GUEST + '_predict_table_' + job_id
    rows = list(eggroll.table(output_name, PREDICT_NAMESPACE).collect())

    records = []
    for sample_id, (label, prob, _unused) in rows:
        records.append((sample_id, label, prob))
    return pd.DataFrame.from_records(records, columns=['id', 'label', 'prob'])
def get_data_table(name, namespace):
    """
    Open a data table by name and namespace without creating it.

    :param name: table name of data table
    :param namespace: table name space of data table
    :return: whatever ``eggroll.table`` returns for ``create_if_missing=False``
    """
    handle = eggroll.table(name=name, namespace=namespace, create_if_missing=False)
    return handle
def get_lr_y_table(file_path):
    """Load labels from a CSV file into an eggroll table wrapped as a tensor.

    Column 0 of each row is used as the key and column 1 as the label. A
    label equal to 1 is stored as 1, anything else as -1.

    :param file_path: CSV file that must contain a ``y`` column
    :return: ``TensorInEgg`` wrapping the label table
    :raises RuntimeError: if the CSV has no ``y`` column
    """
    # Read the file once and validate it before any table is created, so a
    # bad input does not leave an orphan persistent table behind.
    csv_table = pd.read_csv(file_path)
    if 'y' not in list(csv_table.columns.values):
        raise RuntimeError("input data must contain y column")
    data = csv_table.values

    ns = str(uuid.uuid1())
    y = eggroll.table('fata_script_test_data_y_' + str(RuntimeInstance.FEDERATION.role)
                      + str(RuntimeInstance.FEDERATION.job_id),
                      ns,
                      partition=2, persistent=True)
    for row in data:
        y.put(row[0], 1 if row[1] == 1 else -1)
    return TensorInEgg(RuntimeInstance.FEDERATION.encrypt_operator, None, y)
def show_result(table, namespace, rows=10):
    """Print the row count of a result table and the features of its first rows.

    Features are only printed when the table holds more than 10 rows. At most
    ``rows`` records are shown, and fewer if the table runs out first.

    :param table: table name
    :param namespace: table namespace
    :param rows: maximum number of records to print
    """
    from itertools import islice

    result = eggroll.table(table, namespace)
    count = result.count()
    print('data count: {}'.format(count))
    if count > 10:
        # islice stops cleanly when fewer than `rows` records exist, where a
        # manual __next__() loop would raise StopIteration.
        for record in islice(result.collect(), max(rows, 0)):
            print("predict result: {}".format(record[1].features))
def get_lr_x_table(file_path):
    """Load feature rows from a CSV file into an eggroll table wrapped as a tensor.

    Column 0 of each row is used as the key. Features start at column 2 when
    the CSV has a ``y`` column and at column 1 otherwise.

    :param file_path: CSV file to read
    :return: ``TensorInEgg`` wrapping the feature table
    """
    # Read the file once; header and values come from the same frame.
    csv_table = pd.read_csv(file_path)
    data = csv_table.values
    data_index = 2 if 'y' in list(csv_table.columns.values) else 1

    ns = str(uuid.uuid1())
    # Convert role and job id to str separately, so a non-string role cannot
    # break the concatenation.
    x = eggroll.table('fata_script_test_data_x_' + str(RuntimeInstance.FEDERATION.role)
                      + str(RuntimeInstance.FEDERATION.job_id),
                      ns,
                      partition=2, persistent=True)
    for row in data:
        x.put(row[0], row[data_index:])
    return TensorInEgg(RuntimeInstance.FEDERATION.encrypt_operator, None, x)
def read_model(buffer_type, proto_buffer, name, namespace):
    """Fill ``proto_buffer`` from the bytes stored under ``buffer_type``.

    :param buffer_type: key of the stored buffer
    :param proto_buffer: object providing ``ParseFromString``; updated in place
    :param name: model table name
    :param namespace: model table namespace
    :return: ``True`` after parsing, ``False`` when the table handle is falsy
    """
    model_table = eggroll.table(
        name=name,
        namespace=namespace,
        partition=get_model_table_partition_count(),
        create_if_missing=False,
        error_if_exist=False,
    )
    if not model_table:
        return False

    proto_buffer.ParseFromString(model_table.get(buffer_type, use_serialize=False))
    return True
def read_data(self, table_name, namespace):
    """Open an input table and map every value through ``self.to_instance``.

    The reader settings held on ``self`` are bundled into one list and bound
    as the first argument of ``to_instance``.

    :param table_name: name of the input table
    :param namespace: namespace of the input table
    :return: result of ``mapValues`` on the input table
    """
    input_data = eggroll.table(table_name, namespace)
    LOGGER.info("start to read data and change data to instance")

    converter = functools.partial(
        self.to_instance,
        [
            self.delimitor,
            self.data_type,
            self.missing_fill,
            self.default_value,
            self.with_label,
            self.label_idx,
            self.label_type,
            self.output_format,
        ],
    )
    return input_data.mapValues(converter)
def load_model(self, model_table, model_namespace):
    """Copy model metadata from the first row of the model table onto ``self``.

    After copying, ``set_loss`` is called with the restored loss type.

    :param model_table: name of the model table
    :param model_namespace: namespace of the model table
    """
    LOGGER.info("load model")
    rows = list(eggroll.table(model_table, model_namespace).collect())
    # The metadata object is the value of the first collected (key, value) row.
    modelmeta = rows[0][1]

    # (attribute on self, attribute on the stored metadata)
    field_map = (
        ("task_type", "task_type"),
        ("loss_type", "loss_type"),
        ("tree_dim", "tree_dim"),
        ("num_classes", "num_classes"),
        ("trees_", "trees_"),
        ("classes_", "classes_"),
        ("history_loss", "loss"),
    )
    for target_attr, source_attr in field_map:
        setattr(self, target_attr, getattr(modelmeta, source_attr))

    self.set_loss(self.loss_type)