def jsonToObject():
    """Round-trip a topology manager through persistence and print the result."""
    # Load a previously saved topology manager by its (hard-coded) id.
    manager = load_topology_manager("f1404379d87f34bda07aba3c530bd146")
    GflNode.init_node()
    default_node = GflNode.default_node
    manager.add_client(client_node=default_node, add_into_topology=True)
    print(manager.get_index_by_node(default_node))
    manager.generate_topology()
    # Save under a throwaway id, reload, and print the rebuilt topology.
    save_topology_manager("123", manager)
    reloaded = load_topology_manager("123")
    print(reloaded.topology)
def setUp(self) -> None:
    """Build two trainer schedulers and one aggregator scheduler for one job.

    The job trains for two rounds; the driver loop pops schedulers from a
    queue, runs each one that is ready, and re-queues schedulers that have
    not yet reached the required round count.
    """
    self.dataset = generate_dataset()
    print("生成的dataset_id:" + self.dataset.dataset_id)

    # Three job objects sharing a single job_id, all mounting the same dataset.
    self.job = generate_job()
    print("生成的job_id:" + self.job.job_id)
    self.job.mount_dataset(self.dataset)
    self.job_2 = generate_job()
    self.job_2.job_id = self.job.job_id
    self.job_2.mount_dataset(self.dataset)
    print("生成的job_2_id:" + self.job_2.job_id)
    self.job_3 = generate_job()
    self.job_3.job_id = self.job.job_id
    self.job_3.mount_dataset(self.dataset)
    print("生成的job_3_id:" + self.job_3.job_id)

    # First trainer node: register it as a client of the shared job.
    GflNode.init_node()
    first_node = GflNode.default_node
    self.aggregator_scheduler = JobAggregateScheduler(node=None, job=self.job)
    self.jobTrainerScheduler_1 = JobTrainScheduler(node=first_node, job=self.job_2)
    JobManager.init_job_sqlite(self.job_2.job_id)
    first_client = ClientEntity(self.jobTrainerScheduler_1.node.address,
                                self.jobTrainerScheduler_1.job.dataset.dataset_id,
                                self.jobTrainerScheduler_1.node.pub_key)
    save_client(self.job_2.job_id, client=first_client)
    self.jobTrainerScheduler_1.register()

    # Second trainer node, registered the same way.
    GflNode.init_node()
    second_node = GflNode.default_node
    self.jobTrainerScheduler_2 = JobTrainScheduler(node=second_node, job=self.job_3)
    second_client = ClientEntity(self.jobTrainerScheduler_2.node.address,
                                 self.jobTrainerScheduler_2.job.dataset.dataset_id,
                                 self.jobTrainerScheduler_2.node.pub_key)
    save_client(self.job_3.job_id, client=second_client)
    self.jobTrainerScheduler_2.register()

    # Queue all three schedulers for the round-robin driver.
    self.list = [
        self.aggregator_scheduler,
        self.jobTrainerScheduler_1,
        self.jobTrainerScheduler_2,
    ]
def init(cls, force):
    """Initialize the GFL home directory: config file, node keys, data dirs.

    When the home directory already exists it is wiped only if *force* is
    truthy; otherwise a ValueError is raised so existing state is preserved.
    """
    if os.path.exists(GflConf.home_dir):
        if not force:
            raise ValueError("homedir not empty.")
        # Close log file handles before deleting the tree they write into.
        logging.shutdown()
        shutil.rmtree(GflConf.home_dir)
    # Recreate a fresh home directory.
    os.makedirs(GflConf.home_dir)
    # Write the default configuration file.
    GflConf.generate_config(PathUtils.join(GflConf.home_dir, "config.yaml"))
    # Generate this node's address and key pair.
    GflNode.init_node()
    # Create the data directories.
    Lfs.init()
def setUp(self) -> None:
    """Create a dataset + job and one aggregator/trainer scheduler pair."""
    self.dataset = generate_dataset()
    print("dataset_id:" + self.dataset.dataset_id)
    self.job = generate_job()
    print("job_id:" + self.job.job_id)
    self.job.mount_dataset(self.dataset)

    GflNode.init_node()
    local_node = GflNode.default_node
    # Each scheduler gets its own deep copy of the job so neither can
    # mutate the other's job state.
    self.aggregator_scheduler = JobAggregateScheduler(
        node=None, job=copy.deepcopy(self.job), target_num=1)
    self.jobTrainerScheduler = JobTrainScheduler(
        node=local_node, job=copy.deepcopy(self.job))

    JobManager.init_job_sqlite(self.job.job_id)
    trainer = self.jobTrainerScheduler
    client_entity = ClientEntity(trainer.node.address,
                                 trainer.job.dataset.dataset_id,
                                 trainer.node.pub_key)
    save_client(self.job.job_id, client=client_entity)
def run(cls, role, console, **kwargs):
    """Daemonize this process and run the node manager(s) plus HTTP listener.

    role: role passed to NodeManager in non-standalone mode.
    console: when truthy, the original (pre-fork) parent drops into the
        interactive Shell after daemonizing.
    NOTE(review): the nesting below is reconstructed from a flattened
    source line; the parent/child split follows the daemoniker Daemonizer
    convention (parent exits at the end of the `with` block, the daemon
    child continues after it) — confirm against the original file.
    """
    # Silence stderr during daemonization; real output is redirected to
    # the console_out/console_err files below.
    sys.stderr = open(os.devnull, "w")
    cls.logger = logging.getLogger("gfl")
    with Daemonizer() as (is_setup, daemonizer):
        main_pid = None
        if is_setup:
            # First pass (before forking): remember the original pid so the
            # parent branch can be recognized after daemonizer() returns.
            main_pid = os.getpid()
        pid_file = PathUtils.join(GflConf.home_dir, "proc.lock")
        stdout_file = PathUtils.join(GflConf.logs_dir, "console_out")
        stderr_file = PathUtils.join(GflConf.logs_dir, "console_err")
        is_parent = daemonizer(pid_file, stdout_goto=stdout_file, stderr_goto=stderr_file)
        if is_parent:
            # Parent-only path: optionally start the interactive shell in
            # the process the user launched (not in re-spawned parents).
            if console and main_pid == os.getpid():
                Shell.startup()
    # From here on we are in the daemonized child process.
    GflNode.load_node()
    if GflConf.get_property("net.mode") == "standalone":
        # Standalone mode simulates a server plus N clients in one process.
        client_number = GflConf.get_property(
            "net.standalone.client_number")
        # Top up the pre-generated standalone nodes to client_number.
        for _ in range(len(GflNode.standalone_nodes), client_number):
            GflNode.add_standalone_node()
        ManagerHolder.default_manager = NodeManager(
            node=GflNode.default_node, role="server")
        for i in range(client_number):
            client_manager = NodeManager(node=GflNode.standalone_nodes[i], role="client")
            ManagerHolder.standalone_managers.append(client_manager)
    else:
        # Networked mode: a single manager with the caller-supplied role.
        ManagerHolder.default_manager = NodeManager(
            node=GflNode.default_node, role=role)
    # cls.__startup_node_managers()
    HttpListener.start()
    # Keep the daemon alive as long as the HTTP listener is running.
    while HttpListener.is_alive():
        time.sleep(2)
def setUp(self) -> None:
    """Register one trainer scheduler and seed the job's initial global model."""
    self.dataset = generate_dataset()
    print("dataset_id:" + self.dataset.dataset_id)
    self.job = generate_job()
    print("job_id:" + self.job.job_id)
    self.job.mount_dataset(self.dataset)

    GflNode.init_node()
    self.jobTrainerScheduler = JobTrainScheduler(node=GflNode.default_node, job=self.job)
    self.jobTrainerScheduler.register()

    # The aggregator expects initial global parameters, so write a randomly
    # initialized model into the current round's global-params directory.
    params_dir = JobPath(self.job.job_id).global_params_dir(self.job.cur_round)
    os.makedirs(params_dir, exist_ok=True)
    params_path = PathUtils.join(params_dir, self.job.job_id + '.pth')
    torch.save(Net().state_dict(), params_path)
def setUp(self) -> None:
    """Build one server and two client managers, plus the job's topology."""
    # Reuse the pre-generated standalone nodes: index 0 serves, 1-2 train.
    GflNode.load_node()
    server_node = GflNode.standalone_nodes[0]
    first_client = GflNode.standalone_nodes[1]
    second_client = GflNode.standalone_nodes[2]
    self.node_manager_server = NodeManager(node=server_node, role="server")
    self.node_manager_client1 = NodeManager(node=first_client, role="client")
    self.node_manager_client2 = NodeManager(node=second_client, role="client")

    # Create the job and a centralized topology describing its participants.
    self.job = generate_job()
    config = TopologyConfig()
    config.with_train_node_num(2)
    config.with_server_nodes([server_node.address])
    config.with_client_nodes([first_client.address, second_client.address])
    config.with_index2node([server_node.address,
                            first_client.address,
                            second_client.address])
    manager = CentralizedTopologyManager(config)
    manager.generate_topology()
    # Persist the topology keyed by the job id.
    save_topology_manager(job_id=self.job.job_id, topology_manager=manager)
def test_start(self):
    """End-to-end run: 1 aggregator + 2 trainers drive one job for two rounds.

    Builds three job copies sharing one job_id, wires three nodes into a
    centralized topology, then drains a scheduler queue until every
    scheduler reports finished.
    """
    # Generate the dataset.
    self.dataset = generate_dataset()
    print("生成的dataset_id:" + self.dataset.dataset_id)
    # Generate three identical jobs (same job_id, same mounted dataset).
    self.job = generate_job()
    print("生成的job_id:" + self.job.job_id)
    self.job.mount_dataset(self.dataset)
    JobManager.init_job_sqlite(self.job.job_id)
    JobManager.submit_job(self.job)
    self.job_2 = generate_job()
    self.job_2.job_id = self.job.job_id
    self.job_2.mount_dataset(self.dataset)
    print("生成的job_2_id:" + self.job_2.job_id)
    self.job_3 = generate_job()
    self.job_3.job_id = self.job.job_id
    self.job_3.mount_dataset(self.dataset)
    print("生成的job_3_id:" + self.job_3.job_id)
    # Create three nodes: one aggregator and two trainers.
    # Aggregator node.
    GflNode.init_node()
    node1 = GflNode.default_node
    # Bind the aggregator node to every job copy.
    self.job.add_server(node1)
    self.job_2.add_server(node1)
    self.job_3.add_server(node1)
    # Topology of 3 participants, built from the job, centered on node1.
    self.tpmgr = CentralizedTopologyManager(n=3, job=self.job, aggregate_node=node1)
    self.aggregator_scheduler = JobAggregateScheduler(node=node1,
                                                      topology_manager=self.tpmgr,
                                                      job=self.job)
    # First trainer node.
    GflNode.init_node()
    node2 = GflNode.default_node
    self.jobTrainerScheduler_1 = JobTrainScheduler(node=node2,
                                                   topology_manager=self.tpmgr,
                                                   job=self.job_2)
    # JobManager.init_job_sqlite(self.job_2.job_id)
    client1 = ClientEntity(self.jobTrainerScheduler_1.node.address,
                           self.jobTrainerScheduler_1.job.dataset.dataset_id,
                           self.jobTrainerScheduler_1.node.pub_key)
    save_client(self.job_2.job_id, client=client1)
    self.jobTrainerScheduler_1.register()
    # Add the first trainer to the topology at index 1.
    self.tpmgr.add_node_into_topology(node2, 1)
    # Second trainer node.
    GflNode.init_node()
    node3 = GflNode.default_node
    self.jobTrainerScheduler_2 = JobTrainScheduler(node=node3,
                                                   topology_manager=self.tpmgr,
                                                   job=self.job_3)
    client2 = ClientEntity(self.jobTrainerScheduler_2.node.address,
                           self.jobTrainerScheduler_2.job.dataset.dataset_id,
                           self.jobTrainerScheduler_2.node.pub_key)
    save_client(self.job_3.job_id, client=client2)
    self.jobTrainerScheduler_2.register()
    # Add the second trainer to the topology at index 2.
    self.tpmgr.add_node_into_topology(node3, 2)
    # Generate the centralized topology.
    self.tpmgr.generate_topology()
    # Queue the schedulers.
    self.list = []
    self.list.append(self.aggregator_scheduler)
    self.list.append(self.jobTrainerScheduler_1)
    self.list.append(self.jobTrainerScheduler_2)
    # Drain the queue: iterate indices backwards so removing the current
    # scheduler does not skip the next candidate.
    while len(self.list) != 0:
        for num in range(len(self.list) - 1, -1, -1):
            scheduler = self.list[num]
            if scheduler.is_finished():
                self.list.remove(scheduler)
            else:
                # Run the scheduler only when it reports it can proceed;
                # otherwise leave it queued for the next sweep.
                if scheduler.is_available():
                    scheduler.start()
                    if scheduler.is_finished():
                        self.list.remove(scheduler)
import os

# Switch the working directory to the project root (the parent of this
# file's directory) BEFORE importing gfl modules, so any relative paths
# they use resolve against the repository root.
work_dir = os.path.dirname(os.path.dirname(__file__))
os.chdir(work_dir)

from gfl.conf import GflConf
from gfl.core.manager.node import GflNode

# Point GFL at the local "data" home directory (relative to the cwd set
# above) and load the persisted node identity/keys from it.
GflConf.home_dir = "data"
GflNode.load_node()
def generate_nodes():
    """Create the default node plus three standalone nodes."""
    GflNode.init_node()
    # Original code issued three identical calls; a loop is equivalent.
    for _ in range(3):
        GflNode.add_standalone_node()