def load_node(cls) -> NoReturn:
    """
    Load the key file in the node directory.

    :return:
    """
    key_dir = PathUtils.join(GflConf.home_dir, "key")
    cls.__load_node(PathUtils.join(key_dir, "key.json"))
def __init__(self, id):
    super(DatasetPath, self).__init__(id)
    self.__root_dir = PathUtils.join(GflConf.data_dir, "dataset", id)
    self.__metadata_file = PathUtils.join(self.__root_dir, "metadata.json")
    self.__config_dir = PathUtils.join(self.__root_dir, "dataset")
    self.__dataset_config_file = PathUtils.join(self.__root_dir, "dataset", "dataset.json")
    self.__module_name = "fl_dataset"
    self.__module_dir = PathUtils.join(self.__root_dir, "dataset")
def init_node(cls) -> NoReturn:
    """
    initialize GFL node
    """
    node = cls.__new_node()
    key_dir = PathUtils.join(GflConf.home_dir, "key")
    os.makedirs(key_dir, exist_ok=True)
    key_file = PathUtils.join(key_dir, "key.json")
    cls.__save_node(node, key_file)
    cls.default_node = node
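__new_node is not shown in this section. Given the secp256k1-style key material in the key.json example further below, a minimal hypothetical sketch follows; the function name new_node and the sha3-256 address derivation are assumptions, not GFL's confirmed scheme:

import hashlib
from ecdsa import SigningKey, SECP256k1

def new_node():
    # Generate a secp256k1 keypair; the hex lengths match the key.json
    # example below (64-char priv_key, 128-char pub_key, 40-char address).
    sk = SigningKey.generate(curve=SECP256k1)
    priv_key = sk.to_string().hex()
    pub_bytes = sk.get_verifying_key().to_string()
    # Address shown as the last 20 bytes of a sha3-256 digest; GFL's real
    # derivation (e.g. keccak-256) may differ.
    address = hashlib.sha3_256(pub_bytes).digest()[-20:].hex()
    return {"address": address, "pub_key": pub_bytes.hex(), "priv_key": priv_key}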
def init_node(cls) -> NoReturn:
    """
    Initialize the GFL node.

    :return:
    """
    cls.__new_node()
    key_dir = PathUtils.join(GflConf.home_dir, "key")
    os.makedirs(key_dir, exist_ok=True)
    key_file = PathUtils.join(key_dir, "key.json")
    cls.__save_node(key_file)
def load_node(cls) -> NoReturn:
    """
    Load the key file in the node directory, and create default_node and standalone_nodes objects
    """
    key_dir = PathUtils.join(GflConf.home_dir, "key")
    cls.default_node = cls.__load_node(PathUtils.join(key_dir, "key.json"))
    for filename in os.listdir(key_dir):
        if filename.startswith("node-"):
            # "node-<idx>.json" -> <idx>, e.g. "node-3.json" -> 3
            node_idx = int(filename[5:-5])
            cls.standalone_nodes[node_idx] = cls.__load_node(
                PathUtils.join(key_dir, filename))
def load_all_job(cls) -> List[Job]:
    job_dir = PathUtils.join(GflConf.data_dir, "job")
    jobs = []
    for filename in os.listdir(job_dir):
        path = PathUtils.join(job_dir, filename)
        if os.path.isdir(path):
            try:
                job = cls.load_job(filename)
                jobs.append(job)
            except Exception:
                # Skip directories that do not hold a loadable job.
                pass
    return jobs
def save_job(cls, job: Job, *, module=None, module_data=None) -> NoReturn:
    """
    Save job

    :param job: job to save
    :param module: job module
    :param module_data: zipped archive bytes of the job module
    """
    job_path = JobPath(job.job_id)
    job_path.makedirs()
    cls.__save_json(job_path.metadata_file, job.metadata)
    cls.__save_json(job_path.job_config_file, job.job_config)
    cls.__save_json(job_path.train_config_file, job.train_config)
    cls.__save_json(job_path.aggregate_config_file, job.aggregate_config)
    if module_data is not None:
        ZipUtils.extract_data(module_data, GflConf.temp_dir)
        ModuleUtils.migrate_module(
            PathUtils.join(GflConf.temp_dir, job.job_id),
            job_path.module_name, job_path.module_dir)
    elif module is not None:
        ModuleUtils.submit_module(module, job_path.module_name, job_path.module_dir)
    else:
        ModuleUtils.submit_module(job.module, job_path.module_name, job_path.module_dir)
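A minimal usage sketch, assuming these classmethods hang off a storage manager class (JobManager is a hypothetical name here) and that the zip archive's top-level directory is the job id, which is what the temp-dir migration above implies:

# Hypothetical caller; Job construction is elided.
with open("fl_model.zip", "rb") as f:
    zipped_module = f.read()
JobManager.save_job(job, module_data=zipped_module)  # unpack an archived module
JobManager.save_job(job, module=my_fl_model_module)  # or submit a loaded module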
def save_dataset(cls, dataset: Dataset, *, module=None, module_data=None) -> NoReturn:
    """
    Save dataset

    :param dataset: dataset to save
    :param module: dataset module
    :param module_data: zipped archive bytes of the dataset module
    """
    dataset_path = DatasetPath(dataset.dataset_id)
    dataset_path.makedirs()
    cls.__save_json(dataset_path.metadata_file, dataset.metadata)
    cls.__save_json(dataset_path.dataset_config_file, dataset.dataset_config)
    if module_data is not None:
        ZipUtils.extract_data(module_data, GflConf.temp_dir)
        ModuleUtils.migrate_module(
            PathUtils.join(GflConf.temp_dir, dataset.dataset_id),
            dataset_path.module_name, dataset_path.module_dir)
    elif module is not None:
        ModuleUtils.submit_module(module, dataset_path.module_name, dataset_path.module_dir)
    else:
        ModuleUtils.submit_module(dataset.module, dataset_path.module_name, dataset_path.module_dir)
def reload(cls):
    """
    Reload readonly parameters from the YAML file.

    :return:
    """
    with open(PathUtils.join(cls.__home_dir, "conf.yaml"), "r") as f:
        cls.readonly_props = yaml.safe_load(f.read())
def init_node(cls) -> NoReturn:
    """
    Initialize the GFL node.

    :return:
    """
    node = cls.__new_node()
    key_dir = PathUtils.join(GflConf.home_dir, "key")  # e.g. /Users/YY/.gfl/key
    os.makedirs(key_dir, exist_ok=True)
    key_file = PathUtils.join(key_dir, "key.json")  # e.g. /Users/YY/.gfl/key/key.json
    cls.__save_node(node, key_file)
    # key.json contains content like:
    # {
    #     "address": "a8C03cEBFc6C11C1707032590adf2ACF4ccAc655",
    #     "pub_key": "d2a95fb211c91f79d052c3c927f51b22893a3b3f7a28090f32d03fc7224bdca0be91173445f71bf1bf91d0fee52ee7c805b7b10dc1b12fa2ed5267b818eb1bc8",
    #     "priv_key": "708d8f67deb461bdf2a3c9c2d82584b8304cbad32398a5ce5706a8e45f5210bf"
    # }
    cls.default_node = node
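A plausible counterpart for reading that file back; the real __load_node is defined elsewhere in the repo, and the GflNode constructor arguments shown are assumptions based on the key.json fields above:

import json

def load_node_from_file(key_file):
    # Rebuild a node object from the persisted key material.
    with open(key_file, "r") as f:
        data = json.load(f)
    return GflNode(address=data["address"],
                   pub_key=data["pub_key"],
                   priv_key=data["priv_key"])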
def run(cls, role, console, **kwargs):
    sys.stderr = open(os.devnull, "w")
    cls.logger = logging.getLogger("gfl")
    with Daemonizer() as (is_setup, daemonizer):
        main_pid = None
        if is_setup:
            main_pid = os.getpid()
        pid_file = PathUtils.join(GflConf.home_dir, "proc.lock")
        stdout_file = PathUtils.join(GflConf.logs_dir, "console_out")
        stderr_file = PathUtils.join(GflConf.logs_dir, "console_err")
        is_parent = daemonizer(pid_file, stdout_goto=stdout_file, stderr_goto=stderr_file)
        if is_parent:
            if console and main_pid == os.getpid():
                Shell.startup()
    GflNode.load_node()
    if GflConf.get_property("net.mode") == "standalone":
        client_number = GflConf.get_property("net.standalone.client_number")
        for _ in range(len(GflNode.standalone_nodes), client_number):
            GflNode.add_standalone_node()
        ManagerHolder.default_manager = NodeManager(node=GflNode.default_node, role="server")
        for i in range(client_number):
            client_manager = NodeManager(node=GflNode.standalone_nodes[i], role="client")
            ManagerHolder.standalone_managers.append(client_manager)
    else:
        ManagerHolder.default_manager = NodeManager(node=GflNode.default_node, role=role)
    # cls.__startup_node_managers()
    HttpListener.start()
    while HttpListener.is_alive():
        time.sleep(2)
def generate_config(cls, path: str = None) -> None:
    """
    Generate a config file at ``path``.

    :param path: the config file path; if None, defaults to './config.yaml'.
    :return:
    """
    if path is None:
        path = "config.yaml"
    src_path = PathUtils.join(PathUtils.src_root_dir(), "resources", "config.yaml")
    shutil.copy(src_path, path)
def load(cls) -> None:
    """
    Load config properties from the disk file.

    :return:
    """
    base_config_path = PathUtils.join(PathUtils.src_root_dir(), "resources", "config.yaml")
    with open(base_config_path) as f:
        cls.__readonly_props = yaml.load(f, Loader=yaml.SafeLoader)
    path = PathUtils.join(cls.home_dir, "config.yaml")
    if os.path.exists(path):
        with open(path) as f:
            config_data = yaml.load(f, Loader=yaml.SafeLoader)
        cls.__readonly_props.update(config_data)
    if os.path.exists(cls.logs_dir):
        cls.load_logging_config()
    else:
        warnings.warn("cannot find logs dir.")
class Log(LogBase):
    debug_filename = PathUtils.join(GflConf.logs_dir, "debug.log")
    info_filename = PathUtils.join(GflConf.logs_dir, "info.log")
    warn_filename = PathUtils.join(GflConf.logs_dir, "warn.log")
    error_filename = PathUtils.join(GflConf.logs_dir, "error.log")

    def __init__(self, name):
        super(Log, self).__init__(name)

    def debug(self, msg, *params):
        pass

    def info(self, msg, *params):
        pass

    def warn(self, msg, *params):
        pass

    def error(self, msg, *params):
        pass
def add_standalone_node(cls) -> NoReturn:
    # Add a single standalone node.
    node = cls.__new_node()
    for i in range(100):  # Cap at 100 mock nodes to prevent an endless loop here.
        if i not in cls.standalone_nodes:
            key_file = PathUtils.join(GflConf.home_dir, "key", "node-%d.json" % i)
            cls.__save_node(node, key_file)
            cls.standalone_nodes[i] = node
            return
    raise ValueError("At most 100 standalone-mode virtual nodes are supported.")
def send_partial_params(cls, client: str, job_id: str, step: int, params) -> NoReturn:
    # The ``client`` parameter is treated as the client address for now.
    # In standalone mode, the trainer saves the model produced in the
    # current training round under the designated path.
    client_params_dir = JobPath(job_id).client_params_dir(step, client)
    os.makedirs(client_params_dir, exist_ok=True)
    # Save with ``<job_id>.pkl`` as the file name.
    path = PathUtils.join(client_params_dir, job_id + '.pkl')
    # path = client_params_dir + 'job_id.pth'
    # torch.save(params, path)
    with open(path, 'wb') as f:
        pickle.dump(params, f)
    print("Training finished; model saved to: " + str(client_params_dir))
def run(cls, console=True, **kwargs):
    sys.stderr = open(os.devnull, "w")
    cls.logger = logging.getLogger("gfl")
    with Daemonizer() as (is_setup, daemonizer):
        main_pid = None
        if is_setup:
            main_pid = os.getpid()
        pid_file = PathUtils.join(GflConf.home_dir, "proc.lock")
        stdout_file = PathUtils.join(GflConf.logs_dir, "console_out")
        stderr_file = PathUtils.join(GflConf.logs_dir, "console_err")
        is_parent = daemonizer(pid_file, stdout_goto=stdout_file, stderr_goto=stderr_file)
        if is_parent:
            if console and main_pid == os.getpid():
                Shell.startup()
    GflNode.load_node()
    HttpListener.start()
    NodeManager.get_instance().run()
def run(cls, **kwargs):
    daemon = kwargs.pop("daemon", False)
    if daemon:
        print("DAEMON")
        with Daemonizer() as (is_setup, daemonizer):
            if is_setup:
                pass
            pid_file = "proc.lock"
            stdout_file = PathUtils.join(GflConf.logs_dir, "console_out")
            stderr_file = PathUtils.join(GflConf.logs_dir, "console_err")
            is_parent = daemonizer(pid_file, stdout_goto=stdout_file, stderr_goto=stderr_file)
            if is_parent:
                pass
    GflConf.reload()
    GflNode.load_node()
    if GflConf.get_property("standalone.enabled"):
        server_number = GflConf.get_property("standalone.server_number")
        client_number = GflConf.get_property("standalone.client_number")
        for _ in range(len(GflNode.standalone_nodes), server_number + client_number):
            GflNode.add_standalone_node()
        for i in range(0, server_number):
            node_manager = NodeManager(node=GflNode.standalone_nodes[i], role="server")
            cls.node_managers.append(node_manager)
        for i in range(server_number, server_number + client_number):
            node_manager = NodeManager(node=GflNode.standalone_nodes[i], role="client")
            cls.node_managers.append(node_manager)
    else:
        role = kwargs.pop("role")
        print(role)
        node_manager = NodeManager(node=GflNode.default_node, role=role)
        cls.node_managers.append(node_manager)
    cls.__startup_node_managers()
def add_standalone_node(cls) -> NoReturn:
    """
    add standalone GFL node
    """
    node = cls.__new_node()
    for i in range(100):  # Limit up to 100 mock nodes to prevent an endless loop here
        if i not in cls.standalone_nodes:
            key_file = PathUtils.join(GflConf.home_dir, "key", "node-%d.json" % i)
            cls.__save_node(node, key_file)
            cls.standalone_nodes[i] = node
            return
    raise ValueError("At most 100 standalone-mode virtual nodes are supported.")
def load_logging_config(cls) -> None:
    """
    Load the logging config from resources/logging.yaml and apply it.
    """
    logging_config_path = PathUtils.join(PathUtils.src_root_dir(), "resources", "logging.yaml")
    with open(logging_config_path) as f:
        text = f.read().replace("{logs_root}", GflConf.logs_dir)
    data = yaml.load(text, yaml.SafeLoader)
    if cls.get_property("debug"):
        data["root"]["level"] = "DEBUG"
        data["loggers"]["gfl"]["level"] = "DEBUG"
    logging.config.dictConfig(data)
def init(cls, force):
    if os.path.exists(GflConf.home_dir):
        if force:
            logging.shutdown()
            shutil.rmtree(GflConf.home_dir)
        else:
            raise ValueError("home dir already exists.")
    # create home dir
    os.makedirs(GflConf.home_dir)
    # generate config file
    GflConf.generate_config(PathUtils.join(GflConf.home_dir, "config.yaml"))
    # generate node address and key
    GflNode.init_node()
    # create data directories
    Lfs.init()
def receive_global_params(cls, job_id: str, cur_round: int):
    # In standalone mode, the trainer fetches the global model for the
    # current aggregation round.
    # Build the path to the aggregated global model parameters from the
    # job_id and cur_round of the Job.
    global_params_dir = JobPath(job_id).global_params_dir(cur_round)
    model_params_path = PathUtils.join(global_params_dir, job_id + '.pkl')
    # Return the model parameters file if it exists.
    if os.path.exists(global_params_dir) and os.path.isfile(model_params_path):
        # resources_already: 1
        # self.__status = JobStatus.RESOURCE_ALREADY
        print("Trainer received the global model.")
        return model_params_path
    else:
        # Could wait for a while and return the file if it appears within
        # that window; this case is not handled for now.
        # Otherwise the parameters file is considered unavailable.
        return None
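A hedged sketch of one standalone round built from the two helpers above; the enclosing class name TrainerUtils and the train_fn callback are illustrative stand-ins, not names from the repo:

import pickle

def run_one_round(client_addr, job_id, cur_round, train_fn):
    # Fetch the aggregated global params for this round, if published.
    params_path = TrainerUtils.receive_global_params(job_id, cur_round)
    if params_path is None:
        return
    with open(params_path, "rb") as f:
        global_params = pickle.load(f)
    # Local training produces this client's updated parameters.
    new_params = train_fn(global_params)
    # Publish the partial params for the aggregator to collect.
    TrainerUtils.send_partial_params(client_addr, job_id, cur_round, new_params)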
def setUp(self) -> None:
    self.dataset = generate_dataset()
    print("dataset_id:" + self.dataset.dataset_id)
    self.job = generate_job()
    print("job_id:" + self.job.job_id)
    self.job.mount_dataset(self.dataset)
    GflNode.init_node()
    node = GflNode.default_node
    self.jobTrainerScheduler = JobTrainScheduler(node=node, job=self.job)
    self.jobTrainerScheduler.register()
    # The aggregator needs to initialize a random model.
    global_params_dir = JobPath(self.job.job_id).global_params_dir(self.job.cur_round)
    # print("global_params_dir:" + global_params_dir)
    os.makedirs(global_params_dir, exist_ok=True)
    model_params_path = PathUtils.join(global_params_dir, self.job.job_id + '.pth')
    # print("model_params_path:" + model_params_path)
    model = Net()
    torch.save(model.state_dict(), model_params_path)
def init_conf(cls):
    """
    Serialize default configuration into a YAML stream.
    """
    with open(PathUtils.join(cls.__home_dir, "conf.yaml"), "w") as f:
        yaml.safe_dump(default_conf, f)
import logging.config
import os
import shutil
import tempfile
import warnings

import yaml

from gfl.utils import PathUtils

os_tempdir = tempfile.gettempdir()
gfl_tempdir = PathUtils.join(os_tempdir, "gfl")
if not os.path.exists(gfl_tempdir):
    os.makedirs(gfl_tempdir, exist_ok=True)


class GflConfMetadata(type):

    @property
    def home_dir(cls):
        return cls._GflConf__home_dir

    @home_dir.setter
    def home_dir(cls, value):
        cls._GflConf__home_dir = PathUtils.abspath(value)
        cls._GflConf__data_dir = PathUtils.join(value, "data")
        cls._GflConf__logs_dir = PathUtils.join(value, "logs")
        cls._GflConf__cache_dir = PathUtils.join(value, "cache")

    @property
    def data_dir(cls):
        return cls._GflConf__data_dir
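A short usage sketch of the metaclass property above; the paths are illustrative:

# Assigning home_dir on the class rewrites every derived directory.
GflConf.home_dir = "/tmp/gfl_home"
print(GflConf.home_dir)  # /tmp/gfl_home (made absolute by PathUtils.abspath)
print(GflConf.data_dir)  # /tmp/gfl_home/data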
def home_dir(cls, value):
    cls._GflConf__home_dir = PathUtils.abspath(value)
    cls._GflConf__data_dir = PathUtils.join(value, "data")
    cls._GflConf__logs_dir = PathUtils.join(value, "logs")
    cls._GflConf__cache_dir = PathUtils.join(value, "cache")
def init(cls):
    os.makedirs(GflConf.cache_dir)
    os.makedirs(GflConf.data_dir)
    os.makedirs(GflConf.logs_dir)
    os.makedirs(PathUtils.join(GflConf.data_dir, "job"))
    os.makedirs(PathUtils.join(GflConf.data_dir, "dataset"))
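For orientation, the layout this creates under the default home dir (~/.gfl, per GflConf below) is roughly:

# ~/.gfl/
#     cache/
#     data/
#         job/
#         dataset/
#     logs/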
class GflConf(object, metaclass=GflConfMetadata):
    # Parameters that can be modified at run time
    __props = {}
    # Parameters that are read from a configuration file and cannot be changed at run time
    __readonly_props = {}

    __home_dir = PathUtils.join(PathUtils.user_home_dir(), ".gfl")
    __data_dir = PathUtils.join(__home_dir, "data")
    __logs_dir = PathUtils.join(__home_dir, "logs")
    __cache_dir = PathUtils.join(__home_dir, "cache")
    __temp_dir = gfl_tempdir

    @classmethod
    def load(cls) -> None:
        """
        Load config properties from the disk file.

        :return:
        """
        base_config_path = PathUtils.join(PathUtils.src_root_dir(), "resources", "config.yaml")
        with open(base_config_path) as f:
            cls.__readonly_props = yaml.load(f, Loader=yaml.SafeLoader)
        path = PathUtils.join(cls.home_dir, "config.yaml")
        if os.path.exists(path):
            with open(path) as f:
                config_data = yaml.load(f, Loader=yaml.SafeLoader)
            cls.__readonly_props.update(config_data)
        if os.path.exists(cls.logs_dir):
            cls.load_logging_config()
        else:
            warnings.warn("cannot find logs dir.")

    @classmethod
    def load_logging_config(cls) -> None:
        """
        Load the logging config from resources/logging.yaml and apply it.
        """
        logging_config_path = PathUtils.join(PathUtils.src_root_dir(), "resources", "logging.yaml")
        with open(logging_config_path) as f:
            text = f.read().replace("{logs_root}", GflConf.logs_dir)
        data = yaml.load(text, yaml.SafeLoader)
        if cls.get_property("debug"):
            data["root"]["level"] = "DEBUG"
            data["loggers"]["gfl"]["level"] = "DEBUG"
        logging.config.dictConfig(data)

    @classmethod
    def generate_config(cls, path: str = None) -> None:
        """
        Generate a config file at ``path``.

        :param path: the config file path; if None, defaults to './config.yaml'.
        :return:
        """
        if path is None:
            path = "config.yaml"
        src_path = PathUtils.join(PathUtils.src_root_dir(), "resources", "config.yaml")
        shutil.copy(src_path, path)

    @classmethod
    def set_config(cls, d: dict) -> None:
        """
        Batch update config properties. Generally, this method is not recommended.

        :param d: a dict of config properties.
        :return:
        """
        cls.__props.update(d.copy())

    @classmethod
    def get_property(cls, key, default=None):
        """
        Get a config value, checking runtime properties first, then readonly ones.

        :param key: a string of the key to get the value
        :param default: return value if key not found
        """
        op_res, val = cls.__get_from_dict(cls.__props, cls.__split_key(key), default)
        if op_res:
            return val
        return cls.__get_from_dict(cls.__readonly_props, cls.__split_key(key), default)[1]

    @classmethod
    def set_property(cls, key, value):
        """
        Set parameters at run time.

        :param key:
        :param value:
        :return:
        """
        cls.__set_to_dict(cls.__props, cls.__split_key(key), value)

    @classmethod
    def remove_property(cls, key):
        cls.__remove_from_dict(cls.__props, cls.__split_key(key))

    @classmethod
    def __split_key(cls, key: str):
        if key is None or key.strip() == "":
            raise ValueError("key cannot be none or empty.")
        return key.split(".")

    @classmethod
    def __exists_in_dict(cls, d: dict, k_seq: list):
        if k_seq is None or len(k_seq) == 0:
            return False
        for k in k_seq:
            if k in d:
                d = d[k]
            else:
                return False
        return True

    @classmethod
    def __get_from_dict(cls, d: dict, k_seq: list, default=None):
        if k_seq is None or len(k_seq) == 0:
            raise ValueError("key cannot be none or empty")
        for k in k_seq:
            if k in d:
                d = d[k]
            else:
                return False, default
        return True, d

    @classmethod
    def __remove_from_dict(cls, d: dict, k_seq: list):
        if k_seq is None or len(k_seq) == 0:
            raise ValueError("key cannot be none or empty")
        for k in k_seq[:-1]:
            if k not in d:
                return False
            d = d[k]
        try:
            del d[k_seq[-1]]
            return True
        except KeyError:
            return False

    @classmethod
    def __set_to_dict(cls, d: dict, k_seq: list, value):
        if k_seq is None or len(k_seq) == 0:
            raise ValueError("key cannot be none or empty")
        for k in k_seq[:-1]:
            if k not in d:
                d[k] = {}
            d = d[k]
        d[k_seq[-1]] = value
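A quick usage sketch of the dotted-key accessors; the keys mirror ones used elsewhere in this section and the values are illustrative:

# Runtime overrides shadow values loaded from config.yaml.
GflConf.set_property("net.mode", "standalone")
assert GflConf.get_property("net.mode") == "standalone"
# The default is returned when a key is absent at both levels.
n = GflConf.get_property("net.standalone.client_number", 3)
GflConf.remove_property("net.mode")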
def __init__(self, id):
    super(JobPath, self).__init__(id)
    self.__root_dir = PathUtils.join(GflConf.data_dir, "job", id)
    self.__metadata_file = PathUtils.join(self.__root_dir, "metadata.json")
    self.__sqlite_file = PathUtils.join(self.__root_dir, "job.sqlite")
    self.__config_dir = PathUtils.join(self.__root_dir, "job")
    self.__job_config_file = PathUtils.join(self.__root_dir, "job", "job.json")
    self.__train_config_file = PathUtils.join(self.__root_dir, "job", "train.json")
    self.__aggregate_config_file = PathUtils.join(self.__root_dir, "job", "aggregate.json")
    self.__module_name = "fl_model"
    self.__module_dir = PathUtils.join(self.__root_dir, "job")
    self.__metrics_dir = PathUtils.join(self.__root_dir, "results", "metrics")
    self.__params_dir = PathUtils.join(self.__root_dir, "results", "params")
    self.__reports_dir = PathUtils.join(self.__root_dir, "results", "reports")
    self.__client_params_dir = PathUtils.join(self.__root_dir, "round-%d", "%s", "params")
    self.__client_word_dir = PathUtils.join(self.__root_dir, "round-%d", "%s", "work")
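The last two attributes are %-templates keyed by round index and client address; calls such as JobPath(job_id).client_params_dir(step, client) elsewhere in this section suggest an accessor along these lines (a hypothetical sketch meant to sit inside JobPath, not the repo's actual code):

def client_params_dir(self, round_idx, client_address):
    # e.g. <data_dir>/job/<id>/round-3/<client_address>/params
    return self.__client_params_dir % (round_idx, client_address)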