def configure_client_instances(
    num_clients: int, num_cpu: int, num_ram: float, gpu: bool = False
) -> Tuple[List[Instance], List[str]]:
    """Create `num_clients` identically sized client instances.

    Every instance is named ``client_<i>`` (``i`` counting from 0) and is
    placed in the "clients" group with the given CPU, RAM and GPU values.

    Returns
    -------
    Tuple[List[Instance], List[str]]
        The instances and, in the same order, their names.
    """
    names: List[str] = []
    created: List[Instance] = []
    for index in range(num_clients):
        client_name = f"client_{index}"
        names.append(client_name)
        created.append(
            Instance(
                name=client_name,
                group="clients",
                num_cpu=num_cpu,
                num_ram=num_ram,
                gpu=gpu,
            )
        )
    return created, names
client_instances_100, client_names_100 = configure_client_instances( num_clients=100, num_cpu=2, num_ram=4) client_instances_10, client_names_10 = configure_client_instances( num_clients=10, num_cpu=2, num_ram=4) SETTINGS = { ### ### FedFS vs FedAvg ### "fn-c50-r40-fedavg-16": Baseline( instances=[ Instance(name="server", group="server", num_cpu=4, num_ram=16) ] + client_instances_100, server=ServerSetting( instance_name="server", strategy="fedavg", rounds=FN_ROUNDS, min_num_clients=FN_MIN_NUM_CLIENTS, sample_fraction=FN_SAMPLE_FRACTION_50, min_sample_size=FN_MIN_SAMPLE_SIZE_50, training_round_timeout=16, lr_initial=FN_LR_INITIAL, partial_updates=False, importance_sampling=False, dynamic_timeout=False, ), clients=configure_clients(
def run(baseline: str, setting: str, adapter: str) -> None:
    """Run baseline.

    Loads the requested baseline setting, brings up a cluster consisting of
    the setting's instances plus one additional "logserver" instance, and
    then performs these steps in order: upload and install the wheel on all
    instances, download the dataset on the "server" and "clients" groups,
    start the logserver, start the server, start every client, and start the
    shutdown watcher script on all instances. Finally a hint on how to tail
    the central logfile is logged.

    Parameters
    ----------
    baseline : str
        Baseline name. Passed to `load_baseline_setting` and to the remote
        commands; if it contains "tf_" the wheel is installed with the
        "examples-tensorflow" extra, otherwise with "examples-pytorch".
    setting : str
        Name of the setting to load for the baseline.
    adapter : str
        Cluster adapter name. "docker" selects the ``/root`` wheel path and
        `DOCKER_PRIVATE_KEY`; any other value selects ``/home/ubuntu`` and
        the private key configured under ``[ssh] private_key``.
    """
    # Imported locally because the module's import block is not touched here.
    from logging import WARNING

    print(f"Starting baseline with {setting} settings.")

    wheel_remote_path = (
        f"/root/{WHEEL_FILENAME}"
        if adapter == "docker"
        else f"/home/ubuntu/{WHEEL_FILENAME}"
    )

    settings = load_baseline_setting(baseline, setting)

    # Get instances and add a logserver to the list. Work on a copy so the
    # list owned by the loaded setting is not mutated; appending to it in
    # place would add another logserver every time run() is called with the
    # same setting object.
    instances = list(settings.instances)
    instances.append(
        Instance(name="logserver", group="logserver", num_cpu=2, num_ram=2)
    )

    # Configure cluster
    log(INFO, "(1/9) Configure cluster.")
    cluster = configure_cluster(adapter, instances, baseline, setting)

    # Start the cluster; this takes some time
    log(INFO, "(2/9) Start cluster.")
    cluster.start()

    # Upload wheel to all instances
    log(INFO, "(3/9) Upload wheel to all instances.")
    cluster.upload_all(WHEEL_LOCAL_PATH, wheel_remote_path)

    # Install the wheel on all instances: first the plain wheel, then once
    # more with the framework-specific extras selected by the baseline name
    log(INFO, "(4/9) Install wheel on all instances.")
    cluster.exec_all(command.install_wheel(wheel_remote_path))
    extras = ["examples-tensorflow"] if "tf_" in baseline else ["examples-pytorch"]
    cluster.exec_all(
        command.install_wheel(
            wheel_remote_path=wheel_remote_path, wheel_extras=extras
        )
    )

    # Download datasets in server and clients
    log(INFO, "(5/9) Download dataset on server and clients.")
    cluster.exec_all(
        command.download_dataset(baseline=baseline), groups=["server", "clients"]
    )

    # Start logserver
    log(INFO, "(6/9) Start logserver.")
    logserver = cluster.get_instance("logserver")
    cluster.exec(
        logserver.name,
        command.start_logserver(
            logserver_s3_bucket=CONFIG.get("aws", "logserver_s3_bucket"),
            logserver_s3_key=f"{baseline}_{setting}_{now()}.log",
        ),
    )

    # Start Flower server on Flower server instances
    log(INFO, "(7/9) Start server.")
    cluster.exec(
        "server",
        command.start_server(
            log_host=f"{logserver.private_ip}:8081",
            baseline=baseline,
            setting=setting,
        ),
    )

    # Start Flower clients
    log(INFO, "(8/9) Start clients.")
    server = cluster.get_instance("server")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Submit one start command per client and remember which client each
        # future belongs to, then block until all of them have finished.
        client_starts = [
            (
                client_setting,
                executor.submit(
                    cluster.exec,
                    client_setting.instance_name,
                    command.start_client(
                        log_host=f"{logserver.private_ip}:8081",
                        server_address=f"{server.private_ip}:8080",
                        baseline=baseline,
                        setting=setting,
                        cid=client_setting.cid,
                    ),
                ),
            )
            for client_setting in settings.clients
        ]
        concurrent.futures.wait([future for _, future in client_starts])

    # An exception raised inside a worker stays stored on its future and
    # would otherwise go unnoticed. Report it, but keep going so the
    # remaining steps still run as before.
    for client_setting, future in client_starts:
        error = future.exception()
        if error is not None:
            log(
                WARNING,
                "Starting client %s on instance %s failed: %r",
                client_setting.cid,
                client_setting.instance_name,
                error,
            )

    # Start the watcher script on all instances. It is meant to shut an
    # instance down after 10min if no Flower process is running on it.
    log(INFO, "(9/9) Start shutdown watcher script.")
    cluster.exec_all(command.watch_and_shutdown("flwr", adapter))

    # Give user info how to tail logfile
    private_key = (
        DOCKER_PRIVATE_KEY
        if adapter == "docker"
        else path.expanduser(CONFIG.get("ssh", "private_key"))
    )

    log(
        INFO,
        "If you would like to tail the central logfile run:\n\n\t%s\n",
        command.tail_logfile(adapter, private_key, logserver),
    )
client_instances_50, client_names_50 = configure_client_instances( num_clients=50, num_cpu=2, num_ram=8) client_instances_10, client_names_10 = configure_client_instances( num_clients=10, num_cpu=2, num_ram=8) SETTINGS = { ### ### FedFS vs FedAvg ### "fn-c25-r50-fedavg-230": Baseline( instances=[ Instance(name="server", group="server", num_cpu=4, num_ram=16) ] + client_instances_50, server=ServerSetting( instance_name="server", strategy="fedavg", rounds=FN_ROUNDS, min_num_clients=FN_MIN_NUM_CLIENTS, sample_fraction=FN_SAMPLE_FRACTION_25, min_sample_size=FN_MIN_SAMPLE_SIZE_25, training_round_timeout=FN_TRAINING_ROUND_TIMEOUT, lr_initial=FN_LR_INITIAL, partial_updates=False, importance_sampling=False, dynamic_timeout=False, ), clients=configure_clients(