np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

# GPU arrangement: please customize this function according to your own topology.
# The GPU server list is configured in "mpi_host_file".
# If we have 4 machines with two GPUs each, and the FL network has 8 workers plus a central worker,
# the 4 machines will be assigned as follows:
# machine 1: worker0, worker4, worker8;
# machine 2: worker1, worker5;
# machine 3: worker2, worker6;
# machine 4: worker3, worker7.
# That is, workers are assigned in the order of the machine list.
logging.info("process_id = %d, size = %d" % (process_id, worker_number))
device = init_training_device(process_id, worker_number - 1, args.gpu_num_per_server)

# load data
dataset = load_data(args, args.dataset)
[train_data_num, test_data_num, train_data_global, test_data_global,
 train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset

# create the model.
# Note: if the model is a DNN (e.g., ResNet), training will be very slow.
# In that case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg).
model = create_model(args, model_name=args.model, output_dim=dataset[7])

# start "federated averaging (FedAvg)"
FedML_FedAvg_distributed(process_id, worker_number, device, comm, model,
                         train_data_num, train_data_global, test_data_global,
                         train_data_local_num_dict, train_data_local_dict,
                         test_data_local_dict, args)
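The comments above describe a round-robin placement of workers onto the machine list. A minimal sketch of what init_training_device could look like under that scheme; the body below is an illustrative assumption, not necessarily FedML's actual implementation:

import torch

def init_training_device(process_id, fl_worker_num, gpu_num_per_server):
    # Sketch only (assumed logic): process 0 is the central server and pins
    # to GPU 0; client i lands on GPU (i % gpu_num_per_server). Combined with
    # MPI ranks being laid out over the machine list in order, this yields
    # the worker-to-machine assignment described in the comments above.
    if not torch.cuda.is_available():
        return torch.device("cpu")
    if process_id == 0:
        return torch.device("cuda:0")
    gpu_index = (process_id - 1) % gpu_num_per_server
    return torch.device("cuda:" + str(gpu_index))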
        args.lr), config=args)

# Set the random seeds: the np.random seed determines the dataset partition,
# and torch.manual_seed determines the initial model weights.
# We fix both so that the results are reproducible.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

num_FL_workers = args.client_num_per_round + 1

# Please check "GPU_MAPPING.md" to see how to define the topology.
device = mapping_processes_to_gpu_device_from_yaml_file(args.fl_worker_index, num_FL_workers,
                                                        args.gpu_mapping_file, args.gpu_mapping_key)

# load data
dataset = load_data(args, args.dataset)
[train_data_num, test_data_num, train_data_global, test_data_global,
 train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset

# create the model.
# Note: if the model is a DNN (e.g., ResNet), training will be very slow.
# In that case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg).
model = create_model(args, model_name=args.model, output_dim=dataset[7])

# start "federated averaging (FedAvg)"; no MPI communicator is passed here (comm=None)
FedML_FedAvg_distributed(args.fl_worker_index, num_FL_workers, device, None, model,
                         train_data_num, train_data_global, test_data_global,
                         train_data_local_num_dict, train_data_local_dict,
                         test_data_local_dict, args)
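For reference, here is a minimal sketch of how a YAML-driven mapping like the one above could be resolved to a torch device. The assumed file layout (gpu_mapping_key -> {hostname: [num_processes_on_gpu_0, num_processes_on_gpu_1, ...]}) and the helper name map_process_to_gpu are illustrative assumptions, not FedML's API; see GPU_MAPPING.md for the authoritative format:

import torch
import yaml  # PyYAML, assumed available

def map_process_to_gpu(process_id, worker_number, gpu_mapping_file, gpu_mapping_key):
    # Hypothetical helper: flatten the per-host, per-GPU process counts
    # into one GPU slot per process, then index by the process rank.
    with open(gpu_mapping_file, "r") as f:
        mapping = yaml.safe_load(f)[gpu_mapping_key]
    gpu_slots = []
    for host, procs_per_gpu in mapping.items():
        for gpu_index, num_procs in enumerate(procs_per_gpu):
            gpu_slots.extend([gpu_index] * num_procs)
    assert len(gpu_slots) >= worker_number, "mapping must cover every FL worker"
    if torch.cuda.is_available():
        return torch.device("cuda:%d" % gpu_slots[process_id])
    return torch.device("cpu")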