def train(args, model, device, federated_train_loader, lr, federate_after_n_batches, epoch, clients_mem):
    model.train()
    nr_batches = federate_after_n_batches

    models = {}
    loss_values = {}

    iter(federated_train_loader)  # initialize iterators
    batches = get_next_batches(federated_train_loader, nr_batches)
    counter = 0

    while True:
        print(
            "Starting training round, batches [{}, {}]".format(counter, counter + nr_batches)
        )
        data_for_all_workers = True
        for worker in batches:
            curr_batches = batches[worker]
            if curr_batches:
                models[worker], loss_values[worker] = train_on_batches(
                    worker, curr_batches, model, device, lr, epoch, clients_mem, args
                )
            else:
                data_for_all_workers = False
        counter += nr_batches
        if not data_for_all_workers:
            print("At least one worker ran out of data, stopping.")
            break

        model = utils.federated_avg(models)
        batches = get_next_batches(federated_train_loader, nr_batches)
    return model
def train(model, device, federated_train_loader, lr, federate_after_n_batches):
    model.train()
    nr_batches = federate_after_n_batches

    models = {}
    loss_values = {}

    iter(federated_train_loader)  # initialize iterators
    batches = get_next_batches(federated_train_loader, nr_batches)
    counter = 0

    while True:
        logger.debug(
            "Starting training round, batches [%s, %s]", counter, counter + nr_batches
        )
        data_for_all_workers = True
        for worker in batches:
            curr_batches = batches[worker]
            if curr_batches:
                models[worker], loss_values[worker] = train_on_batches(
                    worker, curr_batches, model, device, lr
                )
            else:
                data_for_all_workers = False
        counter += nr_batches
        if not data_for_all_workers:
            logger.debug("At least one worker ran out of data, stopping.")
            break

        model = utils.federated_avg(models)
        batches = get_next_batches(federated_train_loader, nr_batches)
    return model
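# Both train() variants above call a get_next_batches() helper that is not shown
# here. A minimal sketch of what it could look like, assuming the federated data
# loader exposes its worker list, yields dicts mapping worker -> batch, and
# raises StopIteration when the data is exhausted (these names and semantics are
# assumptions for illustration, not taken from this source):
def get_next_batches(fdataloader, nr_batches):
    """Collect up to nr_batches batches for every worker of the loader."""
    batches = {worker: [] for worker in fdataloader.workers}
    try:
        for _ in range(nr_batches):
            next_batches = next(fdataloader)  # one dict of {worker: batch}
            for worker in next_batches:
                batches[worker].append(next_batches[worker])
    except StopIteration:
        pass  # a worker ran out of data; its list stays short or empty
    return batches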
async def _train_eval(self):
    """The actual main train/eval loop; not exposed to the user."""
    for curr_epoch in range(1, self.config.train_epochs + 1):
        # ----------------------------------------------------------------------
        # TRAIN
        # ----------------------------------------------------------------------
        logger.info("Starting epoch %d/%d" % (curr_epoch, self.config.train_epochs))

        results = await asyncio.gather(
            *[self._fit_model_on_worker(worker, curr_epoch) for worker in self.workers]
        )

        # ----------------------------------------------------------------------
        # EVAL
        # ----------------------------------------------------------------------
        test_now = curr_epoch % self.config.fed_after_n_batches == 0

        # first evaluate each remote model separately
        if test_now and self.evaluate_each_model:
            for worker_id, worker_model, _ in results:
                self._evaluate_model_on_worker(
                    model_identifier=worker_id,
                    worker=self.workers[0],
                    model=worker_model,
                )

        # update the current models and losses to the latest ones
        models = {}
        loss_values = {}
        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss
        self.model = utils.federated_avg(models)

        # then, evaluate the averaged model on the test set too
        if test_now:
            self._evaluate_model_on_worker(
                model_identifier="Federated model",
                worker=self.workers[0],
                model=self.model,
            )

    if self.config.save_model:
        model_name = "%s_model.pt" % self.config.train_dataset_name
        torch.save(self.model.state_dict(), model_name)
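# utils.federated_avg() appears in every snippet here. A minimal sketch of a
# uniform parameter average over nn.Modules, assuming identical architectures
# and floating-point parameters; this mirrors the helper in spirit only and is
# not the library's actual implementation:
import copy

import torch


def federated_avg(models):
    """Return a new model whose parameters are the element-wise mean of all inputs."""
    model_list = list(models.values())
    avg_model = copy.deepcopy(model_list[0])
    avg_state = avg_model.state_dict()
    with torch.no_grad():
        for key in avg_state:
            # stack the corresponding tensor from every model and average it
            stacked = torch.stack([m.state_dict()[key].float() for m in model_list])
            avg_state[key] = stacked.mean(dim=0)
    avg_model.load_state_dict(avg_state)
    return avg_model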
def fed_avg_every_n_iters(model_pointers, iter_nr, federate_after_n_batches):
    models_local = {}
    if iter_nr % federate_after_n_batches == 0:
        # fetch a copy of each worker's model back to the local worker
        for worker_name, model_pointer in model_pointers.items():
            models_local[worker_name] = model_pointer.copy().get()
        model_avg = utils.federated_avg(models_local)
        # send a fresh copy of the averaged model back to every worker
        for worker in workers_virtual:
            model_copied_avg = model_avg.copy()
            model_ptr = model_copied_avg.send(worker)
            model_pointers[worker.id] = model_ptr
    return model_pointers
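# A hedged usage sketch for fed_avg_every_n_iters(). The Net model, the
# workers_virtual global, total_iterations, and train_one_batch() are
# illustrative assumptions; only the pointer mechanics come from the function above:
model = Net()
model_pointers = {w.id: model.copy().send(w) for w in workers_virtual}
for iter_nr in range(total_iterations):
    for w in workers_virtual:
        train_one_batch(model_pointers[w.id], w)  # hypothetical remote SGD step
    # every federate_after_n_batches iterations, pull, average, and redistribute
    model_pointers = fed_avg_every_n_iters(
        model_pointers, iter_nr, args.federate_after_n_batches
    )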
async def main():
    args = define_and_get_arguments()

    hook = sy.TorchHook(torch)

    kwargs_websocket = {"hook": hook, "verbose": args.verbose, "host": "0.0.0.0"}
    alice = websocket_client.WebsocketClientWorker(id="alice", port=8777, **kwargs_websocket)
    bob = websocket_client.WebsocketClientWorker(id="bob", port=8778, **kwargs_websocket)
    charlie = websocket_client.WebsocketClientWorker(id="charlie", port=8779, **kwargs_websocket)
    testing = websocket_client.WebsocketClientWorker(id="testing", port=8780, **kwargs_websocket)

    for wcw in [alice, bob, charlie, testing]:
        wcw.clear_objects_remote()

    worker_instances = [alice, bob, charlie]

    use_cuda = args.cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = Net().to(device)
    traced_model = torch.jit.trace(model, torch.zeros([1, 1, 28, 28], dtype=torch.float))
    learning_rate = args.lr

    for curr_round in range(1, args.training_rounds + 1):
        logger.info("Training round %s/%s", curr_round, args.training_rounds)

        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=worker,
                    traced_model=traced_model,
                    batch_size=args.batch_size,
                    curr_round=curr_round,
                    max_nr_batches=args.federate_after_n_batches,
                    lr=learning_rate,
                )
                for worker in worker_instances
            ]
        )
        models = {}
        loss_values = {}

        test_models = curr_round % 10 == 1 or curr_round == args.training_rounds
        if test_models:
            logger.info("Evaluating models")
            np.set_printoptions(formatter={"float": "{: .0f}".format})
            for worker_id, worker_model, _ in results:
                evaluate_model_on_worker(
                    model_identifier="Model update " + worker_id,
                    worker=testing,
                    dataset_key="mnist_testing",
                    model=worker_model,
                    nr_bins=10,
                    batch_size=128,
                    print_target_hist=False,
                )

        # Federate models (note that this will also change the model in models[0])
        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss

        traced_model = utils.federated_avg(models)

        if test_models:
            evaluate_model_on_worker(
                model_identifier="Federated model",
                worker=testing,
                dataset_key="mnist_testing",
                model=traced_model,
                nr_bins=10,
                batch_size=128,
                print_target_hist=False,
            )

        # decay learning rate
        learning_rate = max(0.98 * learning_rate, args.lr * 0.01)

    if args.save_model:
        # save the averaged model; saving `model` here would store the untrained original
        torch.save(traced_model.state_dict(), "mnist_cnn.pt")
async def main():
    args = define_and_get_arguments()

    hook = sy.TorchHook(torch)

    kwargs_websocket = {"hook": hook, "verbose": args.verbose, "host": "0.0.0.0"}
    alice = workers.WebsocketClientWorker(id="alice", port=8777, **kwargs_websocket)
    bob = workers.WebsocketClientWorker(id="bob", port=8778, **kwargs_websocket)
    charlie = workers.WebsocketClientWorker(id="charlie", port=8779, **kwargs_websocket)
    testing = workers.WebsocketClientWorker(id="testing", port=8780, **kwargs_websocket)

    worker_instances = [alice, bob, charlie]

    use_cuda = args.cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=args.test_batch_size,
        shuffle=False,
        drop_last=False,
        **kwargs,
    )

    model = Net().to(device)

    (data, target) = next(iter(test_loader))  # Python 3 iterator protocol
    traced_model = torch.jit.trace(model, data)
    learning_rate = args.lr

    for curr_round in range(1, args.training_rounds + 1):
        logger.info("Starting training round %s/%s", curr_round, args.training_rounds)

        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=worker,
                    traced_model=traced_model,
                    batch_size=args.batch_size,
                    curr_round=curr_round,
                    max_nr_batches=args.federate_after_n_batches,
                    lr=learning_rate,
                )
                for worker in worker_instances
            ]
        )
        models = {}
        loss_values = {}

        test_models = curr_round % 10 == 1 or curr_round == args.training_rounds
        if test_models:
            np.set_printoptions(formatter={"float": "{: .0f}".format})
            for worker_id, worker_model, _ in results:
                evaluate_model_on_worker(
                    model_identifier=worker_id,
                    worker=testing,
                    dataset_key="mnist_testing",
                    model=worker_model,
                    nr_bins=10,
                    batch_size=128,
                    print_target_hist=False,
                )

        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss

        traced_model = utils.federated_avg(models)

        if test_models:
            evaluate_model_on_worker(
                model_identifier="Federated model",
                worker=testing,
                dataset_key="mnist_testing",
                model=traced_model,
                nr_bins=10,
                batch_size=128,
                print_target_hist=True,
            )

        # decay learning rate
        learning_rate = max(0.98 * learning_rate, args.lr * 0.01)

    if args.save_model:
        # save the averaged model; saving `model` here would store the untrained original
        torch.save(traced_model.state_dict(), "mnist_cnn.pt")
async def sendmodel(nodes):
    workers = connect_to_nodes(nodes)

    (mock_data, target) = next(iter(test_loader))  # Python 3 iterator protocol
    model = Net()
    global traced_model
    traced_model = torch.jit.trace(model, mock_data)

    print('Performance measurements')
    performance = await asyncio.gather(*[get_performance(worker) for worker in workers])
    cost_dict = {"h" + str(i + 2): performance[i][0] for i in range(len(performance))}
    utility_dict = {"h" + str(i + 2): performance[i][1] for i in range(len(performance))}
    training_count_dict = {"h" + str(i + 2): 0 for i in range(len(performance))}

    loss_data = []
    acc_data = []
    chosen_worker_data = []

    for current_round in range(max_federated_rounds):
        print("Starting round " + str(current_round))

        chosen_workers = choose_worker(cost_dict, utility_dict, training_count_dict)
        chosen_worker_string = ""
        for w in chosen_workers:
            training_count_dict[w] += 1
            chosen_worker_string = chosen_worker_string + w + " "

        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=worker,
                    traced_model=traced_model,
                    batch_size=batchsize,
                    curr_round=current_round,
                    lr=lr,
                    no_federated_epochs=no_epoch,
                )
                for worker in filter(lambda w: w.id in chosen_workers, workers)
            ]
        )

        models = {}
        loss_vals = {}
        network_dictionary = {}
        for worker_id, worker_model, worker_loss, network in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                print("Evaluating WORKER {}".format(worker_id))
                test(worker_model)
                for x in network:
                    print(x + ':' + str(network[x]))

        avg_model = utils.federated_avg(models)
        traced_model = avg_model

        print("Evaluating averaged model")
        accuracy, loss = test(traced_model)
        loss_data.append(loss)
        acc_data.append(accuracy)
        chosen_worker_data.append(chosen_worker_string)

        if accuracy > target_accuracy:
            print("Target accuracy has been reached. Terminating training.")
            break

    print("Finished federated training - closing connections")
    with open('federated_results.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(loss_data)
        writer.writerow(acc_data)
        writer.writerow(chosen_worker_data)

    for worker in workers:
        worker.close()
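# choose_worker() above is not defined in this snippet. A hedged sketch of one
# plausible selection policy (pure assumption, not the original logic): rank
# workers by utility per unit cost, penalize workers that have already trained
# often for fairness, and pick the top k:
def choose_worker(cost_dict, utility_dict, training_count_dict, k=2, fairness=0.1):
    def score(worker_id):
        cost = max(cost_dict[worker_id], 1e-9)  # guard against division by zero
        return utility_dict[worker_id] / cost - fairness * training_count_dict[worker_id]

    ranked = sorted(cost_dict, key=score, reverse=True)
    return ranked[:k]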
async def main():
    # GENERAL SETUP
    args = define_and_get_arguments()
    use_cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    torch.manual_seed(args.seed)
    hook = sy.TorchHook(torch)
    kwargs_websocket = {"hook": hook, "verbose": args.verbose, "host": "0.0.0.0"}

    model = Model(model_args).to(device)  # use cuda if available

    # CLIENT WORKER SETUP
    # workers on the client that interact with remote workers
    client_workers = []
    for i in range(n_workers):
        client_workers.append(
            workers.WebsocketClientWorker(id=f"w_{i}", port=base_port + i, **kwargs_websocket)
        )

    # local client to test the models of each worker as well as the aggregated model;
    # it needs its own port after the n_workers client ports (base_port + i would
    # collide with the last client worker)
    testing = workers.WebsocketClientWorker(id="testing", port=base_port + n_workers, **kwargs_websocket)

    # serialize the model using jit to then send to remote server workers:
    # torch.jit.trace(model, example_input)
    traced_model = torch.jit.trace(model, torch.zeros(1, 42, dtype=torch.float32))

    learning_rate = args.lr  # must be initialized before the decay step below

    # train the model on `args.batch_size` batches on each remote worker;
    # after `args.federate_after_n_batches` batches, aggregate the model;
    # periodically evaluate each worker's model and the aggregate model
    for r in range(1, args.training_rounds + 1):
        logger.info(f"Starting training round {r}/{args.training_rounds}")

        # for each training round, distribute the model across workers and
        # wait until the models on all remote server workers are trained
        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=worker,
                    traced_model=traced_model,
                    batch_size=args.batch_size,
                    curr_round=r,
                    max_nr_batches=args.federate_after_n_batches,
                    lr=learning_rate,
                )
                for worker in client_workers
            ]
        )
        models = {}
        loss_values = {}

        # run a test iteration every 10 training rounds and on the last round
        test_models = r % 10 == 1 or r == args.training_rounds
        if test_models:
            # test each remote server worker's model
            np.set_printoptions(formatter={"float": "{: .0f}".format})
            for worker_id, worker_model, _ in results:
                evaluate_model_on_worker(
                    model_identifier=worker_id,
                    worker=testing,
                    dataset_key="credit_testing",
                    model=worker_model,
                    nr_bins=10,
                    batch_size=128,
                    print_target_hist=False,
                )

        # collect the models and losses from the individual workers
        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss

        # aggregate the model
        traced_model = utils.federated_avg(models)

        if test_models:
            # test the aggregated model
            evaluate_model_on_worker(
                model_identifier="federated model",
                worker=testing,
                dataset_key="credit_testing",
                model=traced_model,
                nr_bins=10,
                batch_size=128,
            )

        # decay learning rate
        learning_rate = max(0.98 * learning_rate, args.lr * 0.01)

    if args.save_model:
        torch.save(traced_model.state_dict(), "credit_rating.pt")  # save the aggregated model
def main():
    args = args_parser()
    logs = LogSaver(args)

    acc_test, loss_test, acc_train, loss_train = [], [], [], []

    # ToDo change this
    classes = [i for i in range(args.num_classes)]
    distrib = Distribute(args.num_workers, len(classes))
    train_data_distribution = copy.deepcopy(
        distrib.get_distribution(args.dstr_Train, args.n_labels_per_agent_Train, args.sub_labels_Train)
    )
    test_data_distribution = copy.deepcopy(
        distrib.get_distribution(args.dstr_Test, args.n_labels_per_agent_Test, args.sub_labels_Test)
    )
    print(train_data_distribution, "\n\n TEST DISTRIBUTION", test_data_distribution)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    fed_trainloaders, fed_testloaders, workers = get_dataloaders(
        train_data_distribution,
        test_data_distribution,
        args.dataset,
        args.train_bs,
        args.test_bs,
        args.num_workers,
    )
    print("TRAINLOADERS ARE CREATED")

    batches = extract_batches_per_worker(fed_trainloaders)
    batches_test = extract_batches_per_worker(fed_testloaders)

    net = Net(10)
    # copy weights
    # w_glob = net.state_dict()
    net.to(device)
    loss_func = nn.CrossEntropyLoss()

    for rnd in range(args.rounds):
        w_local = {}
        n = []
        # For now all of the updates are calculated sequentially
        for worker in workers:
            trainloader = batches[worker]
            # Batch size is needed to calculate accuracy
            w, loss, acc = train(worker, net, trainloader, loss_func, args.local_ep, args.train_bs, device=device)
            # ToDo w -> w.state_dict()
            w_local[worker] = w  # .state_dict()
            n.append(len(trainloader))
            loss_train.append(copy.deepcopy(loss))
            acc_train.append(copy.deepcopy(acc))

        net = federated_avg(w_local)
        # w_glob = FedAvg(w_local, n)
        # Analog to model distribution
        # net.load_state_dict(w_glob)

        # Perform tests after the global update
        for worker in workers:
            testloader = batches_test[worker]
            loss, acc = test(worker, net, testloader, loss_func, args.test_bs, device=device)
            print(worker.id, "loss", loss, "acc", acc)
            acc_test.append(copy.deepcopy(acc))
            loss_test.append(copy.deepcopy(loss))
        print("Round", rnd)
        # print(acc_train[-1], loss_train[-1], acc_test[-1], loss_test[-1])
        logs.add_row(acc_train, loss_train, acc_test, loss_test)

    print("End of training")
    print(acc_train, "\n\n", type(acc_train))
    logs.plot(loss_train, loss_test, np.array(acc_train), np.array(acc_test))
    print("Plots are created\n", acc_train, "\n\n", loss_train)
    logs.save_model(net)
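# The commented-out FedAvg(w_local, n) above hints at a sample-size-weighted
# average, w_glob = sum_i (n_i / sum_j n_j) * w_i, instead of the uniform
# federated_avg(). A minimal sketch over state_dicts (the signature is inferred
# from the commented call and is an assumption, not the original code; it also
# assumes floating-point parameters):
def FedAvg(w_local, n):
    """Weighted average of per-worker state_dicts, weighted by sample counts n."""
    worker_ids = list(w_local.keys())
    total = float(sum(n))
    w_glob = copy.deepcopy(w_local[worker_ids[0]])
    for key in w_glob.keys():
        w_glob[key] = w_glob[key] * (n[0] / total)
        for i in range(1, len(worker_ids)):
            w_glob[key] += w_local[worker_ids[i]][key] * (n[i] / total)
    return w_glob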
async def sendmodel(nodes=10):
    totalnetworkcost = 0
    workers = connect_to_nodes(nodes)

    (mock_data, target) = next(iter(test_loader))  # Python 3 iterator protocol
    model = Net()
    global traced_model
    traced_model = torch.jit.trace(model, mock_data)

    print('Performance measurements')
    performance = await asyncio.gather(*[get_performance(worker) for worker in workers])
    cost_dict = {"h" + str(i + 2): performance[i][0] for i in range(len(performance))}
    utility_dict = {"h" + str(i + 2): performance[i][1] for i in range(len(performance))}
    training_count_dict = {"h" + str(i + 2): 0 for i in range(len(performance))}

    # write the CSV header first
    to_file = ["Round", "Accuracy", "Loss", "Workers called", "Total networkcost"]
    with open('federated_results.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(to_file)

    for current_round in range(max_federated_rounds):
        centralizedmonitor = monitoring()
        centralizedmonitor.start()
        print("Starting round " + str(current_round))

        chosen_workers = choose_worker(cost_dict, utility_dict, training_count_dict)
        chosen_worker_string = ""
        for w in chosen_workers:
            training_count_dict[w] += 1
            chosen_worker_string = chosen_worker_string + w + " "

        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=worker,
                    traced_model=traced_model,
                    batch_size=batchsize,
                    curr_round=current_round,
                    lr=lr,
                    no_federated_epochs=no_federated_epochs,
                )
                for worker in filter(lambda w: w.id in chosen_workers, workers)
            ]
        )

        models = {}
        loss_vals = {}
        network_dictionary = {}

        centralizedmonitor.stop()
        costofround = centralizedmonitor.getnetworkcost()
        costofround = costofround / 1000000  # scale the monitor's count (presumably bytes to MB)
        totalnetworkcost += costofround

        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model

        avg_model = utils.federated_avg(models)
        traced_model = avg_model

        print("Evaluating averaged model")
        accuracy, loss = test(traced_model)

        string_accuracy = '{:.2f}'.format(accuracy)
        string_loss = str(loss)
        string_round = str(current_round + 1)
        to_file = [string_round, string_accuracy, string_loss, chosen_worker_string, totalnetworkcost]
        with open('federated_results.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(to_file)

        if accuracy > target_accuracy:
            print("Target accuracy has been reached. Terminating training.")
            break

    print("Finished federated training - closing connections")
    for worker in workers:
        worker.close()
async def main():
    """ Main """
    hook = sy.TorchHook(torch)

    parser = argparse.ArgumentParser(description='Train and validate a federated model')
    parser.add_argument('config', type=str, help='Configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config)

    # Train configuration
    config_rounds = config.getint('TRAIN', 'rounds')
    config_epochs = config.getint('TRAIN', 'epochs')
    config_batch = config.getint('TRAIN', 'batch')
    config_optimizer = config.get('TRAIN', 'optimizer')
    config_lr = config.getfloat('TRAIN', 'lr')
    config_shuffle = config.getboolean('TRAIN', 'shuffle')

    clients = {}
    clients_results = {}
    for section in config.sections():
        if section.startswith('WORKER'):
            kwargs_websocket = {
                'hook': hook,
                'id': config.get(section, 'id'),
                'host': config.get(section, 'host'),
                'port': config.getint(section, 'port'),
                'verbose': config.getboolean(section, 'verbose'),
            }
            federation_participant = config.getboolean(section, 'federation_participant')
            client = CustomWebsocketClientWorker(**kwargs_websocket)
            client.federation_participant = federation_participant
            client.clear_objects_remote()
            clients[kwargs_websocket['id']] = client
            clients_results[kwargs_websocket['id']] = []

    model = Classifier()
    traced_model = trace(model, torch.zeros([1, 10], dtype=torch.float))

    for curr_round in range(config_rounds):
        print('Round %s/%s. Ding ding!' % (curr_round + 1, config_rounds))

        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=clients[client],
                    traced_model=traced_model,
                    optimizer=config_optimizer,
                    batch_size=config_batch,
                    epochs=config_epochs,
                    lr=config_lr,
                    dataset_key='test',
                    shuffle=config_shuffle,
                )
                for client in clients
                if clients[client].federation_participant
            ]
        )
        print('Training done!')

        print('Federating model ... ', end='')
        models = {}
        for worker_id, worker_model in results:
            if worker_model is not None:
                models[worker_id] = worker_model
        traced_model = utils.federated_avg(models)
        print('Done!')

        for client in clients:
            # Evaluate on the train set
            train_loss, train_confusion_matrix = evaluate_model_on_worker(
                worker=clients[client],
                dataset_key='train',
                model=traced_model,
                batch_size=config_batch,
            )
            # Evaluate on the test set
            test_loss, test_confusion_matrix = evaluate_model_on_worker(
                worker=clients[client],
                dataset_key='test',
                model=traced_model,
                batch_size=config_batch,
            )
            clients_results[client].append((train_loss, test_loss, test_confusion_matrix))
            print('"%s" => Train loss: %.4f. Test loss: %.4f' % (client, train_loss, test_loss))

    print('Confusion matrices:')
    for client in clients_results:
        print('Model "%s" stats:' % client)
        train_losses = [cr[0] for cr in clients_results[client]]
        test_losses = [cr[1] for cr in clients_results[client]]
        conf_matrices = [cr[2] for cr in clients_results[client]]
        show_results(conf_matrices, train_losses, test_losses, label=client, loss_xlabel='Round')