Ejemplo n.º 1
0
def train(args, model, device, federated_train_loader, lr, federate_after_n_batches, epoch, clients_mem):
    """Train ``model`` in federated rounds until a worker has no batches left.

    Every round fetches up to ``federate_after_n_batches`` batches per worker
    with ``get_next_batches``, trains each worker that received batches via
    ``train_on_batches`` and then replaces ``model`` with
    ``utils.federated_avg`` of the per-worker models.

    The loop ends at the first round in which some worker received no
    batches.  Models trained during that last, partial round are not averaged
    into the result.

    Args:
        args: Forwarded unchanged to ``train_on_batches``.
        model: Model to train; ``model.train()`` is called on it up front.
        device: Forwarded unchanged to ``train_on_batches``.
        federated_train_loader: Loader handed to ``get_next_batches``.
        lr: Forwarded unchanged to ``train_on_batches``.
        federate_after_n_batches: Number of batches requested per round.
        epoch: Forwarded unchanged to ``train_on_batches``.
        clients_mem: Forwarded unchanged to ``train_on_batches``.

    Returns:
        The model from the last completed averaging step, or the model that
        was passed in when no round completed.
    """
    model.train()

    round_size = federate_after_n_batches
    worker_models = {}
    worker_losses = {}  # collected per worker, not used further here

    # Called only for its side effect; presumably this (re)initialises the
    # loader's internal iterators -- TODO confirm against the loader class.
    iter(federated_train_loader)
    pending = get_next_batches(federated_train_loader, round_size)
    first_batch = 0

    while True:
        round_end = first_batch + round_size
        print("Starting training round, batches [{}, {}]".format(first_batch, round_end))

        some_worker_exhausted = False
        for worker in pending:
            worker_batches = pending[worker]
            if not worker_batches:
                some_worker_exhausted = True
                continue
            trained_model, loss = train_on_batches(
                worker, worker_batches, model, device, lr, epoch, clients_mem, args
            )
            worker_models[worker] = trained_model
            worker_losses[worker] = loss

        first_batch = round_end
        if some_worker_exhausted:
            print("At least one worker ran out of data, stopping.")
            break

        model = utils.federated_avg(worker_models)
        pending = get_next_batches(federated_train_loader, round_size)

    return model
Ejemplo n.º 2
0
def train(model, device, federated_train_loader, lr, federate_after_n_batches):
    """Run federated training rounds until one worker runs out of data.

    A round consists of fetching ``federate_after_n_batches`` batches per
    worker (``get_next_batches``), training every worker that got batches
    (``train_on_batches``) and averaging the resulting models
    (``utils.federated_avg``).  As soon as a round leaves at least one worker
    without batches the loop stops *before* averaging that round.

    Args:
        model: Model to train; put into training mode first.
        device: Forwarded unchanged to ``train_on_batches``.
        federated_train_loader: Loader handed to ``get_next_batches``.
        lr: Forwarded unchanged to ``train_on_batches``.
        federate_after_n_batches: Number of batches requested per round.

    Returns:
        The most recently averaged model, or the input model if the very
        first round was already incomplete.
    """
    model.train()

    per_round = federate_after_n_batches
    models_by_worker = {}
    losses_by_worker = {}  # collected per worker, not used further here

    iter(federated_train_loader)  # initialize iterators
    round_batches = get_next_batches(federated_train_loader, per_round)
    seen = 0

    while True:
        logger.debug("Starting training round, batches [%s, %s]", seen,
                     seen + per_round)

        everyone_had_data = True
        for worker in round_batches:
            worker_batches = round_batches[worker]
            if not worker_batches:
                everyone_had_data = False
                continue
            fitted, loss = train_on_batches(
                worker, worker_batches, model, device, lr)
            models_by_worker[worker] = fitted
            losses_by_worker[worker] = loss

        seen += per_round
        if not everyone_had_data:
            logger.debug("At least one worker ran out of data, stopping.")
            break

        model = utils.federated_avg(models_by_worker)
        round_batches = get_next_batches(federated_train_loader, per_round)

    return model
Ejemplo n.º 3
0
    async def _train_eval(self):
        """Internal train/evaluate loop; not part of the public interface.

        For each epoch from 1 to ``self.config.train_epochs``: fit the model
        on all ``self.workers`` concurrently, optionally evaluate the
        individual worker models, replace ``self.model`` with the federated
        average of the non-``None`` worker models, and optionally evaluate
        that averaged model.  Evaluation always runs on ``self.workers[0]``.

        After the last epoch the averaged model's ``state_dict`` is saved to
        ``<train_dataset_name>_model.pt`` when ``self.config.save_model`` is
        set.
        """
        for curr_epoch in range(1, self.config.train_epochs + 1):
            # ---------------------------- TRAIN ----------------------------
            logger.info("Starting epoch %d/%d", curr_epoch,
                        self.config.train_epochs)
            fit_jobs = [
                self._fit_model_on_worker(worker, curr_epoch)
                for worker in self.workers
            ]
            results = await asyncio.gather(*fit_jobs)

            # ---------------------------- EVAL -----------------------------
            # Evaluation is due whenever the epoch number is a multiple of
            # the configured ``fed_after_n_batches`` value.
            test_now = (curr_epoch % self.config.fed_after_n_batches) == 0

            # Evaluate every remote model on its own first.
            if test_now and self.evaluate_each_model:
                for worker_id, worker_model, _ in results:
                    self._evaluate_model_on_worker(
                        model_identifier=worker_id,
                        worker=self.workers[0],
                        model=worker_model)

            # Keep only the workers that actually returned a model.
            usable = [entry for entry in results if entry[1] is not None]
            models = {worker_id: worker_model
                      for worker_id, worker_model, _ in usable}
            loss_values = {worker_id: worker_loss
                           for worker_id, _, worker_loss in usable}

            self.model = utils.federated_avg(models)

            # Finally evaluate the averaged model as well.
            if test_now:
                self._evaluate_model_on_worker(
                    model_identifier="Federated model",
                    worker=self.workers[0],
                    model=self.model)

        if not self.config.save_model:
            return
        model_name = "%s_model.pt" % self.config.train_dataset_name
        torch.save(self.model.state_dict(), model_name)
def fed_avg_every_n_iters(model_pointers, iter, federate_after_n_batches):
    """Average the remote models every ``federate_after_n_batches`` iterations.

    When ``iter`` is a multiple of ``federate_after_n_batches`` a copy of
    every pointed-to model is fetched, the copies are averaged with
    ``utils.federated_avg`` and a fresh copy of the average is sent to each
    worker in the module-level ``workers_virtual``.  ``model_pointers`` is
    updated in place with the new pointers.  On any other iteration nothing
    happens.

    Args:
        model_pointers: Mapping of worker name/id to a model pointer.
        iter: Current iteration number (name kept for caller compatibility
            even though it shadows the builtin).
        federate_after_n_batches: Averaging period in iterations.

    Returns:
        The same ``model_pointers`` mapping that was passed in.
    """
    # Use the period passed by the caller; the previous code silently ignored
    # this parameter and read a global ``args`` object instead.
    if iter % federate_after_n_batches != 0:
        return model_pointers

    # Fetch a copy of each remote model so the remote originals stay in place.
    models_local = {
        worker_name: model_pointer.copy().get()
        for worker_name, model_pointer in model_pointers.items()
    }
    model_avg = utils.federated_avg(models_local)

    # Each worker gets its own copy of the averaged model.
    for worker in workers_virtual:
        model_pointers[worker.id] = model_avg.copy().send(worker)

    return model_pointers
Ejemplo n.º 5
0
async def main():
    """Run federated MNIST training against websocket workers.

    Connects to the ``alice``, ``bob`` and ``charlie`` training workers and
    to a ``testing`` worker.  For every training round the current traced
    model is fitted on all training workers concurrently, the returned models
    are averaged with ``utils.federated_avg`` and the learning rate is
    decayed by 2 % (never below 1 % of ``args.lr``).  On rounds 1, 11, 21, ...
    and on the final round every worker model and the averaged model are
    evaluated on the ``testing`` worker.

    When ``args.save_model`` is set, the weights of the final federated model
    are written to ``mnist_cnn.pt``.
    """
    args = define_and_get_arguments()

    hook = sy.TorchHook(torch)

    kwargs_websocket = {"hook": hook, "verbose": args.verbose, "host": "0.0.0.0"}
    alice = websocket_client.WebsocketClientWorker(id="alice", port=8777, **kwargs_websocket)
    bob = websocket_client.WebsocketClientWorker(id="bob", port=8778, **kwargs_websocket)
    charlie = websocket_client.WebsocketClientWorker(id="charlie", port=8779, **kwargs_websocket)
    testing = websocket_client.WebsocketClientWorker(id="testing", port=8780, **kwargs_websocket)

    # Start from a clean slate on every remote worker.
    for wcw in [alice, bob, charlie, testing]:
        wcw.clear_objects_remote()

    worker_instances = [alice, bob, charlie]

    use_cuda = args.cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    model = Net().to(device)

    # Trace with a dummy 1x1x28x28 input so the model can be serialised and
    # shipped to the workers.
    traced_model = torch.jit.trace(model, torch.zeros([1, 1, 28, 28], dtype=torch.float))
    learning_rate = args.lr

    for curr_round in range(1, args.training_rounds + 1):
        logger.info("Training round %s/%s", curr_round, args.training_rounds)

        results = await asyncio.gather(
            *[
                fit_model_on_worker(
                    worker=worker,
                    traced_model=traced_model,
                    batch_size=args.batch_size,
                    curr_round=curr_round,
                    max_nr_batches=args.federate_after_n_batches,
                    lr=learning_rate,
                )
                for worker in worker_instances
            ]
        )
        models = {}
        loss_values = {}

        test_models = curr_round % 10 == 1 or curr_round == args.training_rounds
        if test_models:
            logger.info("Evaluating models")
            np.set_printoptions(formatter={"float": "{: .0f}".format})
            for worker_id, worker_model, _ in results:
                evaluate_model_on_worker(
                    model_identifier="Model update " + worker_id,
                    worker=testing,
                    dataset_key="mnist_testing",
                    model=worker_model,
                    nr_bins=10,
                    batch_size=128,
                    print_target_hist=False,
                )

        # Federate models (note that this will also change the model in models[0])
        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss

        traced_model = utils.federated_avg(models)

        if test_models:
            evaluate_model_on_worker(
                model_identifier="Federated model",
                worker=testing,
                dataset_key="mnist_testing",
                model=traced_model,
                nr_bins=10,
                batch_size=128,
                print_target_hist=False,
            )

        # decay learning rate
        learning_rate = max(0.98 * learning_rate, args.lr * 0.01)

    if args.save_model:
        # Save the federated result.  ``traced_model`` is rebound to the
        # averaged model every round, so ``model`` no longer refers to the
        # trained weights at this point.
        torch.save(traced_model.state_dict(), "mnist_cnn.pt")
Ejemplo n.º 6
0
async def main():
    """Run federated MNIST training, tracing the model with a real test batch.

    Connects to the ``alice``, ``bob`` and ``charlie`` training workers and
    to a ``testing`` worker, builds a local MNIST test ``DataLoader`` (used
    here only to obtain an example input for tracing) and then, for every
    training round, fits the traced model on all training workers
    concurrently, averages the results with ``utils.federated_avg`` and
    decays the learning rate by 2 % (never below 1 % of ``args.lr``).  On
    rounds 1, 11, 21, ... and on the final round the worker models and the
    averaged model are evaluated on the ``testing`` worker.

    When ``args.save_model`` is set, the weights of the final federated model
    are written to ``mnist_cnn.pt``.
    """
    args = define_and_get_arguments()

    hook = sy.TorchHook(torch)

    kwargs_websocket = {
        "hook": hook,
        "verbose": args.verbose,
        "host": "0.0.0.0"
    }
    alice = workers.WebsocketClientWorker(id="alice",
                                          port=8777,
                                          **kwargs_websocket)
    bob = workers.WebsocketClientWorker(id="bob",
                                        port=8778,
                                        **kwargs_websocket)
    charlie = workers.WebsocketClientWorker(id="charlie",
                                            port=8779,
                                            **kwargs_websocket)
    testing = workers.WebsocketClientWorker(id="testing",
                                            port=8780,
                                            **kwargs_websocket)

    worker_instances = [alice, bob, charlie]

    use_cuda = args.cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.test_batch_size,
        shuffle=False,
        drop_last=False,
        **kwargs,
    )

    model = Net().to(device)

    # Use the builtin ``next``/``iter`` protocol; the ``.next()`` method on
    # the loader iterator is a legacy alias that is not always available.
    data, _ = next(iter(test_loader))
    traced_model = torch.jit.trace(model, data)
    learning_rate = args.lr

    for curr_round in range(1, args.training_rounds + 1):
        logger.info("Starting training round %s/%s", curr_round,
                    args.training_rounds)

        results = await asyncio.gather(*[
            fit_model_on_worker(
                worker=worker,
                traced_model=traced_model,
                batch_size=args.batch_size,
                curr_round=curr_round,
                max_nr_batches=args.federate_after_n_batches,
                lr=learning_rate,
            ) for worker in worker_instances
        ])
        models = {}
        loss_values = {}

        test_models = curr_round % 10 == 1 or curr_round == args.training_rounds
        if test_models:
            np.set_printoptions(formatter={"float": "{: .0f}".format})
            for worker_id, worker_model, _ in results:
                evaluate_model_on_worker(
                    model_identifier=worker_id,
                    worker=testing,
                    dataset_key="mnist_testing",
                    model=worker_model,
                    nr_bins=10,
                    batch_size=128,
                    print_target_hist=False,
                )

        # Only workers that returned a model take part in the average.
        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss

        traced_model = utils.federated_avg(models)
        if test_models:
            evaluate_model_on_worker(
                model_identifier="Federated model",
                worker=testing,
                dataset_key="mnist_testing",
                model=traced_model,
                nr_bins=10,
                batch_size=128,
                print_target_hist=True,
            )

        # decay learning rate
        learning_rate = max(0.98 * learning_rate, args.lr * 0.01)

    if args.save_model:
        # Save the federated result.  ``traced_model`` is rebound to the
        # averaged model every round, so ``model`` no longer refers to the
        # trained weights at this point.
        torch.save(traced_model.state_dict(), "mnist_cnn.pt")
Ejemplo n.º 7
0
async def sendmodel(nodes):
    """Run federated training with per-round worker selection.

    Connects to ``nodes`` via ``connect_to_nodes``, traces a fresh ``Net``
    with one batch from the module-level ``test_loader`` and measures every
    worker with ``get_performance``.  Each of up to ``max_federated_rounds``
    rounds then selects workers with ``choose_worker``, fits the current
    model on the selected workers concurrently, evaluates each returned model
    and the federated average with ``test`` and stops early once the averaged
    accuracy exceeds ``target_accuracy``.

    Per-round loss, accuracy and chosen workers are written as three rows to
    ``federated_results.csv`` (overwriting the file).  Worker connections are
    closed on exit, including when an error interrupts training.

    Args:
        nodes: Forwarded unchanged to ``connect_to_nodes``.

    Side effects:
        Rebinds the module-level ``traced_model`` to the latest model.
    """
    global traced_model

    workers = connect_to_nodes(nodes)
    try:
        # Builtin iterator protocol instead of the legacy ``.next()`` alias.
        mock_data, _ = next(iter(test_loader))
        model = Net()
        traced_model = torch.jit.trace(model, mock_data)

        print('Performance measurments')
        performance = await asyncio.gather(
            *[get_performance(worker) for worker in workers])

        # Keys are "h2", "h3", ... in the order the workers were measured.
        host_names = ["h" + str(i + 2) for i in range(len(performance))]
        cost_dict = {
            name: perf[0] for name, perf in zip(host_names, performance)
        }
        utility_dict = {
            name: perf[1] for name, perf in zip(host_names, performance)
        }
        training_count_dict = {name: 0 for name in host_names}

        loss_data = []
        acc_data = []
        chosen_worker_data = []
        for current_round in range(max_federated_rounds):
            print("Starting round" + str(current_round))
            # Select once per round (the selection used to be computed twice
            # in a row, with the first result thrown away).
            chosen_workers = choose_worker(cost_dict, utility_dict,
                                           training_count_dict)
            chosen_worker_string = ""
            for w in chosen_workers:
                training_count_dict[w] += 1
                chosen_worker_string = chosen_worker_string + w + " "

            results = await asyncio.gather(*[
                fit_model_on_worker(worker=worker,
                                    traced_model=traced_model,
                                    batch_size=batchsize,
                                    curr_round=current_round,
                                    lr=lr,
                                    no_federated_epochs=no_epoch)
                for worker in workers if worker.id in chosen_workers
            ])

            models = {}
            for worker_id, worker_model, worker_loss, network in results:
                if worker_model is None:
                    continue
                models[worker_id] = worker_model
                print("Evaluating WORKER {}".format(worker_id))
                test(worker_model)
                for x in network:
                    print(x + ':' + str(network[x]))

            traced_model = utils.federated_avg(models)
            print("Evaluating averaged model")
            accuracy, loss = test(traced_model)
            loss_data.append(loss)
            acc_data.append(accuracy)
            chosen_worker_data.append(chosen_worker_string)
            if accuracy > target_accuracy:
                print("Target accuracy has been reached. Terminating training.")
                break

        print("Finished Federated training - closing connections")
        with open('federated_results.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter=' ',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow(loss_data)
            writer.writerow(acc_data)
            writer.writerow(chosen_worker_data)
    finally:
        # Always release the connections, even if training failed part-way.
        for worker in workers:
            worker.close()
async def main():
    """Run federated training of the credit model over websocket workers.

    Creates ``n_workers`` client workers on consecutive ports starting at
    ``base_port`` plus a ``testing`` worker, traces the model with a dummy
    ``1 x 42`` float input and then, for every training round, fits the
    traced model on all client workers concurrently, averages the returned
    models with ``utils.federated_avg`` and decays the learning rate by 2 %
    (never below 1 % of ``args.lr``).  On rounds 1, 11, 21, ... and on the
    final round every worker model and the averaged model are evaluated on
    the ``testing`` worker.

    When ``args.save_model`` is set, the weights of the final federated model
    are written to ``credit_rating.pt``.
    """
    # GENERAL SETUP
    args = define_and_get_arguments()
    use_cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    torch.manual_seed(args.seed)
    hook = sy.TorchHook(torch)
    kwargs_websocket = {
        "hook": hook,
        "verbose": args.verbose,
        "host": "0.0.0.0"
    }
    model = Model(model_args).to(device)  # use cuda if available

    # CLIENT WORKER SETUP
    # workers on the client that interact with remote workers
    client_workers = []
    for i in range(n_workers):
        client_workers.append(
            workers.WebsocketClientWorker(id=f"w_{i}",
                                          port=base_port + i,
                                          **kwargs_websocket))

    # local client to test the models of each client as well as of the
    # aggregated model
    # NOTE(review): ``i`` is the leftover loop index, so this is the same
    # port as the last client worker (and undefined when n_workers == 0).
    # ``base_port + n_workers`` may have been intended -- confirm against
    # the server setup before changing.
    testing = workers.WebsocketClientWorker(id="testing",
                                            port=base_port + i,
                                            **kwargs_websocket)

    # serialize the model using jit to then send to remote server workers
    # (arguments: model, example_input)
    traced_model = torch.jit.trace(model,
                                   torch.zeros(1, 42, dtype=torch.float32))

    # The learning rate has to exist before the first decay step below and
    # has to be the value the workers actually train with.
    learning_rate = args.lr

    # train model on `args.batch_size` batches on each remote worker.
    # after x training batches aggregate model
    # periodically evaluate each worker's model and the aggregate model
    for r in range(1, args.training_rounds + 1):
        logger.info("Starting training round %s/%s", r, args.training_rounds)

        # for each training round, distribute model across workers
        # wait until models across all remote server workers are trained and respond
        results = await asyncio.gather(*[
            fit_model_on_worker(worker=worker,
                                traced_model=traced_model,
                                batch_size=args.batch_size,
                                curr_round=r,
                                max_nr_batches=args.federate_after_n_batches,
                                lr=learning_rate) for worker in client_workers
        ])
        models = {}
        loss_values = {}

        # run a test iteration on rounds 1, 11, 21, ... and on the last round
        test_models = r % 10 == 1 or r == args.training_rounds
        if test_models:  # test each remote server worker's model
            np.set_printoptions(formatter={"float": "{: .0f}".format})
            for worker_id, worker_model, _ in results:
                evaluate_model_on_worker(
                    model_identifier=worker_id,
                    worker=testing,
                    dataset_key="credit_testing",
                    model=worker_model,
                    nr_bins=10,
                    batch_size=128,
                    print_target_hist=False,
                )

        # collect models and losses from the workers that returned a model
        for worker_id, worker_model, worker_loss in results:
            if worker_model is not None:
                models[worker_id] = worker_model
                loss_values[worker_id] = worker_loss

        # aggregate the model
        traced_model = utils.federated_avg(models)
        if test_models:  # test aggregated model
            evaluate_model_on_worker(model_identifier="federated model",
                                     worker=testing,
                                     dataset_key="credit_testing",
                                     model=traced_model,
                                     nr_bins=10,
                                     batch_size=128)

        # decay learning rate
        learning_rate = max(0.98 * learning_rate, args.lr * 0.01)

    if args.save_model:
        # Save the federated result.  ``traced_model`` is rebound to the
        # averaged model every round, so ``model`` no longer refers to the
        # trained weights at this point.
        torch.save(traced_model.state_dict(), "credit_rating.pt")
Ejemplo n.º 9
0
def main():
    """Run sequential federated training and per-worker testing.

    Builds train/test label distributions, creates per-worker data loaders
    and batches, then for ``args.rounds`` rounds trains ``net`` on every
    worker in turn, replaces ``net`` with ``federated_avg`` of the worker
    results and tests the averaged model on every worker.  Accumulated
    accuracies and losses are handed to the ``LogSaver`` after each round;
    at the end plots are produced and the final model is saved through it.
    """
    args = args_parser()
    logs = LogSaver(args)

    acc_test = []
    loss_test = []
    acc_train = []
    loss_train = []

    # TODO: replace this plain label enumeration.
    label_ids = list(range(args.num_classes))

    distrib = Distribute(args.num_workers, len(label_ids))

    train_data_distribution = copy.deepcopy(
        distrib.get_distribution(args.dstr_Train,
                                 args.n_labels_per_agent_Train,
                                 args.sub_labels_Train))
    test_data_distribution = copy.deepcopy(
        distrib.get_distribution(args.dstr_Test,
                                 args.n_labels_per_agent_Test,
                                 args.sub_labels_Test))

    print(train_data_distribution, "\n\n TEST DISTRIBUTION",
          test_data_distribution)

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(device)

    fed_trainloaders, fed_testloaders, workers = get_dataloaders(
        train_data_distribution, test_data_distribution, args.dataset,
        args.train_bs, args.test_bs, args.num_workers)
    print("TRAINLOADERs ARE CREATED")

    batches = extract_batches_per_worker(fed_trainloaders)
    batches_test = extract_batches_per_worker(fed_testloaders)

    net = Net(10)
    net.to(device)
    loss_func = nn.CrossEntropyLoss()

    for rnd in range(args.rounds):
        local_results = {}
        batch_counts = []  # per-worker batch counts; not used further here

        # All worker updates are computed one after another for now.
        for worker in workers:
            worker_batches = batches[worker]

            # The batch size is passed along for the accuracy computation.
            trained, loss, acc = train(worker,
                                       net,
                                       worker_batches,
                                       loss_func,
                                       args.local_ep,
                                       args.train_bs,
                                       device=device)
            # TODO: store trained.state_dict() instead of the model itself.
            local_results[worker] = trained
            batch_counts.append(len(worker_batches))
            loss_train.append(copy.deepcopy(loss))
            acc_train.append(copy.deepcopy(acc))

        net = federated_avg(local_results)

        # Test the freshly averaged model on every worker's test batches.
        for worker in workers:
            loss, acc = test(worker,
                             net,
                             batches_test[worker],
                             loss_func,
                             args.test_bs,
                             device=device)

            print(worker.id, "loss", loss, "acc", acc)
            acc_test.append(copy.deepcopy(acc))
            loss_test.append(copy.deepcopy(loss))

        print("Round", rnd)
        logs.add_row(acc_train, loss_train, acc_test, loss_test)

    print("End of training")

    print(acc_train, "\n\n", type(acc_train))

    logs.plot(loss_train, loss_test, np.array(acc_train), np.array(acc_test))
    print("Plots are created\n", acc_train, "\n\n", loss_train)
    logs.save_model(net)
Ejemplo n.º 10
0
async def sendmodel(nodes=10):
    """Run federated training with worker selection and network-cost logging.

    Connects to ``nodes`` via ``connect_to_nodes``, traces a fresh ``Net``
    with one batch from the module-level ``test_loader`` and measures every
    worker with ``get_performance``.  Each of up to ``max_federated_rounds``
    rounds selects workers with ``choose_worker``, fits the current model on
    them concurrently while a ``monitoring`` instance records network cost,
    averages the returned models, evaluates the average with ``test`` and
    appends one row to ``federated_results.csv``.  Training stops early once
    the accuracy exceeds ``target_accuracy``.

    The monitor of a round is stopped and the worker connections are closed
    even when an error interrupts training.

    Args:
        nodes: Forwarded unchanged to ``connect_to_nodes``.

    Side effects:
        Rebinds the module-level ``traced_model`` to the latest model and
        appends a header row plus one row per round to
        ``federated_results.csv``.
    """
    global traced_model

    def append_csv_row(row):
        # Open/append/close per row so results survive an aborted run.
        with open('federated_results.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow(row)

    totalnetworkcost = 0

    workers = connect_to_nodes(nodes)
    try:
        # Builtin iterator protocol instead of the legacy ``.next()`` alias.
        mock_data, _ = next(iter(test_loader))
        model = Net()
        traced_model = torch.jit.trace(model, mock_data)

        print('Performance measurments')
        performance = await asyncio.gather(
            *[get_performance(worker) for worker in workers])

        # Keys are "h2", "h3", ... in the order the workers were measured.
        host_names = ["h" + str(i + 2) for i in range(len(performance))]
        cost_dict = {
            name: perf[0] for name, perf in zip(host_names, performance)
        }
        utility_dict = {
            name: perf[1] for name, perf in zip(host_names, performance)
        }
        training_count_dict = {name: 0 for name in host_names}

        # Write the CSV header first.
        append_csv_row([
            "Round", "Accuracy", "Loss", "Workers called", "Total networkcost"
        ])

        for current_round in range(max_federated_rounds):
            centralizedmonitor = monitoring()
            centralizedmonitor.start()
            try:
                print("Starting round" + str(current_round))
                chosen_workers = choose_worker(cost_dict, utility_dict,
                                               training_count_dict)
                chosen_worker_string = ""
                for w in chosen_workers:
                    training_count_dict[w] += 1
                    chosen_worker_string = chosen_worker_string + w + " "
                results = await asyncio.gather(*[
                    fit_model_on_worker(
                        worker=worker,
                        traced_model=traced_model,
                        batch_size=batchsize,
                        curr_round=current_round,
                        lr=lr,
                        no_federated_epochs=no_federated_epochs)
                    for worker in workers if worker.id in chosen_workers
                ])
            finally:
                # Do not leave the monitor running if the round failed.
                centralizedmonitor.stop()

            # The raw cost is divided by 1e6 before being accumulated.
            costofround = centralizedmonitor.getnetworkcost() / 1000000
            totalnetworkcost += costofround

            models = {
                worker_id: worker_model
                for worker_id, worker_model, _ in results
                if worker_model is not None
            }

            traced_model = utils.federated_avg(models)
            print("Evaluating averaged model")
            accuracy, loss = test(traced_model)

            append_csv_row([
                str(current_round + 1),
                '{:.2f}'.format(accuracy),
                str(loss),
                chosen_worker_string,
                totalnetworkcost,
            ])

            if accuracy > target_accuracy:
                print("Target accuracy has been reached. Terminating training.")
                break

        print("Finished Federated training - closing connections")
    finally:
        # Always release the connections, even if training failed part-way.
        for worker in workers:
            worker.close()
Ejemplo n.º 11
0
async def main():
    """Train and validate a federated model described by a config file.

    Reads the configuration file named on the command line, creates one
    ``CustomWebsocketClientWorker`` per ``WORKER*`` section, and for each
    configured round fits the traced model on all workers flagged as
    federation participants, averages the returned models with
    ``utils.federated_avg`` and evaluates the average on every worker's
    ``'train'`` and ``'test'`` datasets.  Collected losses and confusion
    matrices are passed to ``show_results`` per worker at the end.
    """
    hook = sy.TorchHook(torch)

    parser = argparse.ArgumentParser(description='Train and validate a Federated model')
    parser.add_argument('config', type=str, help='Configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config)

    # Training settings from the [TRAIN] section.
    config_rounds = config.getint('TRAIN', 'rounds')
    config_epochs = config.getint('TRAIN', 'epochs')
    config_batch = config.getint('TRAIN', 'batch')
    config_optimizer = config.get('TRAIN', 'optimizer')
    config_lr = config.getfloat('TRAIN', 'lr')
    config_shuffle = config.getboolean('TRAIN', 'shuffle')

    clients = {}
    clients_results = {}

    # One websocket client per [WORKER...] section.
    for section in config.sections():
        if not section.startswith('WORKER'):
            continue
        worker_id = config.get(section, 'id')
        worker_host = config.get(section, 'host')
        worker_port = config.getint(section, 'port')
        worker_verbose = config.getboolean(section, 'verbose')
        participates = config.getboolean(section, 'federation_participant')

        client = CustomWebsocketClientWorker(hook=hook,
                                             id=worker_id,
                                             host=worker_host,
                                             port=worker_port,
                                             verbose=worker_verbose)
        client.federation_participant = participates
        client.clear_objects_remote()
        clients[worker_id] = client
        clients_results[worker_id] = []

    model = Classifier()
    traced_model = trace(model, torch.zeros([1, 10], dtype=torch.float))

    for curr_round in range(config_rounds):

        print('Round %s/%s ¡Ding Ding!:' % (curr_round + 1, config_rounds))

        participants = [
            worker for worker in clients.values()
            if worker.federation_participant
        ]
        # NOTE(review): training runs on dataset_key 'test' although a
        # 'train' dataset is evaluated below -- confirm this is intended.
        fit_jobs = [
            fit_model_on_worker(
                worker=worker,
                traced_model=traced_model,
                optimizer=config_optimizer,
                batch_size=config_batch,
                epochs=config_epochs,
                lr=config_lr,
                dataset_key='test',
                shuffle=config_shuffle
            )
            for worker in participants
        ]
        results = await asyncio.gather(*fit_jobs)

        print('Training done!')

        print('Federating model ... ', end='')
        models = {
            worker_id: worker_model
            for worker_id, worker_model in results
            if worker_model is not None
        }
        traced_model = utils.federated_avg(models)
        print('Done!')

        # Evaluate the averaged model on every worker, participant or not.
        for client_id, worker in clients.items():
            train_loss, train_confusion_matrix = evaluate_model_on_worker(
                worker=worker,
                dataset_key='train',
                model=traced_model,
                batch_size=config_batch,
            )
            test_loss, test_confusion_matrix = evaluate_model_on_worker(
                worker=worker,
                dataset_key='test',
                model=traced_model,
                batch_size=config_batch,
            )

            clients_results[client_id].append((train_loss, test_loss, test_confusion_matrix))
            print('"%s" => Train loss: %.4f. Test loss: %.4f' % (client_id, train_loss, test_loss))

    print('Confusion matrices:')

    for client_id, history in clients_results.items():
        print('Model "%s" stats:' % client_id)
        train_losses = [entry[0] for entry in history]
        test_losses = [entry[1] for entry in history]
        conf_matrices = [entry[2] for entry in history]
        show_results(conf_matrices, train_losses, test_losses, label=client_id, loss_xlabel='Round')