Example #1
0
def run_fedavg_round(aggregator: 'BaseAggregatorParticipant',
                     participants: List['BaseTrainingParticipant'],
                     training_args: TrainArgs,
                     client_fraction=1.0):
    """
    Routine to run a training round with a sampled subset of the clients, starting from the server model,
    and then aggregate the results into the server model.

    The aggregated model state is written back to *all* participants, not only to the sampled ones.

    :param aggregator: aggregator participant that will aggregate the resulting training models
    :param participants: training participants available in this round
    :param training_args: training arguments for this round
    :param client_fraction: client fraction to train with, forwarded to sample_randomly_by_fraction
    :return: None
    """
    logger.debug('distribute the initial model to the clients.')
    initial_model_state = aggregator.model.state_dict()

    participant_fraction = sample_randomly_by_fraction(participants,
                                                       client_fraction)
    logger.debug(
        f'starting training round with {len(participant_fraction)}/{len(participants)}.'
    )
    # success_threshold=-1 switches off both the early stop and the "not enough clients" error in
    # run_fedavg_train_round: every sampled client is trained and failed clients are simply left out.
    trained_participants = run_fedavg_train_round(initial_model_state,
                                                  participant_fraction,
                                                  training_args,
                                                  success_threshold=-1)

    logger.debug('starting aggregation.')
    # only clients whose training succeeded take part in the aggregation
    num_train_samples = [p.num_train_samples for p in trained_participants]
    aggregator.aggregate(trained_participants,
                         num_train_samples=num_train_samples)

    logger.debug('distribute the aggregated global model to clients')
    resulting_model_state = aggregator.model.state_dict()
    overwrite_participants_models(resulting_model_state, participants)
Example #2
0
def reptile_train_step(aggregator: ReptileServer,
                       participants: List[ReptileClient],
                       inner_training_args: TrainArgs,
                       meta_training_args: TrainArgs = None,
                       evaluation_mode: bool = False,
                       *args, **kwargs):
    """
    Routine to run a single Reptile step: push a copy of the aggregator model to the participants, train
    them, and (outside of evaluation mode) aggregate their models into the aggregator.

    :param aggregator: aggregator participant that will aggregate the resulting training models
    :param participants: training participants in this round
    :param inner_training_args: training arguments for participant models
    :param meta_training_args: training arguments for meta model; its ``kwargs`` must contain
        ``'meta_learning_rate'``. Required unless evaluation_mode is set
    :param evaluation_mode: is evaluation step; if set, the aggregation is skipped
    :param args: accepted but not used
    :param kwargs: accepted but not used
    :return: None
    """
    logger.debug('distribute the initial model to the clients.')
    # deep copy so the participants receive a snapshot rather than the aggregator's own state dict
    server_state_snapshot = copy.deepcopy(aggregator.model.state_dict())
    overwrite_participants_models(server_state_snapshot, participants)

    logger.debug('starting training round.')
    run_train_round(participants, inner_training_args)

    if evaluation_mode:
        # evaluation steps only adapt the participants; the aggregator is not updated
        return

    assert meta_training_args is not None, ('Argument meta_training_args '
        'must not be None when not in evaluation_mode')
    meta_lr = meta_training_args.kwargs['meta_learning_rate']
    logger.debug(
        f"Starting aggregation: num_participants={len(participants)}, "
        f"meta_learning_rate={meta_lr}"
    )
    aggregator.aggregate(participants=participants,
                         meta_learning_rate=meta_lr)
Example #3
0
def run_fedavg_train_round(
        initial_model_state: Dict[str, Tensor],
        participants: List['BaseTrainingParticipant'],
        training_args: TrainArgs,
        success_threshold=-1) -> List['BaseTrainingParticipant']:
    """
    Routine to run a single round of training on the clients and return the clients that trained
    successfully. A client whose training raises is logged and skipped; it does not abort the round.

    :param initial_model_state: model state to communicate before training
    :param participants: participants to train in this round
    :param training_args: arguments passed for training
    :param success_threshold: threshold for how many clients should at least participate in the round.
        Training stops as soon as this many clients succeeded. -1 disables the threshold, i.e. all
        participants are trained and no minimum is enforced
    :return: the participants whose training finished without an exception, in training order
    :raises ExecutionError: if a threshold is set and fewer clients than that succeeded
    """
    overwrite_participants_models(initial_model_state, participants)
    threshold_enabled = success_threshold != -1
    successful_participants = []
    for participant in participants:
        try:
            logger.debug(
                f'invoking training on participant {participant._name}')
            participant.train(training_args)
        except GradientExplodingError as gradient_exception:
            # Hand the exception over via exc_info: passing it positionally would make it a %-format
            # argument of a message without placeholders, which breaks the log record.
            # NOTE(review): assumes `logger` is a stdlib logging.Logger - confirm.
            logger.error(
                f'participant {participant._name} failed due to exploding gradients',
                exc_info=gradient_exception)
        except Exception as e:
            # deliberately broad: one failing client must not take down the whole round
            logger.error(f'training on participant {participant._name} failed',
                         exc_info=e)
        else:
            successful_participants.append(participant)
            if threshold_enabled and success_threshold <= len(
                    successful_participants):
                # enough clients trained, the remaining ones are not needed
                break

    if threshold_enabled and len(
            successful_participants) < success_threshold:
        raise ExecutionError(
            'Failed to execute training round, not enough clients participated successfully'
        )
    return successful_participants
def run_hierarchical_clustering(local_evaluation_steps,
                                seed,
                                lr,
                                name,
                                total_fedavg_rounds,
                                cluster_initialization_rounds,
                                client_fraction,
                                local_epochs,
                                batch_size,
                                num_clients,
                                sample_threshold,
                                num_label_limit,
                                train_args,
                                dataset,
                                partitioner_class,
                                linkage_mech,
                                criterion,
                                dis_metric,
                                max_value_criterion,
                                reallocate_clients,
                                threshold_min_client_cluster,
                                use_colored_images,
                                use_pattern,
                                train_cluster_args=None,
                                mean=None,
                                std=None):
    """
    Run the hierarchical clustering experiment for every combination of client fraction, learning rate,
    number of initialization rounds and clustering threshold.

    For each configuration a stored FedAvg model state is loaded (presumably written by an earlier FedAvg
    run with the same context - verify against the caller), a fresh server and fresh clients are
    initialized from it and run_fedavg_hierarchical is invoked with the logging callbacks set up here.

    :param local_evaluation_steps: forwarded to log_personalized_global_cluster_performance
    :param seed: seed passed to fix_random_seeds
    :param lr: learning rate or iterable of learning rates to sweep over
    :param name: experiment name used for the context and the tensorboard loggers
    :param total_fedavg_rounds: total number of rounds; the rounds after clustering are
        total_fedavg_rounds minus the initialization rounds
    :param cluster_initialization_rounds: initialization round counts, combined with max_value_criterion
        through generate_configuration
    :param client_fraction: client fraction or iterable of client fractions to sweep over
    :param local_epochs: local epochs stored in the experiment context
    :param batch_size: batch size used for loading the data and stored in the experiment context
    :param num_clients: number of partitions/clients of the federated dataset
    :param sample_threshold: only used when loading the extra dataset for DatadependentPartitioner
    :param num_label_limit: not used by this function
    :param train_args: training arguments for the federated rounds
    :param dataset: dataset name, only 'ham10k' is supported
    :param partitioner_class: partitioner class handed to ClusterArgs
    :param linkage_mech: forwarded to ClusterArgs
    :param criterion: forwarded to ClusterArgs
    :param dis_metric: forwarded to ClusterArgs
    :param max_value_criterion: clustering threshold or iterable of thresholds to sweep over
    :param reallocate_clients: forwarded to ClusterArgs
    :param threshold_min_client_cluster: forwarded to ClusterArgs
    :param use_colored_images: not used by this function
    :param use_pattern: not used by this function
    :param train_cluster_args: training arguments for the local training before clustering
    :param mean: forwarded to load_ham10k_federated
    :param std: forwarded to load_ham10k_federated
    :return: None
    :raises ValueError: if dataset is not 'ham10k'
    """
    fix_random_seeds(seed)
    global_tag = 'global_performance'
    global_tag_local = 'global_performance_personalized'
    initialize_clients_fn = DEFAULT_CLIENT_INIT_FN
    if dataset == 'ham10k':
        fed_dataset = load_ham10k_federated(partitions=num_clients,
                                            batch_size=batch_size,
                                            mean=mean,
                                            std=std)
        initialize_clients_fn = initialize_ham10k_clients
    else:
        raise ValueError(f'dataset "{dataset}" unknown')

    # every swept setting may be given as a single value or as an iterable of values
    if not hasattr(max_value_criterion, '__iter__'):
        max_value_criterion = [max_value_criterion]
    if not hasattr(lr, '__iter__'):
        lr = [lr]
    if not hasattr(client_fraction, '__iter__'):
        client_fraction = [client_fraction]

    for cf in client_fraction:
        for lr_i in lr:
            optimizer_args = OptimizerArgs(optim.SGD, lr=lr_i)
            model_args = ModelArgs(MobileNetV2Lightning,
                                   optimizer_args=optimizer_args,
                                   num_classes=7)
            fedavg_context = FedAvgExperimentContext(
                name=name,
                client_fraction=cf,
                local_epochs=local_epochs,
                lr=lr_i,
                batch_size=batch_size,
                optimizer_args=optimizer_args,
                model_args=model_args,
                train_args=train_args,
                dataset_name=dataset)
            experiment_specification = f'{fedavg_context}'
            experiment_logger = create_tensorboard_logger(
                fedavg_context.name, experiment_specification)
            fedavg_context.experiment_logger = experiment_logger
            for init_rounds, max_value in generate_configuration(
                    cluster_initialization_rounds, max_value_criterion):
                # load the model state
                round_model_state = load_fedavg_state(fedavg_context,
                                                      init_rounds)

                server = FedAvgServer('initial_server',
                                      fedavg_context.model_args,
                                      fedavg_context)
                server.overwrite_model_state(round_model_state)
                logger.info('initializing clients ...')
                clients = initialize_clients_fn(fedavg_context, fed_dataset,
                                                server.model.state_dict())

                overwrite_participants_models(round_model_state, clients)
                # initialize the cluster configuration
                round_configuration = {
                    'num_rounds_init': init_rounds,
                    'num_rounds_cluster': total_fedavg_rounds - init_rounds
                }
                # arguments only some partitioners need; stays empty for all others
                partitioner_kwargs = {}
                if partitioner_class == DatadependentPartitioner:
                    # this partitioner additionally receives a dataloader built from the colored
                    # FEMNIST data, with the client keys of the federated dataset passed as tabu
                    clustering_dataset = load_femnist_colored_dataset(
                        str((REPO_ROOT / 'data').absolute()),
                        num_clients=num_clients,
                        batch_size=batch_size,
                        sample_threshold=sample_threshold)
                    partitioner_kwargs['dataloader'] = load_n_of_each_class(
                        clustering_dataset,
                        n=5,
                        tabu=list(fed_dataset.train_data_local_dict.keys()))
                cluster_args = ClusterArgs(
                    partitioner_class,
                    linkage_mech=linkage_mech,
                    criterion=criterion,
                    dis_metric=dis_metric,
                    max_value_criterion=max_value,
                    plot_dendrogram=False,
                    reallocate_clients=reallocate_clients,
                    threshold_min_client_cluster=threshold_min_client_cluster,
                    **partitioner_kwargs,
                    **round_configuration)
                # create new logger for cluster experiment
                experiment_specification = f'{fedavg_context}_{cluster_args}'
                experiment_logger = create_tensorboard_logger(
                    fedavg_context.name, experiment_specification)
                fedavg_context.experiment_logger = experiment_logger

                initial_train_fn = partial(run_fedavg_train_round,
                                           round_model_state,
                                           training_args=train_cluster_args)
                create_aggregator_fn = partial(FedAvgServer,
                                               model_args=model_args,
                                               context=fedavg_context)
                federated_round_fn = partial(run_fedavg_round,
                                             training_args=train_args,
                                             client_fraction=cf)

                # logging callbacks handed to run_fedavg_hierarchical
                after_post_clustering_evaluation = [
                    partial(log_after_round_evaluation, experiment_logger,
                            'post_clustering')
                ]
                after_clustering_round_evaluation = [
                    partial(log_after_round_evaluation, experiment_logger)
                ]
                after_federated_round_evaluation = [
                    partial(log_after_round_evaluation, experiment_logger,
                            ['final hierarchical', global_tag])
                ]
                after_clustering_fn = [
                    partial(log_cluster_distribution,
                            experiment_logger,
                            num_classes=fed_dataset.class_num),
                    partial(log_sample_images_from_each_client,
                            experiment_logger)
                ]
                after_federated_round_fn = [
                    partial(
                        log_personalized_global_cluster_performance,
                        experiment_logger,
                        ['final hierarchical personalized', global_tag_local],
                        local_evaluation_steps)
                ]
                run_fedavg_hierarchical(
                    server,
                    clients,
                    cluster_args,
                    initial_train_fn,
                    federated_round_fn,
                    create_aggregator_fn,
                    after_post_clustering_evaluation,
                    after_clustering_round_evaluation,
                    after_federated_round_evaluation,
                    after_clustering_fn,
                    after_federated_round=after_federated_round_fn)
def clustering_test(mean, std, seed, lr, local_epochs, client_fraction,
                    optimizer_args, total_fedavg_rounds, batch_size,
                    num_clients, model_args, train_args, train_cluster_args,
                    initialization_rounds, partitioner_class, linkage_mech,
                    criterion, dis_metric, max_value_criterion):
    """
    Run FedAvg on the ham10k data and log the cluster distribution that the partitioner produces for every
    combination of initialization round count and clustering threshold.

    For each entry of initialization_rounds the stored FedAvg state of that round is loaded, the clients
    are trained once with train_cluster_args, and the clients are then clustered once per entry of
    max_value_criterion. Nothing is returned; the results are only written to tensorboard loggers.
    """
    fix_random_seeds(seed)

    federated_data = load_ham10k_federated(partitions=num_clients,
                                           batch_size=batch_size,
                                           mean=mean,
                                           std=std)

    context = FedAvgExperimentContext(name='ham10k_clustering',
                                      client_fraction=client_fraction,
                                      local_epochs=local_epochs,
                                      lr=lr,
                                      batch_size=batch_size,
                                      optimizer_args=optimizer_args,
                                      model_args=model_args,
                                      train_args=train_args,
                                      dataset_name='ham10k')
    base_specification = f'{context}'
    tb_logger = create_tensorboard_logger(context.name, base_specification)
    log_dataset_distribution(tb_logger, 'full dataset', federated_data)

    server, clients = run_fedavg(
        context=context,
        num_rounds=total_fedavg_rounds,
        dataset=federated_data,
        save_states=True,
        restore_state=True,
        evaluate_rounds=False,
        initialize_clients_fn=initialize_ham10k_clients)

    for init_rounds in initialization_rounds:
        # restore the stored model state of this round and train the clients once on top of it
        round_model_state = load_fedavg_state(context, init_rounds)
        overwrite_participants_models(round_model_state, clients)
        run_fedavg_train_round(round_model_state,
                               training_args=train_cluster_args,
                               participants=clients)

        # the split into initialization and cluster rounds does not depend on the threshold
        rounds_split = {
            'num_rounds_init': init_rounds,
            'num_rounds_cluster': total_fedavg_rounds - init_rounds
        }
        for max_value in max_value_criterion:
            cluster_args = ClusterArgs(partitioner_class,
                                       linkage_mech=linkage_mech,
                                       criterion=criterion,
                                       dis_metric=dis_metric,
                                       max_value_criterion=max_value,
                                       plot_dendrogram=False,
                                       reallocate_clients=False,
                                       threshold_min_client_cluster=-1,
                                       **rounds_split)
            # one tensorboard logger per cluster configuration
            tb_logger = create_tensorboard_logger(
                context.name, f'{base_specification}{cluster_args}')
            partitioner = cluster_args()
            clusters = partitioner.cluster(clients, server)
            log_cluster_distribution(tb_logger, clusters, 7)
def run_hierarchical_clustering_reptile(
        seed,
        name,
        dataset,
        num_clients,
        batch_size,
        num_label_limit,
        use_colored_images,
        sample_threshold,
        hc_lr,
        hc_cluster_initialization_rounds,
        hc_client_fraction,
        hc_local_epochs,
        hc_train_args,
        hc_partitioner_class,
        hc_linkage_mech,
        hc_criterion,
        hc_dis_metric,
        hc_max_value_criterion,  # distance threshold
        hc_reallocate_clients,  #
        hc_threshold_min_client_cluster,  # only with hc_reallocate_clients = True,
        # results in clusters having at least this number of clients
        hc_train_cluster_args,
        rp_sgd,  # True -> Use SGD as inner optimizer; False -> Use Adam
        rp_adam_betas,  # Used only if sgd = False
        rp_meta_batch_size,
        rp_num_meta_steps,
        rp_meta_learning_rate_initial,
        rp_meta_learning_rate_final,
        rp_eval_interval,
        rp_inner_learning_rate,
        rp_num_inner_steps,
        rp_num_inner_steps_eval):
    """
    Run FedAvg, cluster the clients hierarchically and then train one Reptile meta model per cluster.

    For every combination of client fraction and learning rate FedAvg is run for
    ``max(hc_cluster_initialization_rounds)`` rounds. For every combination of initialization round count
    and clustering threshold the clients are then reset to the stored FedAvg state, trained locally and
    clustered. Each cluster gets a ReptileServer initialized from the FedAvg aggregate of its clients,
    which is trained for ``rp_num_meta_steps`` meta steps with periodic and final evaluation.

    Parameters prefixed ``hc_`` configure FedAvg and the clustering, parameters prefixed ``rp_``
    configure the Reptile training.

    :param seed: seed passed to fix_random_seeds and stored in the Reptile context
    :param name: experiment name used for the contexts and the tensorboard loggers
    :param dataset: dataset name, only 'femnist' is supported
    :param num_clients: number of clients to load
    :param batch_size: batch size for data loading; also used as Reptile inner batch size
    :param num_label_limit: if not -1, the dataset is passed through scratch_labels with this limit
    :param use_colored_images: load the colored FEMNIST variant (3 input channels instead of 1)
    :param sample_threshold: forwarded to the dataset loader
    :param hc_lr: learning rate or iterable of learning rates to sweep over
    :param hc_cluster_initialization_rounds: initialization round counts to sweep over
    :param hc_client_fraction: client fraction or iterable of client fractions to sweep over
    :param hc_local_epochs: local epochs stored in the FedAvg context
    :param hc_train_args: training arguments stored in the FedAvg context
    :param hc_partitioner_class: partitioner class handed to ClusterArgs
    :param hc_linkage_mech: forwarded to ClusterArgs
    :param hc_criterion: forwarded to ClusterArgs
    :param hc_dis_metric: forwarded to ClusterArgs
    :param hc_max_value_criterion: distance threshold or iterable of thresholds to sweep over
    :param hc_reallocate_clients: forwarded to ClusterArgs
    :param hc_threshold_min_client_cluster: forwarded to ClusterArgs
    :param hc_train_cluster_args: training arguments for the local training before clustering
    :param rp_sgd: forwarded to the Reptile context as ``sgd``
    :param rp_adam_betas: forwarded to the Reptile context as ``adam_betas``
    :param rp_meta_batch_size: number of clients per cluster and meta step; -1 uses all clients
    :param rp_num_meta_steps: number of Reptile meta steps
    :param rp_meta_learning_rate_initial: forwarded to the Reptile context
    :param rp_meta_learning_rate_final: forwarded to the Reptile context
    :param rp_eval_interval: evaluate every this many meta steps
    :param rp_inner_learning_rate: forwarded to the Reptile context
    :param rp_num_inner_steps: forwarded to the Reptile context
    :param rp_num_inner_steps_eval: forwarded to the Reptile context
    :return: None
    :raises ValueError: if dataset is not 'femnist' or if a client fails in the local training round
        before clustering
    """
    fix_random_seeds(seed)
    global_tag = 'global_performance'

    if dataset == 'femnist':
        if use_colored_images:
            fed_dataset = load_femnist_colored_dataset(
                data_dir=str((REPO_ROOT / 'data').absolute()),
                num_clients=num_clients,
                batch_size=batch_size,
                sample_threshold=sample_threshold)
        else:
            fed_dataset = load_femnist_dataset(
                data_dir=str((REPO_ROOT / 'data').absolute()),
                num_clients=num_clients,
                batch_size=batch_size,
                sample_threshold=sample_threshold)
        if num_label_limit != -1:
            fed_dataset = scratch_labels(fed_dataset, num_label_limit)
    else:
        raise ValueError(f'dataset "{dataset}" unknown')

    # every swept setting may be given as a single value or as an iterable of values
    if not hasattr(hc_max_value_criterion, '__iter__'):
        hc_max_value_criterion = [hc_max_value_criterion]
    if not hasattr(hc_lr, '__iter__'):
        hc_lr = [hc_lr]
    if not hasattr(hc_client_fraction, '__iter__'):
        hc_client_fraction = [hc_client_fraction]
    input_channels = 3 if use_colored_images else 1
    # the dataset does not change between configurations, so its distribution is logged only once
    data_distribution_logged = False
    for cf in hc_client_fraction:
        for lr_i in hc_lr:
            # Initialize experiment context parameters
            fedavg_optimizer_args = OptimizerArgs(optim.SGD, lr=lr_i)
            fedavg_model_args = ModelArgs(CNNLightning,
                                          optimizer_args=fedavg_optimizer_args,
                                          input_channels=input_channels,
                                          only_digits=False)
            fedavg_context = FedAvgExperimentContext(
                name=name,
                client_fraction=cf,
                local_epochs=hc_local_epochs,
                lr=lr_i,
                batch_size=batch_size,
                optimizer_args=fedavg_optimizer_args,
                model_args=fedavg_model_args,
                train_args=hc_train_args,
                dataset_name=dataset)
            reptile_context = ReptileExperimentContext(
                name=name,
                dataset_name=dataset,
                swap_labels=False,
                num_classes_per_client=0,
                num_shots_per_class=0,
                seed=seed,
                model_class=CNNLightning,
                sgd=rp_sgd,
                adam_betas=rp_adam_betas,
                num_clients_train=num_clients,
                num_clients_test=0,
                meta_batch_size=rp_meta_batch_size,
                num_meta_steps=rp_num_meta_steps,
                meta_learning_rate_initial=rp_meta_learning_rate_initial,
                meta_learning_rate_final=rp_meta_learning_rate_final,
                eval_interval=rp_eval_interval,
                num_eval_clients_training=-1,
                do_final_evaluation=True,
                num_eval_clients_final=-1,
                inner_batch_size=batch_size,
                inner_learning_rate=rp_inner_learning_rate,
                num_inner_steps=rp_num_inner_steps,
                num_inner_steps_eval=rp_num_inner_steps_eval)
            experiment_specification = f'{fedavg_context}'
            experiment_logger = create_tensorboard_logger(
                name, experiment_specification)
            if not data_distribution_logged:
                log_dataset_distribution(experiment_logger, 'full dataset',
                                         fed_dataset)
                data_distribution_logged = True

            log_after_round_evaluation_fns = [
                partial(log_after_round_evaluation, experiment_logger,
                        'fedavg'),
                partial(log_after_round_evaluation, experiment_logger,
                        global_tag)
            ]
            # run FedAvg up to the largest requested round count; the state of each smaller round
            # count is loaded again below via load_fedavg_state
            server, clients = run_fedavg(
                context=fedavg_context,
                num_rounds=max(hc_cluster_initialization_rounds),
                dataset=fed_dataset,
                save_states=True,
                restore_state=True,
                after_round_evaluation=log_after_round_evaluation_fns)

            for init_rounds, max_value in generate_configuration(
                    hc_cluster_initialization_rounds, hc_max_value_criterion):
                # load the model state
                round_model_state = load_fedavg_state(fedavg_context,
                                                      init_rounds)
                overwrite_participants_models(round_model_state, clients)
                # initialize the cluster configuration
                round_configuration = {
                    'num_rounds_init': init_rounds,
                    'num_rounds_cluster': 0
                }
                cluster_args = ClusterArgs(
                    hc_partitioner_class,
                    linkage_mech=hc_linkage_mech,
                    criterion=hc_criterion,
                    dis_metric=hc_dis_metric,
                    max_value_criterion=max_value,
                    plot_dendrogram=False,
                    reallocate_clients=hc_reallocate_clients,
                    threshold_min_client_cluster=hc_threshold_min_client_cluster,
                    **round_configuration)
                # create new logger for cluster experiment
                experiment_specification = f'{fedavg_context}_{cluster_args}_{reptile_context}'
                experiment_logger = create_tensorboard_logger(
                    name, experiment_specification)
                fedavg_context.experiment_logger = experiment_logger

                initial_train_fn = partial(run_fedavg_train_round,
                                           round_model_state,
                                           training_args=hc_train_cluster_args)
                create_aggregator_fn = partial(FedAvgServer,
                                               model_args=fedavg_model_args,
                                               context=fedavg_context)

                # HIERARCHICAL CLUSTERING
                logger.debug('starting local training before clustering.')
                trained_participants = initial_train_fn(clients)
                if len(trained_participants) != len(clients):
                    raise ValueError(
                        'not all clients successfully participated in the clustering round'
                    )

                # Clustering of participants by model updates
                partitioner = cluster_args()
                cluster_clients_dic = partitioner.cluster(clients, server)
                log_cluster_distribution(experiment_logger,
                                         cluster_clients_dic, 62)

                # Initialize cluster models
                cluster_server_dic = {}
                for cluster_id, participants in cluster_clients_dic.items():
                    # aggregate the cluster's clients with a temporary FedAvg server and use the
                    # result as the starting point of the cluster's Reptile server
                    intermediate_cluster_server = create_aggregator_fn(
                        f'cluster_server{cluster_id}')
                    intermediate_cluster_server.aggregate(participants)
                    cluster_server = ReptileServer(
                        participant_name=f'cluster_server{cluster_id}',
                        model_args=reptile_context.meta_model_args,
                        context=reptile_context,
                        initial_model_state=intermediate_cluster_server.model.
                        state_dict())
                    cluster_server_dic[cluster_id] = cluster_server

                # REPTILE TRAINING INSIDE CLUSTERS
                after_round_evaluation = [log_after_round_evaluation]

                # Perform training
                for i in range(reptile_context.num_meta_steps):
                    for cluster_id, participants in cluster_clients_dic.items(
                    ):

                        if reptile_context.meta_batch_size == -1:
                            meta_batch = participants
                        else:
                            # pick meta_batch_size clients per step, with the start index
                            # advancing by meta_batch_size every step
                            meta_batch = [
                                participants[k] for k in cyclerange(
                                    start=i * reptile_context.meta_batch_size %
                                    len(participants),
                                    interval=reptile_context.meta_batch_size,
                                    total_len=len(participants))
                            ]
                        # Meta training step
                        reptile_train_step(
                            aggregator=cluster_server_dic[cluster_id],
                            participants=meta_batch,
                            inner_training_args=reptile_context.
                            get_inner_training_args(),
                            meta_training_args=reptile_context.
                            get_meta_training_args(
                                frac_done=i / reptile_context.num_meta_steps))

                    # Evaluation on train and test clients
                    if i % reptile_context.eval_interval == 0:
                        # continue the step axis of the preceding FedAvg rounds
                        global_step = init_rounds + i
                        global_loss, global_acc = [], []

                        for cluster_id, participants in cluster_clients_dic.items(
                        ):
                            # Test on all clients inside clusters
                            reptile_train_step(
                                aggregator=cluster_server_dic[cluster_id],
                                participants=participants,
                                inner_training_args=reptile_context.
                                get_inner_training_args(eval=True),
                                evaluation_mode=True)
                            result = evaluate_local_models(
                                participants=participants)
                            loss = result.get('test/loss')
                            acc = result.get('test/acc')

                            # Log
                            if after_round_evaluation is not None:
                                for c in after_round_evaluation:
                                    c(experiment_logger,
                                      f'cluster_{cluster_id}', loss, acc,
                                      global_step)
                            # tolist() yields a plain number instead of a list for a
                            # zero-dimensional result, hence the isinstance checks
                            loss_list = loss.tolist()
                            acc_list = acc.tolist()
                            global_loss.extend(loss_list if isinstance(
                                loss_list, list) else [loss_list])
                            global_acc.extend(acc_list if isinstance(
                                acc_list, list) else [acc_list])

                        if after_round_evaluation is not None:
                            for c in after_round_evaluation:
                                c(experiment_logger, 'mean_over_all_clients',
                                  Tensor(global_loss), Tensor(global_acc),
                                  global_step)

                    logger.info(f'Finished Reptile training round {i}')

                # Final evaluation at end of training
                if reptile_context.do_final_evaluation:
                    global_loss, global_acc = [], []

                    for cluster_id, participants in cluster_clients_dic.items(
                    ):
                        # Final evaluation on train and test clients
                        # Test on all clients inside clusters
                        reptile_train_step(
                            aggregator=cluster_server_dic[cluster_id],
                            participants=participants,
                            inner_training_args=reptile_context.
                            get_inner_training_args(eval=True),
                            evaluation_mode=True)
                        result = evaluate_local_models(
                            participants=participants)
                        loss = result.get('test/loss')
                        acc = result.get('test/acc')
                        print(
                            f'Cluster {cluster_id} ({len(participants)} part.): loss = {loss}, acc = {acc}'
                        )

                        loss_list = loss.tolist()
                        acc_list = acc.tolist()
                        global_loss.extend(loss_list if isinstance(
                            loss_list, list) else [loss_list])
                        global_acc.extend(acc_list if isinstance(
                            acc_list, list) else [acc_list])

                        # Log
                        if after_round_evaluation is not None:
                            for c in after_round_evaluation:
                                c(experiment_logger, f'cluster_{cluster_id}',
                                  loss, acc, reptile_context.num_meta_steps)

                    # NOTE(review): logged at step 0 while the per-cluster values above use
                    # num_meta_steps - confirm this is intended
                    log_loss_and_acc('overall_mean', Tensor(global_loss),
                                     Tensor(global_acc), experiment_logger, 0)