Code Example #1
    def initialize_tensorkeys_for_functions(self, with_opt_vars=False):
        """
        Set the required tensors for all publicly accessible methods \
            that could be called as part of a task.

        By default, this is just all of the layers and optimizer of the model.
        Custom tensors should be added to this function

        Parameters
        ----------
        None

        Returns
        -------
        None
        """
        # TODO there should be a way to programmatically iterate through all
        #  of the methods in the class and declare the tensors.
        # For now this is done manually

        output_model_dict = self.get_tensor_dict(with_opt_vars=with_opt_vars)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)
        if not with_opt_vars:
            validation_global_model_dict = global_model_dict
            validation_local_model_dict = local_model_dict
        else:
            output_model_dict = self.get_tensor_dict(with_opt_vars=False)
            validation_global_model_dict, validation_local_model_dict =\
                split_tensor_dict_for_holdouts(
                    self.logger,
                    output_model_dict,
                    **self.tensor_dict_split_fn_kwargs
                )

        self.required_tensorkeys_for_function['train'] = [
            TensorKey(tensor_name, 'GLOBAL', 0, False, ('model', ))
            for tensor_name in global_model_dict
        ]
        self.required_tensorkeys_for_function['train'] += [
            TensorKey(tensor_name, 'LOCAL', 0, False, ('model', ))
            for tensor_name in local_model_dict
        ]

        # Validation may be performed on local or aggregated (global) model,
        # so there is an extra lookup dimension for kwargs
        self.required_tensorkeys_for_function['validate'] = {}
        # TODO This is not stateless. The optimizer will not be
        self.required_tensorkeys_for_function['validate']['apply=local'] = \
            [TensorKey(tensor_name, 'LOCAL', 0, False, ('trained',))
             for tensor_name in {
                 **validation_global_model_dict,
                 **validation_local_model_dict}]
        self.required_tensorkeys_for_function['validate']['apply=global'] = \
            [TensorKey(tensor_name, 'GLOBAL', 0, False, ('model',))
             for tensor_name in validation_global_model_dict]
        self.required_tensorkeys_for_function['validate']['apply=global'] += \
            [TensorKey(tensor_name, 'LOCAL', 0, False, ('model',))
             for tensor_name in validation_local_model_dict]
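The examples in this collection all follow the same split-then-key pattern: `split_tensor_dict_for_holdouts` takes a logger and a flat `{tensor_name: nparray}` dict and returns the part to share with the aggregator plus the holdout part that stays on the collaborator. A minimal sketch of that call with a toy tensor dict (the import path and the toy weights are assumptions, not taken from the examples above):

import logging

import numpy as np
# import path is an assumption; adjust to where your OpenFL version exposes it
from openfl.utilities.split import split_tensor_dict_for_holdouts

logger = logging.getLogger(__name__)

# toy model weights standing in for get_tensor_dict() output
tensor_dict = {
    'conv1.weight': np.zeros((3, 3), dtype=np.float32),
    'fc.bias': np.zeros(10, dtype=np.float32),
}

# global_dict is shared with the aggregator; local_dict stays on the collaborator
global_dict, local_dict = split_tensor_dict_for_holdouts(logger, tensor_dict)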
Code Example #2
File: experiment.py  Project: karol-brejna-i/openfl
 def _get_initial_tensor_dict(self, model_provider):
     """Extract initial weights from the model."""
     self.task_runner_stub = self.plan.get_core_task_runner(
         model_provider=model_provider)
     tensor_dict, _ = split_tensor_dict_for_holdouts(
         self.logger, self.task_runner_stub.get_tensor_dict(False),
         **self.task_runner_stub.tensor_dict_split_fn_kwargs)
     return tensor_dict
Code Example #3
    def train(self, col_name, round_num, input_tensor_dict, epochs, **kwargs):
        """
        Perform the training for a specified number of batches.

        Is expected to perform draws randomly, without replacement until data is exausted.
        Then data is replaced and shuffled and draws continue.

        Returns
        -------
        dict
            'TensorKey: nparray'
        """
        if 'metrics' not in kwargs:
            raise KeyError('metrics must be included in kwargs')
        # if 'batch_size' in kwargs:
        #     batch_size = kwargs['batch_size']
        # else:
        #     batch_size = self.data_loader.batch_size

        # rebuild model with updated weights
        self.rebuild_model(round_num, input_tensor_dict)

        history = self.model.fit(
            self.data_loader.X_train,
            self.data_loader.y_train,
            batch_size=self.data_loader.batch_size,
            epochs=epochs,
            verbose=0,
        )

        # TODO Currently assuming that all metrics are defined at
        #  initialization (build_model).
        #  If metrics are added (i.e. not a subset of what was originally
        #  defined) then the model must be recompiled.
        model_metrics_names = self.model.metrics_names
        param_metrics = kwargs['metrics']

        # TODO if there are new metrics in the flplan that were not included
        #  in the originally
        #  compiled model, that behavior is not currently handled.
        for param in param_metrics:
            if param not in model_metrics_names:
                error = 'KerasTaskRunner does not support specifying new' \
                        ' metrics. ' \
                        'Param_metrics = {}, model_metrics_names =' \
                        ' {}'.format(param_metrics, model_metrics_names)
                raise ValueError(error)

        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(metric, origin, round_num, True, ('metric', )):
            np.array(np.mean([history.history[metric]]))
            for metric in param_metrics
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters.
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled from the
        # aggregator
        # TODO this logic can break if different collaborators have different
        # roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator because
        # these are only created after training occurs. A work around could
        # involve doing a single epoch of training on random data to get the
        # optimizer names, and then throwing away the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # return global_tensor_dict, local_tensor_dict
        return global_tensor_dict, local_tensor_dict
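The `TensorKey` objects built above behave like named tuples whose fields match the keyword call seen later in these examples (`tensor_name`, `origin`, `round_number`, `report`, `tags`). A stand-in definition, useful only for following the dict-building logic and not the actual OpenFL class:

from collections import namedtuple

import numpy as np

# stand-in for openfl's TensorKey; field order mirrors the positional calls above
TensorKey = namedtuple(
    'TensorKey', ['tensor_name', 'origin', 'round_number', 'report', 'tags'])

# e.g. a trained weight reported back to the aggregator for round 3
key = TensorKey('conv1.weight', 'collaborator_1', 3, False, ('trained', ))
global_tensorkey_model_dict = {key: np.zeros((3, 3), dtype=np.float32)}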
Code Example #4
def run_challenge_experiment(aggregation_function,
                             choose_training_collaborators,
                             training_hyper_parameters_for_round,
                             institution_split_csv_filename,
                             brats_training_data_parent_dir,
                             db_store_rounds=5,
                             rounds_to_train=5,
                             device='cpu',
                             save_checkpoints=True,
                             restore_from_checkpoint_folder=None,
                             include_validation_with_hausdorff=True,
                             use_pretrained_model=True):

    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    # create gandlf_csv and get collaborator names
    gandlf_csv_path = os.path.join(work, 'gandlf_paths.csv')
    # split_csv_path = os.path.join(work, institution_split_csv_filename)
    collaborator_names = construct_fedsim_csv(brats_training_data_parent_dir,
                                              institution_split_csv_filename,
                                              0.8, gandlf_csv_path)

    aggregation_wrapper = CustomAggregationWrapper(aggregation_function)

    overrides = {
        'aggregator.settings.rounds_to_train': rounds_to_train,
        'aggregator.settings.db_store_rounds': db_store_rounds,
        'tasks.train.aggregation_type': aggregation_wrapper,
        'task_runner.settings.device': device,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)

    if not include_validation_with_hausdorff:
        plan.config['task_runner']['settings']['fets_config_dict'][
            'metrics'] = ['dice', 'dice_per_label']

    # Overwrite collaborator names
    plan.authorized_cols = collaborator_names
    # overwrite datapath values with the collaborator name itself
    for col in collaborator_names:
        plan.cols_data_paths[col] = col

    # get the data loaders for each collaborator
    collaborator_data_loaders = {
        col: copy(plan).get_data_loader(col)
        for col in collaborator_names
    }

    transformed_csv_dict = extract_csv_partitions(
        os.path.join(work, 'gandlf_paths.csv'))
    # get the task runner, passing the first data loader
    for col in collaborator_data_loaders:
        #Insert logic to serialize train / val CSVs here
        transformed_csv_dict[col]['train'].to_csv(
            os.path.join(work, 'seg_test_train.csv'))
        transformed_csv_dict[col]['val'].to_csv(
            os.path.join(work, 'seg_test_val.csv'))
        task_runner = copy(plan).get_task_runner(
            collaborator_data_loaders[col])

    if use_pretrained_model:
        print('Loading pretrained model...')
        # map_location is only needed when loading a GPU-trained checkpoint on CPU
        map_location = torch.device('cpu') if device == 'cpu' else None
        checkpoint = torch.load(
            f'{root}/pretrained_model/resunet_pretrained.pth',
            map_location=map_location)
        task_runner.model.load_state_dict(checkpoint['model_state_dict'])
        task_runner.optimizer.load_state_dict(
            checkpoint['optimizer_state_dict'])

    tensor_pipe = plan.get_tensor_pipe()

    # Initialize model weights
    init_state_path = plan.config['aggregator']['settings']['init_state_path']
    tensor_dict, _ = split_tensor_dict_for_holdouts(
        logger, task_runner.get_tensor_dict(False))

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    # get the aggregator, now that we have the initial weights file set up
    logger.info('Creating aggregator...')
    aggregator = plan.get_aggregator()
    # manually override the aggregator UUID (for checkpoint resume when rounds change)
    aggregator.uuid = 'aggregator'
    aggregator._load_initial_tensors()

    # create our collaborators
    logger.info('Creating collaborators...')
    collaborators = {
        col: copy(plan).get_collaborator(col,
                                         task_runner=task_runner,
                                         client=aggregator)
        for col in collaborator_names
    }

    collaborator_time_stats = gen_collaborator_time_stats(plan.authorized_cols)

    collaborators_chosen_each_round = {}
    collaborator_times_per_round = {}

    logger.info('Starting experiment')

    total_simulated_time = 0
    best_dice = -1.0
    best_dice_over_time_auc = 0

    # results dataframe data
    experiment_results = {
        'round': [],
        'time': [],
        'convergence_score': [],
        'round_dice': [],
        'dice_label_0': [],
        'dice_label_1': [],
        'dice_label_2': [],
        'dice_label_4': [],
    }
    if include_validation_with_hausdorff:
        experiment_results.update({
            'hausdorff95_label_0': [],
            'hausdorff95_label_1': [],
            'hausdorff95_label_2': [],
            'hausdorff95_label_4': [],
        })

    if restore_from_checkpoint_folder is None:
        checkpoint_folder = setup_checkpoint_folder()
        logger.info(f'\nCreated experiment folder {checkpoint_folder}...')
        starting_round_num = 0
    else:
        if not Path(f'checkpoint/{restore_from_checkpoint_folder}').exists():
            logger.warning(
                f'Could not find provided checkpoint folder: {restore_from_checkpoint_folder}. Exiting...'
            )
            exit(1)
        else:
            logger.info(
                f'Attempting to load last completed round from {restore_from_checkpoint_folder}'
            )
            state = load_checkpoint(restore_from_checkpoint_folder)
            checkpoint_folder = restore_from_checkpoint_folder

            [
                loaded_collaborator_names, starting_round_num,
                collaborator_time_stats, total_simulated_time, best_dice,
                best_dice_over_time_auc, collaborators_chosen_each_round,
                collaborator_times_per_round, experiment_results, summary,
                agg_tensor_db
            ] = state

            if loaded_collaborator_names != collaborator_names:
                logger.error(
                    f'Collaborator names found in checkpoint ({loaded_collaborator_names}) '
                    f'do not match provided collaborators ({collaborator_names})'
                )
                exit(1)

            logger.info(f'Previous summary for round {starting_round_num}')
            logger.info(summary)

            starting_round_num += 1
            aggregator.tensor_db.tensor_db = agg_tensor_db
            aggregator.round_number = starting_round_num

    for round_num in range(starting_round_num, rounds_to_train):
        # pick collaborators to train for the round
        training_collaborators = choose_training_collaborators(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        logger.info('Collaborators chosen to train for round {}:\n\t{}'.format(
            round_num, training_collaborators))

        # save the collaborators chosen this round
        collaborators_chosen_each_round[round_num] = training_collaborators

        # get the hyper-parameters from the competitor
        hparams = training_hyper_parameters_for_round(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        learning_rate, epochs_per_round, batches_per_round = hparams

        if (epochs_per_round is None) == (batches_per_round is None):
            logger.error(
                'Hyper-parameter function error: function must return "None" for either "epochs_per_round" or "batches_per_round" but not both.'
            )
            return

        hparam_message = "\n\tlearning rate: {}".format(learning_rate)

        # None gets mapped to -1 in the tensor_db
        if epochs_per_round is None:
            epochs_per_round = -1
            hparam_message += "\n\tbatches_per_round: {}".format(
                batches_per_round)
        elif batches_per_round is None:
            batches_per_round = -1
            hparam_message += "\n\tepochs_per_round: {}".format(
                epochs_per_round)

        logger.info("Hyper-parameters for round {}:{}".format(
            round_num, hparam_message))

        # cache each tensor in the aggregator tensor_db
        hparam_dict = {}
        tk = TensorKey(tensor_name='learning_rate',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(learning_rate)
        tk = TensorKey(tensor_name='epochs_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(epochs_per_round)
        tk = TensorKey(tensor_name='batches_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(batches_per_round)
        aggregator.tensor_db.cache_tensor(hparam_dict)

        # pre-compute the times for each collaborator
        times_per_collaborator = compute_times_per_collaborator(
            collaborator_names, training_collaborators, batches_per_round,
            epochs_per_round, collaborator_data_loaders,
            collaborator_time_stats, round_num)
        collaborator_times_per_round[round_num] = times_per_collaborator

        aggregator.assigner.set_training_collaborators(training_collaborators)

        # update the state in the aggregation wrapper
        aggregation_wrapper.set_state_data_for_round(
            collaborators_chosen_each_round, collaborator_times_per_round)

        # turn the times list into a list of tuples and sort it
        times_list = [(t, col) for col, t in times_per_collaborator.items()]
        times_list = sorted(times_list)

        # now call each collaborator in order of time
        # FIXME: this doesn't break up each task. We need this if we're doing straggler handling
        for t, col in times_list:
            # set the task_runner data loader
            task_runner.data_loader = collaborator_data_loaders[col]

            # run the collaborator
            collaborators[col].run_simulation()

            logger.info(
                "Collaborator {} took simulated time: {} minutes".format(
                    col, round(t / 60, 2)))

        # the round time is the max of the times_list
        round_time = max([t for t, _ in times_list])
        total_simulated_time += round_time

        # get the performance validation scores for the round
        round_dice = get_metric('valid_dice', round_num, aggregator.tensor_db)
        dice_label_0 = get_metric('valid_dice_per_label_0', round_num,
                                  aggregator.tensor_db)
        dice_label_1 = get_metric('valid_dice_per_label_1', round_num,
                                  aggregator.tensor_db)
        dice_label_2 = get_metric('valid_dice_per_label_2', round_num,
                                  aggregator.tensor_db)
        dice_label_4 = get_metric('valid_dice_per_label_4', round_num,
                                  aggregator.tensor_db)
        if include_validation_with_hausdorff:
            hausdorff95_label_0 = get_metric('valid_hd95_per_label_0',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_1 = get_metric('valid_hd95_per_label_1',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_2 = get_metric('valid_hd95_per_label_2',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_4 = get_metric('valid_hd95_per_label_4',
                                             round_num, aggregator.tensor_db)

        # update best score
        if best_dice < round_dice:
            best_dice = round_dice
            # Set the weights for the final model
            if round_num == 0:
                # here the initial model was validated (temp model does not exist)
                logger.info(
                    f'Skipping best model saving to disk as it is a random initialization.'
                )
            elif not os.path.exists(
                    f'checkpoint/{checkpoint_folder}/temp_model.pkl'):
                raise ValueError(
                    f'Expected temporary model at: checkpoint/{checkpoint_folder}/temp_model.pkl to exist but it was not found.'
                )
            else:
                # here the temp model was the one validated
                shutil.copyfile(
                    src=f'checkpoint/{checkpoint_folder}/temp_model.pkl',
                    dst=f'checkpoint/{checkpoint_folder}/best_model.pkl')
                logger.info(
                    f'Saved model with best average binary DICE: {best_dice} to ~/.local/workspace/checkpoint/{checkpoint_folder}/best_model.pkl'
                )

        ## RUN VALIDATION ON INTERMEDIATE CONSENSUS MODEL
        # set the task_runner data loader
        # task_runner.data_loader = collaborator_data_loaders[col]
        ### DELETE THIS LINE ###
        # print(f'Collaborator {col} training data count = {task_runner.data_loader.get_train_data_size()}')

        # run the collaborator
        #collaborators[col].run_simulation()

        ## CONVERGENCE METRIC COMPUTATION
        # update the auc score
        best_dice_over_time_auc += best_dice * round_time

        # project the auc score as remaining time * best dice
        # this projection assumes that the current best score is carried forward for the entire week
        projected_auc = (MAX_SIMULATION_TIME - total_simulated_time
                         ) * best_dice + best_dice_over_time_auc
        projected_auc /= MAX_SIMULATION_TIME

        # End of round summary
        summary = '"**** END OF ROUND {} SUMMARY *****"'.format(round_num)
        summary += "\n\tSimulation Time: {} minutes".format(
            round(total_simulated_time / 60, 2))
        summary += "\n\t(Projected) Convergence Score: {}".format(
            projected_auc)
        summary += "\n\tDICE Label 0: {}".format(dice_label_0)
        summary += "\n\tDICE Label 1: {}".format(dice_label_1)
        summary += "\n\tDICE Label 2: {}".format(dice_label_2)
        summary += "\n\tDICE Label 4: {}".format(dice_label_4)
        if include_validation_with_hausdorff:
            summary += "\n\tHausdorff95 Label 0: {}".format(
                hausdorff95_label_0)
            summary += "\n\tHausdorff95 Label 1: {}".format(
                hausdorff95_label_1)
            summary += "\n\tHausdorff95 Label 2: {}".format(
                hausdorff95_label_2)
            summary += "\n\tHausdorff95 Label 4: {}".format(
                hausdorff95_label_4)

        experiment_results['round'].append(round_num)
        experiment_results['time'].append(total_simulated_time)
        experiment_results['convergence_score'].append(projected_auc)
        experiment_results['round_dice'].append(round_dice)
        experiment_results['dice_label_0'].append(dice_label_0)
        experiment_results['dice_label_1'].append(dice_label_1)
        experiment_results['dice_label_2'].append(dice_label_2)
        experiment_results['dice_label_4'].append(dice_label_4)
        if include_validation_with_hausdorff:
            experiment_results['hausdorff95_label_0'].append(
                hausdorff95_label_0)
            experiment_results['hausdorff95_label_1'].append(
                hausdorff95_label_1)
            experiment_results['hausdorff95_label_2'].append(
                hausdorff95_label_2)
            experiment_results['hausdorff95_label_4'].append(
                hausdorff95_label_4)
        logger.info(summary)

        if save_checkpoints:
            logger.info(f'Saving checkpoint for round {round_num}')
            logger.info(
                f'To resume from this checkpoint, set the restore_from_checkpoint_folder parameter to \'{checkpoint_folder}\''
            )
            save_checkpoint(checkpoint_folder, aggregator, collaborator_names,
                            collaborators, round_num, collaborator_time_stats,
                            total_simulated_time, best_dice,
                            best_dice_over_time_auc,
                            collaborators_chosen_each_round,
                            collaborator_times_per_round, experiment_results,
                            summary)

        # if the total_simulated_time has exceeded the maximum time, we break
        # in practice, this means that the previous round's model is the last model scored,
        # so a long final round should not actually benefit the competitor, since that final
        # model is never globally validated
        if total_simulated_time > MAX_SIMULATION_TIME:
            logger.info("Simulation time exceeded. Ending Experiment")
            break

        # save the most recent aggregated model in native format to be copied over as best when appropriate
        # (note this model has not been validated by the collaborators yet)
        task_runner.rebuild_model(round_num,
                                  aggregator.last_tensor_dict,
                                  validation=True)
        task_runner.save_native(
            f'checkpoint/{checkpoint_folder}/temp_model.pkl')

    return pd.DataFrame.from_dict(experiment_results), checkpoint_folder
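`run_challenge_experiment` only fixes the calling convention of the two competitor-supplied callbacks: both receive the collaborator names, a tensor_db iterator, the round number, and the selection/time history, and the hyper-parameter callback must return exactly one of `epochs_per_round` or `batches_per_round` as None. A minimal sketch of conforming callbacks (the parameter names are illustrative; only the call order and return shapes are taken from the code above):

def choose_training_collaborators(collaborator_names, db_iterator, round_num,
                                  collaborators_chosen_each_round,
                                  collaborator_times_per_round):
    # simplest policy: train on every collaborator each round
    return list(collaborator_names)


def training_hyper_parameters_for_round(collaborator_names, db_iterator, round_num,
                                        collaborators_chosen_each_round,
                                        collaborator_times_per_round):
    # exactly one of epochs_per_round / batches_per_round must be None
    learning_rate = 5e-5
    epochs_per_round = 1.0
    batches_per_round = None
    return learning_rate, epochs_per_round, batches_per_round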
Code Example #5
    def train(self,
              col_name,
              round_num,
              input_tensor_dict,
              metrics,
              num_batches=None,
              **kwargs):
        """
        Perform the training for a specified number of batches.

        Is expected to perform draws randomly, without replacement until data is exausted.
        Then data is replaced and shuffled and draws continue.

        Returns
        -------
        dict
            'TensorKey: nparray'
        """
        if metrics is None:
            raise KeyError('metrics must be defined')
        # if 'batch_size' in kwargs:
        #     batch_size = kwargs['batch_size']
        # else:
        #     batch_size = self.data_loader.batch_size

        # rebuild model with updated weights
        self.rebuild_model(round_num, input_tensor_dict)

        results = self.train_iteration(
            self.data_loader.get_train_loader(num_batches),
            metrics=metrics,
            **kwargs)

        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(metric_name, origin, round_num, True, ('metric', )):
            metric_value
            for (metric_name, metric_value) in results
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters.
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled from the
        # aggregator
        # TODO this logic can break if different collaborators have different
        # roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator because
        # these are only created after training occurs. A work around could
        # involve doing a single epoch of training on random data to get the
        # optimizer names, and then throwing away the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # return global_tensor_dict, local_tensor_dict
        return global_tensor_dict, local_tensor_dict
Code Example #6
    def train_batches(self,
                      col_name,
                      round_num,
                      input_tensor_dict,
                      num_batches=None,
                      use_tqdm=True,
                      **kwargs):
        """Train batches.

        Train the model on the requested number of batches.

        Args:
            col_name:            Name of the collaborator
            round_num:           The current round number
            input_tensor_dict:   Required input tensors (for model)
            num_batches:         The number of batches to train on before returning
            use_tqdm (bool):     Use tqdm to print a progress bar (Default=True)

        Returns:
            global_output_dict:  Tensors to send back to the aggregator
            local_output_dict:   Tensors to maintain in the local TensorDB
        """
        self.rebuild_model(round_num, input_tensor_dict)
        # set to "training" mode
        self.train()

        losses = []

        loader = self.data_loader.get_train_loader(num_batches=num_batches)
        if use_tqdm:
            loader = tqdm.tqdm(loader, desc="train epoch")

        # shuffling occurs every time this loader is used as an iterator
        for data, target in loader:
            data, target = (torch.tensor(data).to(self.device),
                            torch.tensor(target).to(self.device))
            self.optimizer.zero_grad()
            output = self(data)
            loss = self.loss_fn(output, target)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.detach().cpu().numpy())

        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(self.loss_fn.__class__.__name__, origin, round_num, True, ('metric', )):
            np.array(np.mean(losses))
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled
        # from the aggregator
        # TODO this logic can break if different collaborators have different
        #  roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator
        # because these are only created after training occurs. A work
        # around could involve doing a single epoch of training
        # on random data to get the optimizer names, and then throwing away
        # the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # this will signal that the optimizer values are now present, and can
        # be loaded when the model is rebuilt
        self.train_round_completed = True

        return global_tensor_dict, local_tensor_dict
Code Example #7
def run_experiment(collaborator_dict, override_config={}):
    """
    Core function that executes the FL Plan.

    Args:
        collaborator_dict : dict {collaborator_name(str): FederatedModel}
            This dictionary defines which collaborators will participate in the
            experiment, as well as a reference to that collaborator's
            federated model.
        override_config : dict {flplan.key : flplan.value}
            Override any of the plan parameters at runtime using this
            dictionary. To get a list of the available options, execute
            `fx.get_plan()`

    Returns:
        final_federated_model : FederatedModel
            The final model resulting from the federated learning experiment
    """
    from sys import path

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    # Update the plan if necessary
    if len(override_config) > 0:
        update_plan(override_config)

    # TODO: Fix this implementation. The full plan parsing is reused here,
    # but the model and data will be overwritten based on user specifications
    plan_config = 'plan/plan.yaml'
    cols_config = 'plan/cols.yaml'
    data_config = 'plan/data.yaml'

    plan = Plan.Parse(plan_config_path=Path(plan_config),
                      cols_config_path=Path(cols_config),
                      data_config_path=Path(data_config))

    # Overwrite plan values
    plan.authorized_cols = list(collaborator_dict)
    tensor_pipe = plan.get_tensor_pipe()

    # This must be set to the final index of the list (this is the last
    # tensorflow session to get created)
    plan.runner_ = list(collaborator_dict.values())[-1]
    model = plan.runner_

    # Initialize model weights
    init_state_path = plan.config['aggregator']['settings']['init_state_path']
    rounds_to_train = plan.config['aggregator']['settings']['rounds_to_train']
    tensor_dict, holdout_params = split_tensor_dict_for_holdouts(
        logger, plan.runner_.get_tensor_dict(False))

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    logger.info(f'Creating Initial Weights File    🠆 {init_state_path}')

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    logger.info('Starting Experiment...')

    aggregator = plan.get_aggregator()

    model_states = {
        collaborator: None
        for collaborator in collaborator_dict.keys()
    }

    # Create the collaborators
    collaborators = {
        collaborator: create_collaborator(plan, collaborator, model,
                                          aggregator)
        for collaborator in plan.authorized_cols
    }

    for round_num in range(rounds_to_train):
        for col in plan.authorized_cols:

            collaborator = collaborators[col]
            model.set_data_loader(collaborator_dict[col].data_loader)

            if round_num != 0:
                model.rebuild_model(round_num, model_states[col])

            collaborator.run_simulation()

            model_states[col] = model.get_tensor_dict(with_opt_vars=True)

    # Set the weights for the final model
    model.rebuild_model(rounds_to_train - 1,
                        aggregator.last_tensor_dict,
                        validation=True)
    return model
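A hypothetical invocation of `run_experiment`, assuming `model_a` and `model_b` are FederatedModel instances built elsewhere; the override key is the same one used in the challenge runner above:

# model_a and model_b are assumed, pre-built FederatedModel instances (one per collaborator)
collaborator_dict = {'one': model_a, 'two': model_b}

final_model = run_experiment(
    collaborator_dict,
    override_config={'aggregator.settings.rounds_to_train': 5},
)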
Code Example #8
    def train_batches(self, col_name, round_num, input_tensor_dict,
                      num_batches, use_tqdm=False, **kwargs):
        """
        Perform the training for a specified number of batches.

        Is expected to perform draws randomly, without replacement until data is exausted. Then
        data is replaced and shuffled and draws continue.

        Args:
            num_batches: Number of batches to train on
            use_tqdm (bool): True = use tqdm to print a progress
             bar (Default=False)
        Returns:
            float: loss metric
        """
        batch_size = self.data_loader.batch_size

        if kwargs.get('batch_size'):
            batch_size = kwargs['batch_size']

        # rebuild model with updated weights
        self.rebuild_model(round_num, input_tensor_dict)

        tf.keras.backend.set_learning_phase(True)

        losses = []
        batch_num = 0

        while batch_num < num_batches:
            # get iterator for batch draws (shuffling happens here)
            gen = self.data_loader.get_train_loader(batch_size)
            if use_tqdm:
                gen = tqdm.tqdm(gen, desc="training epoch")

            for (X, y) in gen:
                if batch_num >= num_batches:
                    break
                else:
                    losses.append(self.train_batch(X, y))
                    batch_num += 1

        # Output metric tensors (scalar)
        origin = col_name
        tags = ('trained',)
        output_metric_dict = {
            TensorKey(
                self.loss_name, origin, round_num, True, ('metric',)
            ): np.array(np.mean(losses))
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict,
            **self.tensor_dict_split_fn_kwargs
        )

        # Create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags):
                nparray for tensor_name, nparray in global_model_dict.items()
        }
        # Create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags):
                nparray for tensor_name, nparray in local_model_dict.items()
        }
        # The train/validate aggregated function of the next round will
        # look for the updated model parameters.
        # This ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(
                tensor_name, origin, round_num + 1, False, ('model',)
            ): nparray for tensor_name, nparray in local_model_dict.items()}

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # Update the required tensors if they need to be pulled from
        # the aggregator
        # TODO this logic can break if different collaborators have different
        #  roles between rounds.
        # For example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator because
        # these are only created after training occurs. A work around could
        # involve doing a single epoch of training on random data to get the
        # optimizer names, and then throwing away the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # return global_tensor_dict, local_tensor_dict
        return global_tensor_dict, local_tensor_dict
Code Example #9
    def fit(self):
        """Run the estimator."""
        import fastestimator as fe
        from fastestimator.trace.io.best_model_saver import BestModelSaver
        from sys import path

        file = Path(__file__).resolve()
        # interface root, containing command modules
        root = file.parent.resolve()
        work = Path.cwd().resolve()

        path.append(str(root))
        path.insert(0, str(work))

        # TODO: Fix this implementation. The full plan parsing is reused here,
        # but the model and data will be overwritten based on
        # user specifications
        plan_config = (Path(fx.WORKSPACE_PREFIX) / 'plan' / 'plan.yaml')
        cols_config = (Path(fx.WORKSPACE_PREFIX) / 'plan' / 'cols.yaml')
        data_config = (Path(fx.WORKSPACE_PREFIX) / 'plan' / 'data.yaml')

        plan = Plan.Parse(plan_config_path=plan_config,
                          cols_config_path=cols_config,
                          data_config_path=data_config)

        self.rounds = plan.config['aggregator']['settings']['rounds_to_train']
        data_loader = FastEstimatorDataLoader(self.estimator.pipeline)
        runner = FastEstimatorTaskRunner(self.estimator,
                                         data_loader=data_loader)
        # Overwrite plan values
        tensor_pipe = plan.get_tensor_pipe()
        # Initialize model weights
        init_state_path = plan.config['aggregator']['settings'][
            'init_state_path']
        tensor_dict, holdout_params = split_tensor_dict_for_holdouts(
            self.logger, runner.get_tensor_dict(False))

        model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                                 round_number=0,
                                                 tensor_pipe=tensor_pipe)

        self.logger.info(f'Creating Initial Weights File'
                         f'    🠆 {init_state_path}')

        utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

        self.logger.info('Starting Experiment...')

        aggregator = plan.get_aggregator()

        model_states = {
            collaborator: None
            for collaborator in plan.authorized_cols
        }
        runners = {}
        save_dir = {}
        data_path = 1
        for col in plan.authorized_cols:
            data = self.estimator.pipeline.data
            train_data, eval_data, test_data = split_data(
                data['train'], data['eval'], data['test'], data_path,
                len(plan.authorized_cols))
            pipeline_kwargs = {}
            for k, v in self.estimator.pipeline.__dict__.items():
                if k in [
                        'batch_size', 'ops', 'num_process', 'drop_last',
                        'pad_value', 'collate_fn'
                ]:
                    pipeline_kwargs[k] = v
            pipeline_kwargs.update({
                'train_data': train_data,
                'eval_data': eval_data,
                'test_data': test_data
            })
            pipeline = fe.Pipeline(**pipeline_kwargs)

            data_loader = FastEstimatorDataLoader(pipeline)
            self.estimator.system.pipeline = pipeline

            runners[col] = FastEstimatorTaskRunner(estimator=self.estimator,
                                                   data_loader=data_loader)
            runners[col].set_optimizer_treatment('CONTINUE_LOCAL')

            for trace in runners[col].estimator.system.traces:
                if isinstance(trace, BestModelSaver):
                    save_dir_path = f'{trace.save_dir}/{col}'
                    os.makedirs(save_dir_path, exist_ok=True)
                    save_dir[col] = save_dir_path

            data_path += 1

        # Create the collaborators
        collaborators = {
            collaborator:
            fx.create_collaborator(plan, collaborator, runners[collaborator],
                                   aggregator)
            for collaborator in plan.authorized_cols
        }

        model = None
        for round_num in range(self.rounds):
            for col in plan.authorized_cols:

                collaborator = collaborators[col]

                if round_num != 0:
                    # For FastEstimator Jupyter notebook, models must be
                    # saved in different directories (i.e. path must be
                    # reset here)

                    runners[col].estimator.system.load_state(
                        f'save/{col}_state')
                    runners[col].rebuild_model(round_num, model_states[col])

                # Reset the save directory if BestModelSaver is present
                # in traces
                for trace in runners[col].estimator.system.traces:
                    if isinstance(trace, BestModelSaver):
                        trace.save_dir = save_dir[col]

                collaborator.run_simulation()

                model_states[col] = runners[col].get_tensor_dict(
                    with_opt_vars=True)
                model = runners[col].model
                runners[col].estimator.system.save_state(f'save/{col}_state')

        # TODO This will return the model from the last collaborator,
        #  NOT the final aggregated model (though they should be similar).
        # There should be a method added to the aggregator that will load
        # the best model from disk and return it
        return model
Code Example #10
    def train(self, col_name, round_num, input_tensor_dict, epochs, **kwargs):
        """Perform training for a specified number of epochs."""
        if 'metrics' not in kwargs:
            raise KeyError('metrics must be included in kwargs')
        param_metrics = kwargs['metrics']

        self.rebuild_model(round_num, input_tensor_dict)

        # Estimators need to be given an experiment name to produce an output
        # summary
        summary = self.estimator.fit("experiment", warmup=False)
        self.epoch_idx = self.estimator.system.epoch_idx
        self.global_step = self.estimator.system.global_step
        self.estimator.system.total_epochs += self.total_epochs
        # Define what the output is to encapsulate in tensorkeys and return
        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(metric, origin, round_num, True, ('metric', )):
            np.array(list(summary.history['train'][metric].values())[-1])
            for metric in param_metrics
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters.
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled from the
        # aggregator
        # TODO this logic can break if different collaborators have different
        #  roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator
        # because these are only created after training occurs.
        # A work around could involve doing a single epoch of training
        # on random data to get the optimizer names, and then throwing away
        # the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # return global_tensor_dict, local_tensor_dict
        return global_tensor_dict, local_tensor_dict
Code Example #11
def initialize(context, plan_config, cols_config, data_config,
               aggregator_address, feature_shape):
    """
    Initialize Data Science plan.

    Create a protocol buffer file of the initial model weights for
     the federation.
    """
    plan = Plan.Parse(plan_config_path=Path(plan_config),
                      cols_config_path=Path(cols_config),
                      data_config_path=Path(data_config))

    init_state_path = plan.config['aggregator']['settings']['init_state_path']

    # TODO: Is this part really needed? Why would we need the collaborator
    #  name to know the input shape to the model?

    # if feature_shape is None:
    #     if cols_config is None:
    #         exit('You must specify either a feature shape or an authorized '
    #              'collaborator list in order for the script to determine '
    #              'the input layer shape')
    print(plan.cols_data_paths)

    collaborator_cname = list(plan.cols_data_paths)[0]

    # else:
    #     logger.info(f'Using data object of type {type(data)} '
    #                 f'and feature shape {feature_shape}')
    #     raise NotImplementedError()

    # data_loader = plan.get_data_loader(collaborator_cname)
    # task_runner = plan.get_task_runner(collaborator_cname)

    data_loader = plan.get_data_loader(collaborator_cname)
    task_runner = plan.get_task_runner(data_loader)
    tensor_pipe = plan.get_tensor_pipe()

    # Note: task_runner already initializes tensor_dict_split_fn_kwargs
    # (defaulting to an empty dict), so there is no need for a
    # `tensor_dict_split_fn_kwargs = task_runner.tensor_dict_split_fn_kwargs or {}`
    # line here.
    tensor_dict, holdout_params = split_tensor_dict_for_holdouts(
        logger, task_runner.get_tensor_dict(False),
        **task_runner.tensor_dict_split_fn_kwargs)

    logger.warn(f'Following parameters omitted from global initial model, '
                f'local initialization will determine'
                f' values: {list(holdout_params.keys())}')

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    logger.info(f'Creating Initial Weights File    🠆 {init_state_path}')

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    plan_origin = Plan.Parse(Path(plan_config), resolve=False).config

    if (plan_origin['network']['settings']['agg_addr'] == 'auto'
            or aggregator_address):
        plan_origin['network']['settings'] = plan_origin['network'].get(
            'settings', {})
        plan_origin['network']['settings']['agg_addr'] =\
            aggregator_address or getfqdn()

        logger.warn(f"Patching Aggregator Addr in Plan"
                    f" 🠆 {plan_origin['network']['settings']['agg_addr']}")

        Plan.Dump(Path(plan_config), plan_origin)

    plan.config = plan_origin

    # Record that plan with this hash has been initialized
    if 'plans' not in context.obj:
        context.obj['plans'] = []
    context.obj['plans'].append(f"{Path(plan_config).stem}_{plan.hash[:8]}")
    logger.info(f"{context.obj['plans']}")