Example #1
    def initialize_tensorkeys_for_functions(self, with_opt_vars=False):
        """
        Set the required tensors for all publicly accessible methods \
            that could be called as part of a task.

        By default, this is just all of the layers and optimizer of the model.
        Custom tensors should be added to this function.

        Parameters
        ----------
        with_opt_vars : bool
            If True, the optimizer variables are included among the required tensors.

        Returns
        -------
        None
        """
        # TODO there should be a way to programmatically iterate through all
        #  of the methods in the class and declare the tensors.
        # For now this is done manually

        output_model_dict = self.get_tensor_dict(with_opt_vars=with_opt_vars)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)
        if not with_opt_vars:
            validation_global_model_dict = global_model_dict
            validation_local_model_dict = local_model_dict
        else:
            output_model_dict = self.get_tensor_dict(with_opt_vars=False)
            validation_global_model_dict, validation_local_model_dict =\
                split_tensor_dict_for_holdouts(
                    self.logger,
                    output_model_dict,
                    **self.tensor_dict_split_fn_kwargs
                )

        self.required_tensorkeys_for_function['train'] = [
            TensorKey(tensor_name, 'GLOBAL', 0, False, ('model', ))
            for tensor_name in global_model_dict
        ]
        self.required_tensorkeys_for_function['train'] += [
            TensorKey(tensor_name, 'LOCAL', 0, False, ('model', ))
            for tensor_name in local_model_dict
        ]

        # Validation may be performed on local or aggregated (global) model,
        # so there is an extra lookup dimension for kwargs
        self.required_tensorkeys_for_function['validate'] = {}
        # TODO This is not stateless. The optimizer will not be
        self.required_tensorkeys_for_function['validate']['apply=local'] = \
            [TensorKey(tensor_name, 'LOCAL', 0, False, ('trained',))
             for tensor_name in {
                 **validation_global_model_dict,
                 **validation_local_model_dict}]
        self.required_tensorkeys_for_function['validate']['apply=global'] = \
            [TensorKey(tensor_name, 'GLOBAL', 0, False, ('model',))
             for tensor_name in validation_global_model_dict]
        self.required_tensorkeys_for_function['validate']['apply=global'] += \
            [TensorKey(tensor_name, 'LOCAL', 0, False, ('model',))
             for tensor_name in validation_local_model_dict]
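
Note: the snippets in these examples unpack TensorKey positionally into five fields. A minimal, self-contained sketch of that shape (a namedtuple stand-in for illustration, not copied from the library):

from collections import namedtuple

# Hypothetical stand-in with the five fields the examples unpack.
TensorKey = namedtuple(
    'TensorKey', ['tensor_name', 'origin', 'round_number', 'report', 'tags'])

tk = TensorKey('dense_1/kernel', 'GLOBAL', 0, False, ('model',))
tensor_name, origin, round_number, report, tags = tk  # positional unpacking
print(tensor_name, tags)  # dense_1/kernel ('model',)
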
Example #2
    def validate(self,
                 col_name,
                 round_num,
                 input_tensor_dict,
                 use_tqdm=False,
                 **kwargs):
        """Validate.
        Run validation of the model on the local data.
        Args:
            col_name:            Name of the collaborator
            round_num:           What round is it
            input_tensor_dict:   Required input tensors (for model)
            use_tqdm (bool):     Use tqdm to print a progress bar (Default=True)
            kwargs:              Key word arguments passed to GaNDLF main_run
        Returns:
            global_output_dict:   Tensors to send back to the aggregator
            local_output_dict:   Tensors to maintain in the local TensorDB
        """
        self.rebuild_model(round_num, input_tensor_dict, validation=True)
        self.model.eval()
        # self.model.to(self.device)

        epoch_valid_loss, epoch_valid_metric = validate_network(
            self.model,
            self.data_loader.val_dataloader,
            self.scheduler,
            self.params,
            round_num,
            mode="validation")

        self.logger.info(epoch_valid_loss)
        self.logger.info(epoch_valid_metric)

        origin = col_name
        suffix = 'validate'
        if kwargs['apply'] == 'local':
            suffix += '_local'
        else:
            suffix += '_agg'
        tags = ('metric', suffix)

        output_tensor_dict = {}
        output_tensor_dict[TensorKey('valid_loss', origin, round_num, True,
                                     tags)] = np.array(epoch_valid_loss)
        for k, v in epoch_valid_metric.items():
            if np.array(v).size == 1:
                output_tensor_dict[TensorKey(f'valid_{k}', origin, round_num,
                                             True, tags)] = np.array(v)
            else:
                for idx, label in enumerate([0, 1, 2, 4]):
                    output_tensor_dict[TensorKey(f'valid_{k}_{label}', origin,
                                                 round_num, True,
                                                 tags)] = np.array(v[idx])

        return output_tensor_dict, {}
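
The 'apply' keyword argument used above selects whether the locally trained or the aggregated model was validated; a tiny sketch of the resulting tag convention (illustration only, mirroring the branch above):

def metric_tags(apply):
    # 'local' -> validate_local, anything else -> validate_agg, as in the example
    suffix = 'validate_local' if apply == 'local' else 'validate_agg'
    return ('metric', suffix)

assert metric_tags('local') == ('metric', 'validate_local')
assert metric_tags('global') == ('metric', 'validate_agg')
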
Example #3
    def validate(self,
                 col_name,
                 round_num,
                 input_tensor_dict,
                 use_tqdm=False,
                 **kwargs):
        """Validate.

        Run validation of the model on the local data.

        Args:
            col_name:            Name of the collaborator
            round_num:           What round is it
            input_tensor_dict:   Required input tensors (for model)
            use_tqdm (bool):     Use tqdm to print a progress bar (Default=False)

        Returns:
            global_output_dict:  Tensors to send back to the aggregator
            local_output_dict:   Tensors to maintain in the local TensorDB

        """
        self.rebuild_model(round_num, input_tensor_dict, validation=True)
        self.eval()
        self.to(self.device)
        val_score = 0
        total_samples = 0

        loader = self.data_loader.get_valid_loader()
        if use_tqdm:
            loader = tqdm.tqdm(loader, desc="validate")

        with pt.no_grad():
            for data, target in loader:
                samples = target.shape[0]
                total_samples += samples
                data, target = pt.tensor(data).to(
                    self.device), pt.tensor(target).to(self.device,
                                                       dtype=pt.int64)
                output = self(data)
                # get the index of the max log-probability
                pred = output.argmax(dim=1, keepdim=True)
                target_categorical = target.argmax(dim=1, keepdim=True)
                val_score += pred.eq(target_categorical).sum().cpu().numpy()

        origin = col_name
        suffix = 'validate'
        if kwargs['apply'] == 'local':
            suffix += '_local'
        else:
            suffix += '_agg'
        tags = ('metric', suffix)
        # TODO figure out a better way to pass in metric for this pytorch
        #  validate function
        output_tensor_dict = {
            TensorKey('acc', origin, round_num, True, tags):
            np.array(val_score / total_samples)
        }

        # Empty list represents metrics that should only be stored locally
        return output_tensor_dict, {}
    def _load_initial_tensors(self):
        """
        Load all of the tensors required to begin federated learning.

        Required tensors are: \
            1. Initial model.

        Returns:
            None
        """
        tensor_dict, round_number = utils.deconstruct_model_proto(
            self.model, compression_pipeline=self.compression_pipeline)

        if round_number > self.round_number:
            self.logger.info(
                'Starting training from round {} of previously saved'
                ' model'.format(round_number))
            self.round_number = round_number
        tensor_key_dict = {
            TensorKey(k, self.uuid, self.round_number, False, ('model', )): v
            for k, v in tensor_dict.items()
        }
        # all initial model tensors are loaded here
        self.tensor_db.cache_tensor(tensor_key_dict)
        self.logger.debug('This is the initial tensor_db:'
                          ' {}'.format(self.tensor_db))
    def _save_model(self, round_number, file_path):
        """
        Save the best or latest model.

        Args:
            round_number: int
                Model round to be saved
            file_path: str
                Either the best model or latest model file path

        Returns:
            None
        """
        # Extract the model from TensorDB and set it to the new model
        og_tensor_dict, _ = utils.deconstruct_model_proto(
            self.model, compression_pipeline=self.compression_pipeline)
        tensor_keys = [
            TensorKey(k, self.uuid, round_number, False, ('model', ))
            for k, v in og_tensor_dict.items()
        ]
        tensor_dict = {}
        for tk in tensor_keys:
            tk_name, _, _, _, _ = tk
            tensor_dict[tk_name] = self.tensor_db.get_tensor_from_cache(tk)
            if tensor_dict[tk_name] is None:
                self.logger.info('Cannot save model for round {}.'
                                 ' Continuing...'.format(round_number))
                return
        if file_path == self.best_state_path:
            self.best_tensor_dict = tensor_dict
        if file_path == self.last_state_path:
            self.last_tensor_dict = tensor_dict
        self.model = utils.construct_model_proto(tensor_dict, round_number,
                                                 self.compression_pipeline)
        utils.dump_proto(self.model, file_path)
Example #6
    def apply_delta(tensor_key,
                    delta,
                    base_model_nparray,
                    creates_model=False):
        """
        Add delta to the nparray.

        Args:
            tensor_key:             This is the tensor_key associated with the
                                    delta. Should have a tag of 'trained' or
                                    'aggregated'
            delta:                  Weight delta between the new model and
                                    old model
            base_model_nparray:     The nparray that corresponds to the prior
                                    weights
            creates_model:          If flag is set, the tensorkey returned
                                    will correspond to the aggregator model

        Returns:
            new_model_tensor_key:   Latest model layer tensorkey
            new_model_nparray:      Latest layer weights

        """
        tensor_name, origin, round_number, report, tags = tensor_key
        if not np.isscalar(base_model_nparray):
            assert (delta.shape == base_model_nparray.shape), (
                'Shape of delta ({}) is not equal to shape of model'
                ' layer ({})'.format(delta.shape, base_model_nparray.shape))
        # assert('model' in tensor_key[3]), 'The tensorkey should be provided
        # from the base model'
        # Aggregator UUID has the prefix 'aggregator'
        if 'aggregator' in origin and not creates_model:
            tags = list(tags)
            tags.remove('delta')
            new_tags = tuple(tags)
            new_model_tensor_key = TensorKey(tensor_name, origin, round_number,
                                             report, new_tags)
        else:
            new_model_tensor_key = TensorKey(tensor_name, origin, round_number,
                                             report, ('model', ))

        return new_model_tensor_key, base_model_nparray + delta
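
A short, hedged illustration of the arithmetic apply_delta performs, using a plain namedtuple stand-in for TensorKey and dummy arrays (names and values are hypothetical):

from collections import namedtuple
import numpy as np

TensorKey = namedtuple(
    'TensorKey', ['tensor_name', 'origin', 'round_number', 'report', 'tags'])

base = np.ones((2, 2))          # prior model layer
delta = 0.5 * np.ones((2, 2))   # weight delta received for this layer
tk = TensorKey('conv1/kernel', 'aggregator_uuid', 3, False,
               ('aggregated', 'delta'))

# Mirrors the aggregator branch above: drop 'delta' from the tags and add.
new_tags = tuple(t for t in tk.tags if t != 'delta')
new_tk = TensorKey(tk.tensor_name, tk.origin, tk.round_number, tk.report, new_tags)
new_layer = base + delta
print(new_tk)
assert np.allclose(new_layer, 1.5)
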
Example #7
    def nparray_to_named_tensor(self, tensor_key, nparray):
        """
        Construct the NamedTensor Protobuf.

        Includes logic to create delta, compress tensors with the TensorCodec, etc.
        """
        # if we have an aggregated tensor, we can make a delta
        tensor_name, origin, round_number, report, tags = tensor_key
        if 'trained' in tags and self.delta_updates:
            # Should get the pretrained model to create the delta. If training
            # has happened, the model should already be stored in the TensorDB.
            model_nparray = self.tensor_db.get_tensor_from_cache(
                TensorKey(
                    tensor_name,
                    origin,
                    round_number,
                    report,
                    ('model',)
                )
            )

            # The original model will not be present for the optimizer on the
            # first round.
            if model_nparray is not None:
                delta_tensor_key, delta_nparray = \
                    self.tensor_codec.generate_delta(
                        tensor_key,
                        nparray,
                        model_nparray
                    )
                delta_comp_tensor_key, delta_comp_nparray, metadata = \
                    self.tensor_codec.compress(delta_tensor_key, delta_nparray)
                named_tensor = utils.construct_named_tensor(
                    delta_comp_tensor_key,
                    delta_comp_nparray,
                    metadata,
                    lossless=False
                )
                return named_tensor

        # Assume every other tensor requires lossless compression
        compressed_tensor_key, compressed_nparray, metadata = \
            self.tensor_codec.compress(
                tensor_key, nparray, require_lossless=True
            )
        named_tensor = utils.construct_named_tensor(
            compressed_tensor_key,
            compressed_nparray,
            metadata,
            lossless=True
        )

        return named_tensor
    def validate(self,
                 col_name,
                 round_num,
                 input_tensor_dict,
                 use_tqdm=True,
                 **kwargs):
        """Run validation of the model on the local data.

        Args:
            col_name:            Name of the collaborator
            round_num:           What round is it
            input_tensor_dict:   Required input tensors (for model)
            use_tqdm:     Use tqdm to print a progress bar (Default=True)

        Returns:
            global_output_dict:  Tensors to send back to the aggregator
            local_output_dict:   Tensors to maintain in the local TensorDB
        """
        self.rebuild_model(round_num, input_tensor_dict, validation=True)
        self.eval()
        self.to(self.device)
        val_score = 0
        total_samples = 0

        loader = self.data_loader.get_valid_loader()
        if use_tqdm:
            loader = tqdm.tqdm(loader, desc="validate")

        with torch.no_grad():
            for data, target in loader:
                samples = target.shape[0]
                total_samples += samples
                data, target = torch.tensor(data).to(
                    self.device), torch.tensor(target).to(self.device)
                output = self(data)
                # get the index of the max log-probability
                val = soft_dice_coef(output, target)
                val_score += val.sum().cpu().numpy()

        origin = col_name
        suffix = 'validate'
        if kwargs['apply'] == 'local':
            suffix += '_local'
        else:
            suffix += '_agg'
        tags = ('metric', suffix)
        # TODO figure out a better way to pass in metric for this pytorch
        #  validate function
        output_tensor_dict = {
            TensorKey('dice_coef', origin, round_num, True, tags):
            np.array(val_score / total_samples)
        }

        return output_tensor_dict, {}
Example #9
    def validate(self, col_name, round_num, input_tensor_dict, **kwargs):
        """
        Run the trained model on validation data; report results.

        Parameters
        ----------
        input_tensor_dict : either the last aggregated or locally trained model

        Returns
        -------
        output_tensor_dict : {TensorKey: nparray} (these correspond to acc,
         precision, f1_score, etc.)
        """
        batch_size = 1
        if 'batch_size' in kwargs:
            batch_size = kwargs['batch_size']
        self.rebuild_model(round_num, input_tensor_dict, validation=True)
        param_metrics = kwargs['metrics']

        vals = self.model.evaluate(self.data_loader.X_valid,
                                   self.data_loader.y_valid,
                                   batch_size=batch_size,
                                   verbose=0)
        model_metrics_names = self.model.metrics_names
        if type(vals) is not list:
            vals = [vals]
        ret_dict = dict(zip(model_metrics_names, vals))

        # TODO if there are new metrics in the flplan that were not included in
        #  the originally compiled model, that behavior is not currently
        #  handled.
        for param in param_metrics:
            if param not in model_metrics_names:
                error = 'KerasTaskRunner does not support specifying new' \
                        ' metrics. ' \
                        'Param_metrics = {}, model_metrics_names' \
                        ' = {}'.format(param_metrics, model_metrics_names)
                raise ValueError(error)

        origin = col_name
        suffix = 'validate'
        if kwargs['apply'] == 'local':
            suffix += '_local'
        else:
            suffix += '_agg'
        tags = ('metric', suffix)
        output_tensor_dict = {
            TensorKey(metric, origin, round_num, True, tags):
            np.array(ret_dict[metric])
            for metric in param_metrics
        }

        return output_tensor_dict, {}
    def find_dependencies(self, tensor_key, send_model_deltas):
        """Resolve the tensors required to do the specified operation."""
        tensor_key_dependencies = []

        tensor_name, origin, round_number, report, tags = tensor_key

        if 'model' in tags and send_model_deltas:
            if round_number >= 1:
                # The new model can be generated by previous model + delta
                tensor_key_dependencies.append(
                    TensorKey(tensor_name, origin, round_number - 1, report,
                              tags))
                if self.compression_pipeline.is_lossy():
                    new_tags = ('aggregated', 'delta', 'lossy_compressed')
                else:
                    new_tags = ('aggregated', 'delta', 'compressed')
                tensor_key_dependencies.append(
                    TensorKey(tensor_name, origin, round_number, report,
                              new_tags))

        return tensor_key_dependencies
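
For a 'model' tensor requested at round >= 1 with delta updates enabled, the dependency list above should contain the prior round's model layer plus this round's compressed delta. A sketch of that expected output, using placeholder names and a namedtuple stand-in:

from collections import namedtuple

TensorKey = namedtuple(
    'TensorKey', ['tensor_name', 'origin', 'round_number', 'report', 'tags'])

expected_dependencies = [
    # prior version of the model layer (round_number - 1)
    TensorKey('conv1/kernel', 'agg_uuid', 1, False, ('model',)),
    # this round's delta; the tag is 'lossy_compressed' if the pipeline is lossy
    TensorKey('conv1/kernel', 'agg_uuid', 2, False,
              ('aggregated', 'delta', 'compressed')),
]
print(expected_dependencies)
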
Example #11
    def update_tensorkeys_for_functions(self):
        """
        Update the required tensors for all publicly accessible methods \
            that could be called as part of a task.

        By default, this is just all of the layers and optimizer of the model.
        Custom tensors should be added to this function

        Parameters
        ----------
        None

        Returns
        -------
        None
        """
        # TODO complete this function. It is only needed for opt_treatment,
        #  and making the model stateless

        # Minimal required tensors for train function
        model_layer_names = self._get_weights_names(self.model)
        opt_names = self._get_weights_names(self.model.optimizer)
        tensor_names = model_layer_names + opt_names
        self.logger.debug(
            'Updating model tensor names: {}'.format(tensor_names))
        self.required_tensorkeys_for_function['train'] = [
            TensorKey(tensor_name, 'GLOBAL', 0, False, ('model', ))
            for tensor_name in tensor_names
        ]

        # Validation may be performed on local or aggregated (global) model,
        # so there is an extra lookup dimension for kwargs
        self.required_tensorkeys_for_function['validate'] = {}
        self.required_tensorkeys_for_function['validate']['local_model=True'] = \
            [TensorKey(tensor_name, 'LOCAL', 0, False, ('trained',))
             for tensor_name in tensor_names]
        self.required_tensorkeys_for_function['validate']['local_model=False'] = \
            [TensorKey(tensor_name, 'GLOBAL', 0, False, ('model',))
             for tensor_name in tensor_names]
Example #12
def construct_model_proto(tensor_dict, round_number, tensor_pipe):
    # compress the arrays in the tensor_dict, and form the model proto
    # TODO: Hold-out tensors from the tensor compression pipeline.
    named_tensors = []
    for key, nparray in tensor_dict.items():
        bytes, transformer_metadata = tensor_pipe.forward(data=nparray)
        tensor_key = TensorKey(key, 'agg', round_number, False, ('model', ))
        named_tensors.append(
            construct_named_tensor(tensor_key,
                                   bytes,
                                   transformer_metadata,
                                   lossless=True))

    return ModelProto(tensors=named_tensors)
Example #13
    def do_task(self, task, round_number):
        """Do the specified task."""
        # map this task to an actual function name and kwargs
        func_name = self.task_config[task]['function']
        kwargs = self.task_config[task]['kwargs']

        # this would return a list of what tensors we require as TensorKeys
        required_tensorkeys_relative = \
            self.task_runner.get_required_tensorkeys_for_function(
                func_name, **kwargs
            )

        # models actually return "relative" tensorkeys of (name, LOCAL|GLOBAL,
        # round_offset)
        # so we need to update these keys to their "absolute values"
        required_tensorkeys = []
        for tname, origin, rnd_num, report, tags in required_tensorkeys_relative:
            if origin == 'GLOBAL':
                origin = self.aggregator_uuid
            else:
                origin = self.collaborator_name

            # rnd_num is the relative round. So if rnd_num is -1, get the
            # tensor from the previous round
            required_tensorkeys.append(
                TensorKey(tname, origin, rnd_num + round_number, report, tags)
            )

        # print('Required tensorkeys = {}'.format(
        # [tk[0] for tk in required_tensorkeys]))
        input_tensor_dict = self.get_numpy_dict_for_tensorkeys(
            required_tensorkeys
        )

        # now we have whatever the model needs to do the task
        func = getattr(self.task_runner, func_name)
        global_output_tensor_dict, local_output_tensor_dict = func(
            col_name=self.collaborator_name,
            round_num=round_number,
            input_tensor_dict=input_tensor_dict,
            **kwargs)

        # Save global and local output_tensor_dicts to TensorDB
        self.tensor_db.cache_tensor(global_output_tensor_dict)
        self.tensor_db.cache_tensor(local_output_tensor_dict)

        # send the results for this task; delta and compression will occur in
        # this function
        self.send_task_results(global_output_tensor_dict, round_number, task)
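
The relative-to-absolute TensorKey translation in do_task() can be sketched on its own; aggregator_uuid, collaborator_name, and the layer name below are placeholders:

from collections import namedtuple

TensorKey = namedtuple(
    'TensorKey', ['tensor_name', 'origin', 'round_number', 'report', 'tags'])

def to_absolute(relative_tk, round_number, aggregator_uuid, collaborator_name):
    # 'GLOBAL' keys resolve to the aggregator, everything else to this collaborator;
    # the relative round offset is added to the current round number.
    tname, origin, rnd_offset, report, tags = relative_tk
    origin = aggregator_uuid if origin == 'GLOBAL' else collaborator_name
    return TensorKey(tname, origin, rnd_offset + round_number, report, tags)

rel = TensorKey('conv1/kernel', 'GLOBAL', 0, False, ('model',))
print(to_absolute(rel, 5, 'agg-1234', 'col-1'))
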
Example #14
    def named_tensor_to_nparray(self, named_tensor):
        """Convert named tensor to a numpy array."""
        # Handle decompression and frombuffer conversion here
        # This should probably be moved back to protoutils
        raw_bytes = named_tensor.data_bytes
        metadata = [{'int_to_float': proto.int_to_float,
                     'int_list': proto.int_list,
                     'bool_list': proto.bool_list
                     } for proto in named_tensor.transformer_metadata]
        # The tensor has already been transferred to the collaborator, so
        # the newly constructed tensor should have the collaborator origin
        tensor_key = TensorKey(
            named_tensor.name,
            self.collaborator_name,
            named_tensor.round_number,
            named_tensor.report,
            tuple(named_tensor.tags)
        )
        tensor_name, origin, round_number, report, tags = tensor_key
        if 'compressed' in tags:
            decompressed_tensor_key, decompressed_nparray = \
                self.tensor_codec.decompress(
                    tensor_key,
                    data=raw_bytes,
                    transformer_metadata=metadata,
                    require_lossless=True
                )
        elif 'lossy_compressed' in tags:
            decompressed_tensor_key, decompressed_nparray = \
                self.tensor_codec.decompress(
                    tensor_key,
                    data=raw_bytes,
                    transformer_metadata=metadata
                )
        else:
            # There could be a case where the compression pipeline is bypassed
            # entirely
            self.logger.warning('Bypassing tensor codec...')
            decompressed_tensor_key = tensor_key
            decompressed_nparray = raw_bytes

        self.tensor_db.cache_tensor(
            {decompressed_tensor_key: decompressed_nparray}
        )

        return decompressed_nparray
Example #15
    def _load_initial_tensors_from_dict(self, tensor_dict):
        """
        Load all of the tensors required to begin federated learning.

        Required tensors are: \
            1. Initial model.

        Returns:
            None
        """
        tensor_key_dict = {
            TensorKey(k, self.uuid, self.round_number, False, ('model', )): v
            for k, v in tensor_dict.items()
        }
        # all initial model tensors are loaded here
        self.tensor_db.cache_tensor(tensor_key_dict)
        self.logger.debug('This is the initial tensor_db:'
                          ' {}'.format(self.tensor_db))
Example #16
    def _nparray_to_named_tensor(self, tensor_key, nparray, send_model_deltas,
                                 compress_lossless):
        """
        Construct the NamedTensor Protobuf.

        Also includes logic to create delta, compress tensors with the TensorCodec, etc.
        """
        tensor_name, origin, round_number, report, tags = tensor_key
        # if we have an aggregated tensor, we can make a delta
        if 'aggregated' in tags and send_model_deltas:
            # Should get the pretrained model to create the delta. If training
            # has happened, the model should already be stored in the TensorDB.
            model_tk = TensorKey(tensor_name, origin, round_number - 1, report,
                                 ('model', ))

            model_nparray = self.tensor_db.get_tensor_from_cache(model_tk)

            assert (model_nparray is not None), (
                "The original model layer should be present if the latest "
                "aggregated model is present")
            delta_tensor_key, delta_nparray = self.tensor_codec.generate_delta(
                tensor_key, nparray, model_nparray)
            delta_comp_tensor_key, delta_comp_nparray, metadata = \
                self.tensor_codec.compress(delta_tensor_key, delta_nparray,
                                           lossless=compress_lossless)
            named_tensor = utils.construct_named_tensor(
                delta_comp_tensor_key,
                delta_comp_nparray,
                metadata,
                lossless=compress_lossless)

        else:
            # Assume every other tensor requires lossless compression
            compressed_tensor_key, compressed_nparray, metadata = \
                self.tensor_codec.compress(tensor_key, nparray,
                                           require_lossless=True)
            named_tensor = utils.construct_named_tensor(
                compressed_tensor_key,
                compressed_nparray,
                metadata,
                lossless=compress_lossless)

        return named_tensor
    def compress(self, tensor_key, data, require_lossless=False, **kwargs):
        """
        Function-wrapper around the tensor_pipeline.forward function.

        It also keeps track of the tensorkeys associated with the compressed nparray

        Args:
            tensor_key:             TensorKey is provided to verify it should
                                    be compressed, and new TensorKeys returned
                                    will be derivatives of the existing
                                    tensor_name

            data:                   (uncompressed) numpy array associated with
                                    the tensor_key

            require_lossless:       boolean. Does the tensor require lossless
                                    compression

        Returns:
            compressed_tensor_key:  Tensorkey corresponding to the compressed
                                    tensor

            compressed_nparray:     The compressed tensor

            metadata:               metadata associated with compressed tensor

        """
        if require_lossless:
            compressed_nparray, metadata = self.lossless_pipeline.forward(
                data, **kwargs)
        else:
            compressed_nparray, metadata = self.compression_pipeline.forward(
                data, **kwargs)
        # Define the compressed tensorkey that should be
        # returned ('trained.delta'->'trained.delta.lossy_compressed')
        tensor_name, origin, round_number, report, tags = tensor_key
        if not self.compression_pipeline.is_lossy() or require_lossless:
            new_tags = tuple(list(tags) + ['compressed'])
        else:
            new_tags = tuple(list(tags) + ['lossy_compressed'])
        compressed_tensor_key = TensorKey(tensor_name, origin, round_number,
                                          report, new_tags)
        return compressed_tensor_key, compressed_nparray, metadata
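
Only the tag bookkeeping of compress() is reproduced in this sketch (the actual pipelines are omitted); the lossy/lossless decision follows the branch above:

def derive_compressed_tags(tags, pipeline_is_lossy, require_lossless):
    # lossless pipeline or an explicit lossless request -> 'compressed',
    # otherwise -> 'lossy_compressed'
    if not pipeline_is_lossy or require_lossless:
        return tuple(list(tags) + ['compressed'])
    return tuple(list(tags) + ['lossy_compressed'])

assert derive_compressed_tags(('trained', 'delta'), True, False) == \
    ('trained', 'delta', 'lossy_compressed')
assert derive_compressed_tags(('model',), True, True) == ('model', 'compressed')
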
    def validate(self, col_name, round_num,
                 input_tensor_dict, use_tqdm=False, **kwargs):
        """
        Run validation.

        Returns:
            dict: {<metric>: <value>}
        """
        batch_size = self.data_loader.batch_size

        if 'batch_size' in kwargs:
            batch_size = kwargs['batch_size']

        self.rebuild_model(round_num, input_tensor_dict, validation=True)

        tf.keras.backend.set_learning_phase(False)

        score = 0

        gen = self.data_loader.get_valid_loader(batch_size)
        if use_tqdm:
            gen = tqdm.tqdm(gen, desc="validating")

        for X, y in gen:
            weight = X.shape[0] / self.data_loader.get_valid_data_size()
            _, s = self.validate_batch(X, y)
            score += s * weight

        origin = col_name
        suffix = 'validate'
        if kwargs['apply'] == 'local':
            suffix += '_local'
        else:
            suffix += '_agg'
        tags = ('metric', suffix)
        output_tensor_dict = {
            TensorKey(
                self.validation_metric_name, origin, round_num, True, tags
            ): np.array(score)}

        # return empty dict for local metrics
        return output_tensor_dict, {}
    def generate_delta(tensor_key, nparray, base_model_nparray):
        """
        Create delta from the updated layer and base layer.

        Args:
            tensor_key:         This is the tensor_key associated with the
                                nparray.
                                Should have a tag of 'trained' or 'aggregated'

            nparray:            The nparray that corresponds to the tensorkey

            base_model_nparray: The base model tensor that will be subtracted
                                from the new weights

        Returns:
            delta_tensor_key:   Tensorkey that corresponds to the delta weight
                                array

            delta:              Difference between the provided tensors

        """
        tensor_name, origin, round_number, report, tags = tensor_key
        if not np.isscalar(nparray):
            assert nparray.shape == base_model_nparray.shape, (
                'Shape of updated layer ({}) is not equal to base '
                'layer shape of ({})'.format(nparray.shape,
                                             base_model_nparray.shape))
        assert 'model' not in tags, (
            'The tensorkey should be provided '
            'from the layer with new weights, not the base model')
        if type(tags) == str:
            new_tags = tuple([tags] + ['delta'])
        else:
            new_tags = tuple(list(tags) + ['delta'])
        delta_tensor_key = TensorKey(tensor_name, origin, round_number, report,
                                     new_tags)
        return delta_tensor_key, nparray - base_model_nparray
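
A minimal numerical illustration of generate_delta (dummy arrays, hypothetical layer name, namedtuple stand-in for TensorKey):

from collections import namedtuple
import numpy as np

TensorKey = namedtuple(
    'TensorKey', ['tensor_name', 'origin', 'round_number', 'report', 'tags'])

base = np.zeros(3)
updated = np.array([0.1, -0.2, 0.3])
tk = TensorKey('fc1/bias', 'col-1', 2, False, ('trained',))

delta = updated - base                       # what generate_delta returns
delta_tk = TensorKey(tk.tensor_name, tk.origin, tk.round_number, tk.report,
                     tuple(list(tk.tags) + ['delta']))
print(delta_tk)
assert np.allclose(base + delta, updated)    # apply_delta reverses it
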
Example #20
    def validate(self, col_name, round_num, input_tensor_dict, **kwargs):
        """
        Run the trained model on validation data; report results.

        Parameters
        ----------
        input_tensor_dict : either the last aggregated or locally trained model

        Returns
        -------
        output_tensor_dict : {TensorKey: nparray} (these correspond to acc,
         precision, f1_score, etc.)
        """
        self.rebuild_model(round_num, input_tensor_dict, validation=True)
        param_metrics = kwargs['metrics']

        results = self.estimator.test('experiment')
        ret_dict = {
            metric: list(results.history['test'][metric].values())[-1]
            for metric in param_metrics
        }

        origin = col_name
        suffix = 'validate'
        if kwargs['apply'] == 'local':
            suffix += '_local'
        else:
            suffix += '_agg'
        tags = ('metric', suffix)
        output_tensor_dict = {
            TensorKey(metric, origin, round_num, True, tags):
            np.array(ret_dict[metric])
            for metric in param_metrics
        }

        return output_tensor_dict, {}
Example #21
    def get_data_for_tensorkey(self, tensor_key):
        """
        Resolve the tensor corresponding to the requested tensorkey.

        Args
        ----
        tensor_key:         Tensorkey that will be resolved locally or
                            remotely. May be the product of other tensors
        """
        # try to get from the store
        tensor_name, origin, round_number, report, tags = tensor_key
        self.logger.debug(
            'Attempting to retrieve tensor {} from local store'.format(
                tensor_key)
        )
        nparray = self.tensor_db.get_tensor_from_cache(tensor_key)

        # if None and origin is our client, request it from the client
        if nparray is None:
            if origin == self.collaborator_name:
                self.logger.info(
                    'Attempting to find locally stored {} tensor from prior'
                    ' round...'.format(tensor_name))
                prior_round = round_number - 1
                while prior_round >= 0:
                    nparray = self.tensor_db.get_tensor_from_cache(
                        TensorKey(tensor_name, origin, prior_round, report, tags))
                    if nparray is not None:
                        self.logger.debug(
                            'Found tensor {} in local TensorDB for round'
                            ' {}'.format(tensor_name, prior_round))
                        return nparray
                    prior_round -= 1
                self.logger.info('Cannot find any prior version of tensor {}'
                                 ' locally...'.format(tensor_name))
            self.logger.debug('Unable to get tensor from local store...'
                              'attempting to retrieve from client')
            # Determine whether there are additional compression related
            # dependencies.
            # Typically, dependencies are only relevant to model layers
            tensor_dependencies = self.tensor_codec.find_dependencies(
                tensor_key, self.delta_updates
            )
            # self.logger.info('tensor_dependencies = {}'.format(
            # tensor_dependencies))
            if len(tensor_dependencies) > 0:
                # Resolve dependencies
                # tensor_dependencies[0] corresponds to the prior version
                # of the model.
                # If it exists locally, should pull the remote delta because
                # this is the least costly path
                prior_model_layer = self.tensor_db.get_tensor_from_cache(
                    tensor_dependencies[0]
                )
                if prior_model_layer is not None:
                    uncompressed_delta = \
                        self.get_aggregated_tensor_from_aggregator(
                            tensor_dependencies[1]
                        )
                    new_model_tk, nparray = self.tensor_codec.apply_delta(
                        tensor_dependencies[1],
                        uncompressed_delta,
                        prior_model_layer
                    )
                    self.logger.debug('Applied delta to tensor {}'.format(
                        tensor_dependencies[0][0])
                    )
                else:
                    # The original model tensor should be fetched from client
                    nparray = self.get_aggregated_tensor_from_aggregator(
                        tensor_key
                    )
            elif 'model' in tags:
                # Pulling the model for the first time or
                nparray = self.get_aggregated_tensor_from_aggregator(
                    tensor_key,
                    require_lossless=True
                )
        else:
            self.logger.debug('Found tensor {} in local TensorDB'.format(
                tensor_key))

        return nparray
    def _compute_validation_related_task_metrics(self, task_name):
        """
        Compute all validation related metrics.

        Args:
            task_name : str
                The task name to compute
        """
        self.logger.info('{} task metrics...'.format(task_name))
        # By default, print out all of the metrics that the validation
        # task sent
        # This handles getting the subset of collaborators that may be
        # part of the validation task
        collaborators_for_task = self.assigner.get_collaborators_for_task(
            task_name, self.round_number)
        # The collaborator data sizes for that task
        collaborator_weights_unnormalized = {
            c: self.collaborator_task_weight[TaskResultKey(
                task_name, c, self.round_number)]
            for c in collaborators_for_task
        }
        weight_total = sum(collaborator_weights_unnormalized.values())
        collaborator_weight_dict = {
            k: v / weight_total
            for k, v in collaborator_weights_unnormalized.items()
        }

        # The validation task should have just a couple tensors (i.e.
        # metrics) associated with it. Because each collaborator should
        # have sent the same tensor list, we can use the first
        # collaborator in our subset, and apply the correct
        # transformations to the tensorkey to resolve the aggregated
        # tensor for that round
        agg_functions = self.assigner.get_aggregation_type_for_task(task_name)
        task_key = TaskResultKey(task_name, collaborators_for_task[0],
                                 self.round_number)
        for tensor_key in self.collaborator_tasks_results[task_key]:
            tensor_name, origin, round_number, report, tags = tensor_key
            assert (tags[-1] == collaborators_for_task[0]), \
                'Tensor {} in task {} has not been processed' \
                ' correctly'.format(tensor_key, task_name)
            # Strip the collaborator label, and lookup aggregated tensor
            new_tags = tuple(list(tags[:-1]))
            agg_tensor_key = TensorKey(tensor_name, origin, round_number,
                                       report, new_tags)
            agg_tensor_name, agg_origin, agg_round_number, agg_report, agg_tags = agg_tensor_key
            agg_results, agg_metadata_dict = self.tensor_db.get_aggregated_tensor(
                agg_tensor_key, collaborator_weight_dict, agg_functions)
            if report:
                # Print the aggregated metric
                if agg_results is None:
                    self.logger.warning(
                        'Aggregated metric {} could not be collected for round {}. '
                        'Skipping reporting for this round'.format(
                            agg_tensor_name, self.round_number))
                    continue
                if agg_functions is not None:
                    self.logger.info('{0} {1}:\t{2:.4f}'.format(
                        agg_functions[0], agg_tensor_name, agg_results))
                else:
                    self.logger.info('{0}:\t{1:.4f}'.format(
                        agg_tensor_name, agg_results))
                for met in agg_metadata_dict:
                    self.logger.info('{0} {1}:\t{2:.4f}'.format(
                        met, agg_tensor_name, agg_metadata_dict[met]))
                # TODO Add all of the logic for saving the model based
                #  on best accuracy, lowest loss, etc.
                if 'validate_agg' in tags:
                    # Compare the accuracy of the model, and
                    # potentially save it
                    if self.best_model_score is None or self.best_model_score < agg_results:
                        self.logger.info(
                            'Saved the best model with score {:f}'.format(
                                agg_results))
                        self.best_model_score = agg_results
                        self._save_model(round_number, self.best_state_path)
            if 'trained' in tags:
                self._prepare_trained(tensor_name, origin, round_number,
                                      report, agg_results)
    def _prepare_trained(self, tensor_name, origin, round_number, report,
                         agg_results):
        """
        Prepare aggregated tensorkey tags.

        Args:
           tensor_name : str
           origin:
           round_number: int
           report: bool
           agg_results: np.array
        """
        # The aggregated tensorkey tags should have the form of
        # 'trained' or 'trained.lossy_decompressed'
        # They need to be relabeled to 'aggregated' and
        # reinserted. Then delta performed, compressed, etc.
        # then reinserted to TensorDB with 'model' tag

        # First insert the aggregated model layer with the
        # correct tensorkey
        agg_tag_tk = TensorKey(tensor_name, origin, round_number + 1, report,
                               ('aggregated', ))
        self.tensor_db.cache_tensor({agg_tag_tk: agg_results})

        # Create delta and save it in TensorDB
        base_model_tk = TensorKey(tensor_name, origin, round_number, report,
                                  ('model', ))
        base_model_nparray = self.tensor_db.get_tensor_from_cache(
            base_model_tk)
        if base_model_nparray is not None:
            delta_tk, delta_nparray = self.tensor_codec.generate_delta(
                agg_tag_tk, agg_results, base_model_nparray)
            self.tensor_db.cache_tensor({delta_tk: delta_nparray})
        else:
            # This condition is possible for base model
            # optimizer states (i.e. Adam/iter:0, SGD, etc.)
            # These values couldn't be present for the base
            # model because no training occurs on the aggregator
            delta_tk, delta_nparray = agg_tag_tk, agg_results

        # Compress lossless/lossy
        compressed_delta_tk, compressed_delta_nparray, metadata = self.tensor_codec.compress(
            delta_tk, delta_nparray)

        # TODO extend the TensorDB so that compressed data is supported.
        #  Once that is in place, the compressed delta can just be stored here
        #  instead of recreating it for every request

        # Decompress lossless/lossy
        decompressed_delta_tk, decompressed_delta_nparray = self.tensor_codec.decompress(
            compressed_delta_tk, compressed_delta_nparray, metadata)

        # Apply delta (unless delta couldn't be created)
        if base_model_nparray is not None:
            new_model_tk, new_model_nparray = self.tensor_codec.apply_delta(
                decompressed_delta_tk, decompressed_delta_nparray,
                base_model_nparray)
        else:
            new_model_tk, new_model_nparray = decompressed_delta_tk, decompressed_delta_nparray

        # Now that the model has been compressed/decompressed
        # with delta operations,
        # Relabel the tags to 'model'
        (new_model_tensor_name, new_model_origin, new_model_round_number,
         new_model_report, new_model_tags) = new_model_tk
        final_model_tk = TensorKey(new_model_tensor_name, new_model_origin,
                                   new_model_round_number, new_model_report,
                                   ('model', ))

        # Finally, cache the updated model tensor
        self.tensor_db.cache_tensor({final_model_tk: new_model_nparray})
    def _process_named_tensor(self, named_tensor, collaborator_name):
        """
        Extract the named tensor fields.

        Performs decompression, delta computation, and inserts results into TensorDB.

        Args:
            named_tensor:       NamedTensor (protobuf)
                protobuf that will be extracted from and processed
            collaborator_name:  str
                Collaborator name is needed for proper tagging of resulting
                tensorkeys

        Returns:
            tensor_key : TensorKey (named_tuple)
                The tensorkey extracted from the protobuf
            nparray : np.array
                The numpy array associated with the returned tensorkey
        """
        raw_bytes = named_tensor.data_bytes
        metadata = [{
            'int_to_float': proto.int_to_float,
            'int_list': proto.int_list,
            'bool_list': proto.bool_list
        } for proto in named_tensor.transformer_metadata]
        # The tensor has already been transferred to the aggregator,
        # so the newly constructed tensor should have the aggregator origin
        tensor_key = TensorKey(named_tensor.name, self.uuid,
                               named_tensor.round_number, named_tensor.report,
                               tuple(named_tensor.tags))
        tensor_name, origin, round_number, report, tags = tensor_key
        assert ('compressed' in tags or 'lossy_compressed' in tags), (
            'Named tensor {} is not compressed'.format(tensor_key))
        if 'compressed' in tags:
            dec_tk, decompressed_nparray = self.tensor_codec.decompress(
                tensor_key,
                data=raw_bytes,
                transformer_metadata=metadata,
                require_lossless=True)
            dec_name, dec_origin, dec_round_num, dec_report, dec_tags = dec_tk
            # Need to add the collaborator tag to the resulting tensor
            if type(dec_tags) == str:
                new_tags = tuple([dec_tags] + [collaborator_name])
            else:
                new_tags = tuple(list(dec_tags) + [collaborator_name])
            # layer.agg.n.trained.delta.col_i
            decompressed_tensor_key = TensorKey(dec_name, dec_origin,
                                                dec_round_num, dec_report,
                                                new_tags)
        if 'lossy_compressed' in tags:
            dec_tk, decompressed_nparray = self.tensor_codec.decompress(
                tensor_key, data=raw_bytes, transformer_metadata=metadata)
            dec_name, dec_origin, dec_round_num, dec_report, dec_tags = dec_tk
            if type(dec_tags) == str:
                new_tags = tuple([dec_tags] + [collaborator_name])
            else:
                new_tags = tuple(list(dec_tags) + [collaborator_name])
            # layer.agg.n.trained.delta.lossy_decompressed.col_i
            decompressed_tensor_key = TensorKey(dec_name, dec_origin,
                                                dec_round_num, dec_report,
                                                new_tags)

        if 'delta' in tags:
            base_model_tensor_key = TensorKey(tensor_name, origin,
                                              round_number, report,
                                              ('model', ))
            base_model_nparray = self.tensor_db.get_tensor_from_cache(
                base_model_tensor_key)
            if base_model_nparray is None:
                raise ValueError('Base model {} not present in'
                                 ' TensorDB'.format(base_model_tensor_key))
            final_tensor_key, final_nparray = self.tensor_codec.apply_delta(
                decompressed_tensor_key, decompressed_nparray,
                base_model_nparray)
        else:
            final_tensor_key = decompressed_tensor_key
            final_nparray = decompressed_nparray

        assert (final_nparray is not None), (
            'Could not create tensorkey {}'.format(final_tensor_key))
        self.tensor_db.cache_tensor({final_tensor_key: final_nparray})
        self.logger.debug('Created TensorKey: {}'.format(final_tensor_key))

        return final_tensor_key, final_nparray
    def get_aggregated_tensor(self, collaborator_name, tensor_name,
                              round_number, report, tags, require_lossless):
        """
        RPC called by collaborator.

        Performs local lookup to determine if there is an aggregated tensor available \
            that matches the request.

        Args:
            collaborator_name : str
                Requested tensor key collaborator name
            tensor_name: str
            require_lossless: bool
            round_number: int
            report: bool
            tags: list[str]
        Returns:
            named_tensor : protobuf NamedTensor
                the tensor requested by the collaborator
        """
        self.logger.debug(
            'Retrieving aggregated tensor {} for collaborator {}'.format(
                tensor_name, collaborator_name))

        compress_lossless = False
        if 'compressed' in tags or require_lossless:
            compress_lossless = True

        # TODO the TensorDB doesn't support compressed data yet.
        #  The returned tensor will be recompressed anyway.
        if 'compressed' in tags:
            tags.remove('compressed')

        tensor_key = TensorKey(tensor_name, self.uuid, round_number, report,
                               tuple(tags))
        tensor_name, origin, round_number, report, tags = tensor_key

        # send_model_deltas = False

        if 'aggregated' in tags and 'delta' in tags and round_number != 0:
            # send_model_deltas = True
            agg_tensor_key = TensorKey(tensor_name, origin, round_number,
                                       report, ('aggregated', ))
        else:
            agg_tensor_key = tensor_key

        nparray = self.tensor_db.get_tensor_from_cache(tensor_key)

        if nparray is None:
            raise ValueError("Aggregator does not have an aggregated tensor"
                             " for {}".format(tensor_key))

        # quite a bit happens in here, including compression, delta handling,
        # etc...
        # we might want to cache these as well
        named_tensor = self._nparray_to_named_tensor(
            agg_tensor_key,
            nparray,
            send_model_deltas=True,
            compress_lossless=compress_lossless)

        return named_tensor
Example #26
    def train(self, col_name, round_num, input_tensor_dict, epochs, **kwargs):
        """
        Perform the training for a specified number of batches.

        Expected to perform draws randomly, without replacement, until the data
        is exhausted. Then the data is replaced and shuffled and draws continue.

        Returns
        -------
        dict
            'TensorKey: nparray'
        """
        if 'metrics' not in kwargs:
            raise KeyError('metrics must be included in kwargs')
        # if 'batch_size' in kwargs:
        #     batch_size = kwargs['batch_size']
        # else:
        #     batch_size = self.data_loader.batch_size

        # rebuild model with updated weights
        self.rebuild_model(round_num, input_tensor_dict)

        history = self.model.fit(
            self.data_loader.X_train,
            self.data_loader.y_train,
            batch_size=self.data_loader.batch_size,
            epochs=epochs,
            verbose=0,
        )

        # TODO Currently assuming that all metrics are defined at
        #  initialization (build_model).
        #  If metrics are added (i.e. not a subset of what was originally
        #  defined) then the model must be recompiled.
        model_metrics_names = self.model.metrics_names
        param_metrics = kwargs['metrics']

        # TODO if there are new metrics in the flplan that were not included
        #  in the originally compiled model, that behavior is not currently
        #  handled.
        for param in param_metrics:
            if param not in model_metrics_names:
                error = 'KerasTaskRunner does not support specifying new' \
                        ' metrics. ' \
                        'Param_metrics = {}, model_metrics_names =' \
                        ' {}'.format(param_metrics, model_metrics_names)
                raise ValueError(error)

        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(metric, origin, round_num, True, ('metric', )):
            np.array(np.mean([history.history[metric]]))
            for metric in param_metrics
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters.
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled from the
        # aggregator
        # TODO this logic can break if different collaborators have different
        # roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator because
        # these are only created after training occurs. A work around could
        # involve doing a single epoch of training on random data to get the
        # optimizer names, and then throwing away the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # return global_tensor_dict, local_tensor_dict
        return global_tensor_dict, local_tensor_dict
    def decompress(self,
                   tensor_key,
                   data,
                   transformer_metadata,
                   require_lossless=False,
                   **kwargs):
        """
        Function-wrapper around the tensor_pipeline.backward function.

        It also keeps track of the tensorkeys associated with the decompressed nparray

        Args:
            tensor_key:             TensorKey is provided to verify it should
                                    be decompressed, and new TensorKeys
                                    returned will be derivatives of the
                                    existing tensor_name

            data:                   (compressed) numpy array associated with
                                    the tensor_key

            transformer_metadata:   metadata associated with the compressed
                                    tensor

            require_lossless:       boolean, does data require lossless
                                    decompression

        Returns:
            decompressed_tensor_key:    Tensorkey corresponding to the
                                        decompressed tensor

            decompressed_nparray:       The decompressed tensor

        """
        tensor_name, origin, round_number, report, tags = tensor_key

        assert len(transformer_metadata) > 0, (
            'metadata must be included for decompression')
        assert 'compressed' in tags or 'lossy_compressed' in tags, (
            'Cannot decompress an uncompressed tensor')
        if require_lossless:
            assert 'compressed' in tags, (
                'Cannot losslessly decompress lossy tensor')

        if require_lossless or 'compressed' in tags:
            decompressed_nparray = self.lossless_pipeline.backward(
                data, transformer_metadata, **kwargs)
        else:
            decompressed_nparray = self.compression_pipeline.backward(
                data, transformer_metadata, **kwargs)
        # Define the decompressed tensorkey that should be returned
        if 'lossy_compressed' in tags:
            lc_idx = tags.index('lossy_compressed')
            new_tags = list(tags)
            new_tags[lc_idx] = 'lossy_decompressed'
            decompressed_tensor_key = TensorKey(tensor_name, origin,
                                                round_number, report,
                                                tuple(new_tags))
        elif 'compressed' in tags:
            # 'compressed' == lossless compression; no need for
            # compression related tag after decompression
            new_tags = list(tags)
            new_tags.remove('compressed')
            decompressed_tensor_key = TensorKey(tensor_name, origin,
                                                round_number, report,
                                                tuple(new_tags))
        else:
            raise NotImplementedError(
                "Decompression is only supported on compressed data")

        return decompressed_tensor_key, decompressed_nparray
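Only the tags on the key change during decompression; pulled out into a standalone helper (hypothetical name, not part of the runner), the rule is easy to check:

def rewrite_tags_after_decompression(tags):
    """Mirror the tag handling of decompress() above: tags tuple in, tuple out."""
    if 'lossy_compressed' in tags:
        new_tags = list(tags)
        new_tags[new_tags.index('lossy_compressed')] = 'lossy_decompressed'
        return tuple(new_tags)
    if 'compressed' in tags:
        # lossless compression: the tag is simply dropped after decompression
        return tuple(t for t in tags if t != 'compressed')
    raise NotImplementedError('Decompression is only supported on compressed data')


assert rewrite_tags_after_decompression(('trained', 'compressed')) == ('trained',)
assert rewrite_tags_after_decompression(('lossy_compressed',)) == ('lossy_decompressed',)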
    def train_batches(self,
                      col_name,
                      round_num,
                      input_tensor_dict,
                      num_batches=None,
                      use_tqdm=True,
                      **kwargs):
        """Train batches.

        Train the model on the requested number of batches.

        Args:
            col_name:            Name of the collaborator
            round_num:           What round is it
            input_tensor_dict:   Required input tensors (for model)
            num_batches:         The number of batches to train on before returning
            use_tqdm (bool):     Use tqdm to print a progress bar (Default=True)

        Returns:
            global_output_dict:  Tensors to send back to the aggregator
            local_output_dict:   Tensors to maintain in the local TensorDB
        """
        self.rebuild_model(round_num, input_tensor_dict)
        # set to "training" mode
        self.train()

        losses = []

        loader = self.data_loader.get_train_loader(num_batches=num_batches)
        if use_tqdm:
            loader = tqdm.tqdm(loader, desc="train epoch")
        # shuffling occurs every time this loader is used as an iterator
        for data, target in loader:
            data, target = (torch.tensor(data).to(self.device),
                            torch.tensor(target).to(self.device))
            self.optimizer.zero_grad()
            output = self(data)
            loss = self.loss_fn(output, target)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.detach().cpu().numpy())

        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(self.loss_fn.__class__.__name__, origin, round_num, True, ('metric', )):
            np.array(np.mean(losses))
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled
        # from the aggregator
        # TODO this logic can break if different collaborators have different
        #  roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator
        # because these are only created after training occurs. A work
        # around could involve doing a single epoch of training
        # on random data to get the optimizer names, and then throwing away
        # the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        # this will signal that the optimizer values are now present, and can
        # be loaded when the model is rebuilt
        self.train_round_completed = True

        return global_tensor_dict, local_tensor_dict
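The train_round_completed flag set above only pays off when the model is rebuilt in a later round. A hypothetical sketch of how a rebuild step might consume it together with opt_treatment (illustration only, not the runner's actual rebuild_model; a set_tensor_dict counterpart to get_tensor_dict is assumed):

def rebuild_model_sketch(runner, input_tensor_dict, validation=False):
    # Load optimizer variables only when they can exist locally (at least one
    # training round has completed), the plan continues the global optimizer
    # state, and this is not a validation-only rebuild.
    with_opt = (runner.opt_treatment == 'CONTINUE_GLOBAL'
                and runner.train_round_completed
                and not validation)
    runner.set_tensor_dict(input_tensor_dict, with_opt_vars=with_opt)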
Beispiel #29
0
    def train(self,
              col_name,
              round_num,
              input_tensor_dict,
              metrics,
              num_batches=None,
              **kwargs):
        """
        Perform the training for a specified number of batches.

        Draws are expected to be made randomly and without replacement until the
        data is exhausted; the data is then replaced, shuffled, and draws continue.

        Returns
        -------
        dict
            'TensorKey: nparray'
        """
        if metrics is None:
            raise KeyError('metrics must be defined')

        # rebuild model with updated weights
        self.rebuild_model(round_num, input_tensor_dict)

        results = self.train_iteration(
            self.data_loader.get_train_loader(num_batches),
            metrics=metrics,
            **kwargs)

        # output metric tensors (scalar)
        origin = col_name
        tags = ('trained', )
        output_metric_dict = {
            TensorKey(metric_name, origin, round_num, True, ('metric', )):
            metric_value
            for (metric_name, metric_value) in results
        }

        # output model tensors (Doesn't include TensorKey)
        output_model_dict = self.get_tensor_dict(with_opt_vars=True)
        global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
            self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs)

        # create global tensorkeys
        global_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in global_model_dict.items()
        }
        # create tensorkeys that should stay local
        local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num, False, tags): nparray
            for tensor_name, nparray in local_model_dict.items()
        }
        # the train/validate aggregated function of the next round will look
        # for the updated model parameters.
        # this ensures they will be resolved locally
        next_local_tensorkey_model_dict = {
            TensorKey(tensor_name, origin, round_num + 1, False, ('model', )):
            nparray
            for tensor_name, nparray in local_model_dict.items()
        }

        global_tensor_dict = {
            **output_metric_dict,
            **global_tensorkey_model_dict
        }
        local_tensor_dict = {
            **local_tensorkey_model_dict,
            **next_local_tensorkey_model_dict
        }

        # update the required tensors if they need to be pulled from the
        # aggregator
        # TODO this logic can break if different collaborators have different
        # roles between rounds.
        # for example, if a collaborator only performs validation in the first
        # round but training in the second, it has no way of knowing the
        # optimizer state tensor names to request from the aggregator because
        # these are only created after training occurs. A work around could
        # involve doing a single epoch of training on random data to get the
        # optimizer names, and then throwing away the model.
        if self.opt_treatment == 'CONTINUE_GLOBAL':
            self.initialize_tensorkeys_for_functions(with_opt_vars=True)

        return global_tensor_dict, local_tensor_dict
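The TODO above describes a workaround for collaborators that validate before they ever train: run one throwaway training step on random data just to materialize the optimizer slot variables, record their names, then restore the saved weights. A hedged illustration, assuming the runner wraps a Keras model and exposes the get_tensor_dict/set_tensor_dict pair (the helper name and shapes are invented):

import numpy as np


def probe_optimizer_tensor_names(runner, input_shape, num_classes):
    """Return the optimizer tensor names without keeping the throwaway update."""
    saved_weights = runner.get_tensor_dict(with_opt_vars=False)    # snapshot
    x = np.random.rand(2, *input_shape).astype(np.float32)         # random batch
    y = np.random.randint(0, num_classes, size=(2,))
    runner.model.train_on_batch(x, y)                              # creates opt slots
    opt_names = [name for name in runner.get_tensor_dict(with_opt_vars=True)
                 if name not in saved_weights]
    runner.set_tensor_dict(saved_weights, with_opt_vars=False)     # discard update
    return opt_names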
Beispiel #30
0
def run_challenge_experiment(aggregation_function,
                             choose_training_collaborators,
                             training_hyper_parameters_for_round,
                             institution_split_csv_filename,
                             brats_training_data_parent_dir,
                             db_store_rounds=5,
                             rounds_to_train=5,
                             device='cpu',
                             save_checkpoints=True,
                             restore_from_checkpoint_folder=None,
                             include_validation_with_hausdorff=True,
                             use_pretrained_model=True):
    """Run a simulated FeTS federated-training experiment.

    Returns a tuple of (experiment results DataFrame, checkpoint folder name).
    """

    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    # create gandlf_csv and get collaborator names
    gandlf_csv_path = os.path.join(work, 'gandlf_paths.csv')
    # split_csv_path = os.path.join(work, institution_split_csv_filename)
    collaborator_names = construct_fedsim_csv(brats_training_data_parent_dir,
                                              institution_split_csv_filename,
                                              0.8, gandlf_csv_path)

    aggregation_wrapper = CustomAggregationWrapper(aggregation_function)

    overrides = {
        'aggregator.settings.rounds_to_train': rounds_to_train,
        'aggregator.settings.db_store_rounds': db_store_rounds,
        'tasks.train.aggregation_type': aggregation_wrapper,
        'task_runner.settings.device': device,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)

    if not include_validation_with_hausdorff:
        plan.config['task_runner']['settings']['fets_config_dict'][
            'metrics'] = ['dice', 'dice_per_label']

    # Overwrite collaborator names
    plan.authorized_cols = collaborator_names
    # overwrite datapath values with the collaborator name itself
    for col in collaborator_names:
        plan.cols_data_paths[col] = col

    # get the data loaders for each collaborator
    collaborator_data_loaders = {
        col: copy(plan).get_data_loader(col)
        for col in collaborator_names
    }

    transformed_csv_dict = extract_csv_partitions(
        os.path.join(work, 'gandlf_paths.csv'))
    # serialize each collaborator's train / val CSVs and build a task runner
    # (the task_runner constructed in the final loop iteration is the one used below)
    for col in collaborator_data_loaders:
        transformed_csv_dict[col]['train'].to_csv(
            os.path.join(work, 'seg_test_train.csv'))
        transformed_csv_dict[col]['val'].to_csv(
            os.path.join(work, 'seg_test_val.csv'))
        task_runner = copy(plan).get_task_runner(
            collaborator_data_loaders[col])

    if use_pretrained_model:
        print('Loading pretrained model...')
        # map to CPU when requested; otherwise keep torch.load's default placement
        map_location = torch.device('cpu') if device == 'cpu' else None
        checkpoint = torch.load(
            f'{root}/pretrained_model/resunet_pretrained.pth',
            map_location=map_location)
        task_runner.model.load_state_dict(checkpoint['model_state_dict'])
        task_runner.optimizer.load_state_dict(
            checkpoint['optimizer_state_dict'])

    tensor_pipe = plan.get_tensor_pipe()

    # Initialize model weights
    init_state_path = plan.config['aggregator']['settings']['init_state_path']
    tensor_dict, _ = split_tensor_dict_for_holdouts(
        logger, task_runner.get_tensor_dict(False))

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    # get the aggregator, now that we have the initial weights file set up
    logger.info('Creating aggregator...')
    aggregator = plan.get_aggregator()
    # manually override the aggregator UUID (for checkpoint resume when rounds change)
    aggregator.uuid = 'aggregator'
    aggregator._load_initial_tensors()

    # create our collaborators
    logger.info('Creating collaborators...')
    collaborators = {
        col: copy(plan).get_collaborator(col,
                                         task_runner=task_runner,
                                         client=aggregator)
        for col in collaborator_names
    }

    collaborator_time_stats = gen_collaborator_time_stats(plan.authorized_cols)

    collaborators_chosen_each_round = {}
    collaborator_times_per_round = {}

    logger.info('Starting experiment')

    total_simulated_time = 0
    best_dice = -1.0
    best_dice_over_time_auc = 0

    # results dataframe data
    experiment_results = {
        'round': [],
        'time': [],
        'convergence_score': [],
        'round_dice': [],
        'dice_label_0': [],
        'dice_label_1': [],
        'dice_label_2': [],
        'dice_label_4': [],
    }
    if include_validation_with_hausdorff:
        experiment_results.update({
            'hausdorff95_label_0': [],
            'hausdorff95_label_1': [],
            'hausdorff95_label_2': [],
            'hausdorff95_label_4': [],
        })

    if restore_from_checkpoint_folder is None:
        checkpoint_folder = setup_checkpoint_folder()
        logger.info(f'\nCreated experiment folder {checkpoint_folder}...')
        starting_round_num = 0
    else:
        if not Path(f'checkpoint/{restore_from_checkpoint_folder}').exists():
            logger.warning(
                f'Could not find provided checkpoint folder: {restore_from_checkpoint_folder}. Exiting...'
            )
            exit(1)
        else:
            logger.info(
                f'Attempting to load last completed round from {restore_from_checkpoint_folder}'
            )
            state = load_checkpoint(restore_from_checkpoint_folder)
            checkpoint_folder = restore_from_checkpoint_folder

            [
                loaded_collaborator_names, starting_round_num,
                collaborator_time_stats, total_simulated_time, best_dice,
                best_dice_over_time_auc, collaborators_chosen_each_round,
                collaborator_times_per_round, experiment_results, summary,
                agg_tensor_db
            ] = state

            if loaded_collaborator_names != collaborator_names:
                logger.error(
                    f'Collaborator names found in checkpoint ({loaded_collaborator_names}) '
                    f'do not match provided collaborators ({collaborator_names})'
                )
                exit(1)

            logger.info(f'Previous summary for round {starting_round_num}')
            logger.info(summary)

            starting_round_num += 1
            aggregator.tensor_db.tensor_db = agg_tensor_db
            aggregator.round_number = starting_round_num

    for round_num in range(starting_round_num, rounds_to_train):
        # pick collaborators to train for the round
        training_collaborators = choose_training_collaborators(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        logger.info('Collaborators chosen to train for round {}:\n\t{}'.format(
            round_num, training_collaborators))

        # save the collaborators chosen this round
        collaborators_chosen_each_round[round_num] = training_collaborators

        # get the hyper-parameters from the competitor
        hparams = training_hyper_parameters_for_round(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        learning_rate, epochs_per_round, batches_per_round = hparams

        if (epochs_per_round is None) == (batches_per_round is None):
            logger.error(
                'Hyper-parameter function error: function must return "None" for either "epochs_per_round" or "batches_per_round" but not both.'
            )
            return

        hparam_message = "\n\tlearning rate: {}".format(learning_rate)

        # None gets mapped to -1 in the tensor_db
        if epochs_per_round is None:
            epochs_per_round = -1
            hparam_message += "\n\tbatches_per_round: {}".format(
                batches_per_round)
        elif batches_per_round is None:
            batches_per_round = -1
            hparam_message += "\n\tepochs_per_round: {}".format(
                epochs_per_round)

        logger.info("Hyper-parameters for round {}:{}".format(
            round_num, hparam_message))

        # cache each tensor in the aggregator tensor_db
        hparam_dict = {}
        tk = TensorKey(tensor_name='learning_rate',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(learning_rate)
        tk = TensorKey(tensor_name='epochs_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(epochs_per_round)
        tk = TensorKey(tensor_name='batches_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(batches_per_round)
        aggregator.tensor_db.cache_tensor(hparam_dict)

        # pre-compute the times for each collaborator
        times_per_collaborator = compute_times_per_collaborator(
            collaborator_names, training_collaborators, batches_per_round,
            epochs_per_round, collaborator_data_loaders,
            collaborator_time_stats, round_num)
        collaborator_times_per_round[round_num] = times_per_collaborator

        aggregator.assigner.set_training_collaborators(training_collaborators)

        # update the state in the aggregation wrapper
        aggregation_wrapper.set_state_data_for_round(
            collaborators_chosen_each_round, collaborator_times_per_round)

        # turn the times list into a list of tuples and sort it
        times_list = [(t, col) for col, t in times_per_collaborator.items()]
        times_list = sorted(times_list)

        # now call each collaborator in order of time
        # FIXME: this doesn't break up each task. We need this if we're doing straggler handling
        for t, col in times_list:
            # set the task_runner data loader
            task_runner.data_loader = collaborator_data_loaders[col]

            # run the collaborator
            collaborators[col].run_simulation()

            logger.info(
                "Collaborator {} took simulated time: {} minutes".format(
                    col, round(t / 60, 2)))

        # the round time is the max of the times_list
        round_time = max([t for t, _ in times_list])
        total_simulated_time += round_time

        # get the performance validation scores for the round
        round_dice = get_metric('valid_dice', round_num, aggregator.tensor_db)
        dice_label_0 = get_metric('valid_dice_per_label_0', round_num,
                                  aggregator.tensor_db)
        dice_label_1 = get_metric('valid_dice_per_label_1', round_num,
                                  aggregator.tensor_db)
        dice_label_2 = get_metric('valid_dice_per_label_2', round_num,
                                  aggregator.tensor_db)
        dice_label_4 = get_metric('valid_dice_per_label_4', round_num,
                                  aggregator.tensor_db)
        if include_validation_with_hausdorff:
            hausdorff95_label_0 = get_metric('valid_hd95_per_label_0',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_1 = get_metric('valid_hd95_per_label_1',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_2 = get_metric('valid_hd95_per_label_2',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_4 = get_metric('valid_hd95_per_label_4',
                                             round_num, aggregator.tensor_db)

        # update best score
        if best_dice < round_dice:
            best_dice = round_dice
            # Set the weights for the final model
            if round_num == 0:
                # here the initial model was validated (temp model does not exist)
                logger.info(
                    f'Skipping best model saving to disk as it is a random initialization.'
                )
            elif not os.path.exists(
                    f'checkpoint/{checkpoint_folder}/temp_model.pkl'):
                raise ValueError(
                    f'Expected temporary model at: checkpoint/{checkpoint_folder}/temp_model.pkl to exist but it was not found.'
                )
            else:
                # here the temp model was the one validated
                shutil.copyfile(
                    src=f'checkpoint/{checkpoint_folder}/temp_model.pkl',
                    dst=f'checkpoint/{checkpoint_folder}/best_model.pkl')
                logger.info(
                    f'Saved model with best average binary DICE: {best_dice} to ~/.local/workspace/checkpoint/{checkpoint_folder}/best_model.pkl'
                )

        ## CONVERGENCE METRIC COMPUTATION
        # update the auc score
        best_dice_over_time_auc += best_dice * round_time

        # project the auc score as remaining time * best dice
        # this projection assumes that the current best score is carried forward for the entire week
        projected_auc = (MAX_SIMULATION_TIME - total_simulated_time
                         ) * best_dice + best_dice_over_time_auc
        projected_auc /= MAX_SIMULATION_TIME

        # End of round summary
        summary = '**** END OF ROUND {} SUMMARY ****'.format(round_num)
        summary += "\n\tSimulation Time: {} minutes".format(
            round(total_simulated_time / 60, 2))
        summary += "\n\t(Projected) Convergence Score: {}".format(
            projected_auc)
        summary += "\n\tDICE Label 0: {}".format(dice_label_0)
        summary += "\n\tDICE Label 1: {}".format(dice_label_1)
        summary += "\n\tDICE Label 2: {}".format(dice_label_2)
        summary += "\n\tDICE Label 4: {}".format(dice_label_4)
        if include_validation_with_hausdorff:
            summary += "\n\tHausdorff95 Label 0: {}".format(
                hausdorff95_label_0)
            summary += "\n\tHausdorff95 Label 1: {}".format(
                hausdorff95_label_1)
            summary += "\n\tHausdorff95 Label 2: {}".format(
                hausdorff95_label_2)
            summary += "\n\tHausdorff95 Label 4: {}".format(
                hausdorff95_label_4)

        experiment_results['round'].append(round_num)
        experiment_results['time'].append(total_simulated_time)
        experiment_results['convergence_score'].append(projected_auc)
        experiment_results['round_dice'].append(round_dice)
        experiment_results['dice_label_0'].append(dice_label_0)
        experiment_results['dice_label_1'].append(dice_label_1)
        experiment_results['dice_label_2'].append(dice_label_2)
        experiment_results['dice_label_4'].append(dice_label_4)
        if include_validation_with_hausdorff:
            experiment_results['hausdorff95_label_0'].append(
                hausdorff95_label_0)
            experiment_results['hausdorff95_label_1'].append(
                hausdorff95_label_1)
            experiment_results['hausdorff95_label_2'].append(
                hausdorff95_label_2)
            experiment_results['hausdorff95_label_4'].append(
                hausdorff95_label_4)
        logger.info(summary)

        if save_checkpoints:
            logger.info(f'Saving checkpoint for round {round_num}')
            logger.info(
                f'To resume from this checkpoint, set the restore_from_checkpoint_folder parameter to \'{checkpoint_folder}\''
            )
            save_checkpoint(checkpoint_folder, aggregator, collaborator_names,
                            collaborators, round_num, collaborator_time_stats,
                            total_simulated_time, best_dice,
                            best_dice_over_time_auc,
                            collaborators_chosen_each_round,
                            collaborator_times_per_round, experiment_results,
                            summary)

        # if the total_simulated_time has exceeded the maximum time, we break
        # in practice, this means that the previous round's model is the last model scored,
        # so a long final round should not actually benefit the competitor, since that final
        # model is never globally validated
        if total_simulated_time > MAX_SIMULATION_TIME:
            logger.info("Simulation time exceeded. Ending Experiment")
            break

        # save the most recent aggregated model in native format to be copied over as best when appropriate
        # (note this model has not been validated by the collaborators yet)
        task_runner.rebuild_model(round_num,
                                  aggregator.last_tensor_dict,
                                  validation=True)
        task_runner.save_native(
            f'checkpoint/{checkpoint_folder}/temp_model.pkl')

    return pd.DataFrame.from_dict(experiment_results), checkpoint_folder
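A hedged usage sketch of the driver above. The import path and the aggregation function's exact signature are assumptions (the wrapper only forwards the callable), while the two hook functions follow the call signatures visible in the round loop; file names and paths are placeholders:

from pathlib import Path

import numpy as np

# Assumed import location of the function defined above.
from fets_challenge import run_challenge_experiment


def choose_all_collaborators(collaborator_names, db_iterator, fl_round,
                             collaborators_chosen_each_round,
                             collaborator_times_per_round):
    # Train every collaborator every round (matches the 5-argument call above).
    return collaborator_names


def constant_hyper_parameters(collaborator_names, db_iterator, fl_round,
                              collaborators_chosen_each_round,
                              collaborator_times_per_round):
    # Return (learning_rate, epochs_per_round, batches_per_round); exactly one
    # of the last two must be None, as enforced by the driver above.
    return 5e-5, 1, None


def weighted_average_aggregation(local_tensors, db_iterator, tensor_name, fl_round,
                                 collaborators_chosen_each_round,
                                 collaborator_times_per_round):
    # Assumed contract: each local tensor carries .tensor and .weight attributes.
    return np.average([t.tensor for t in local_tensors],
                      weights=[t.weight for t in local_tensors], axis=0)


if __name__ == '__main__':
    results_df, checkpoint_folder = run_challenge_experiment(
        aggregation_function=weighted_average_aggregation,
        choose_training_collaborators=choose_all_collaborators,
        training_hyper_parameters_for_round=constant_hyper_parameters,
        institution_split_csv_filename='small_split.csv',    # placeholder file
        brats_training_data_parent_dir=str(
            Path.home() / 'MICCAI_FeTS2022_TrainingData'),    # placeholder path
        rounds_to_train=2,
        device='cpu',
        include_validation_with_hausdorff=False,
        use_pretrained_model=False)
    print(results_df.tail())
    print('Checkpoints written under:', checkpoint_folder)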