def initialize_tensorkeys_for_functions(self, with_opt_vars=False):
    """
    Declare the tensors required by the 'train' and 'validate' functions.

    The tensors returned by ``self.get_tensor_dict`` are divided into a
    global and a local group by ``split_tensor_dict_for_holdouts`` and the
    resulting ``TensorKey`` lists are stored in
    ``self.required_tensorkeys_for_function``.
    Custom tensors should be added to this function

    Parameters
    ----------
    with_opt_vars : bool
        Forwarded to ``self.get_tensor_dict`` when collecting the tensors
        for 'train'. The tensors for 'validate' are always taken from
        ``get_tensor_dict(with_opt_vars=False)``.

    Returns
    -------
    None
    """
    # TODO there should be a way to programmatically iterate through all
    # of the methods in the class and declare the tensors.
    # For now this is done manually

    train_tensors = self.get_tensor_dict(with_opt_vars=with_opt_vars)
    global_model_dict, local_model_dict = split_tensor_dict_for_holdouts(
        self.logger, train_tensors, **self.tensor_dict_split_fn_kwargs)

    if with_opt_vars:
        # Validation keys are built from a second tensor dict requested
        # with with_opt_vars=False, so it is split again.
        validation_tensors = self.get_tensor_dict(with_opt_vars=False)
        validation_global_model_dict, validation_local_model_dict = \
            split_tensor_dict_for_holdouts(
                self.logger,
                validation_tensors,
                **self.tensor_dict_split_fn_kwargs
            )
    else:
        # The train split was already produced without optimizer variables.
        validation_global_model_dict = global_model_dict
        validation_local_model_dict = local_model_dict

    train_global_keys = [
        TensorKey(name, 'GLOBAL', 0, False, ('model', ))
        for name in global_model_dict
    ]
    train_local_keys = [
        TensorKey(name, 'LOCAL', 0, False, ('model', ))
        for name in local_model_dict
    ]
    self.required_tensorkeys_for_function['train'] = (
        train_global_keys + train_local_keys
    )

    # Validation may be performed on local or aggregated (global) model,
    # so there is an extra lookup dimension for kwargs
    # TODO This is not stateless. The optimizer will not be
    all_validation_names = {
        **validation_global_model_dict,
        **validation_local_model_dict,
    }
    validate_local_keys = [
        TensorKey(name, 'LOCAL', 0, False, ('trained',))
        for name in all_validation_names
    ]
    validate_global_keys = [
        TensorKey(name, 'GLOBAL', 0, False, ('model',))
        for name in validation_global_model_dict
    ]
    validate_global_keys += [
        TensorKey(name, 'LOCAL', 0, False, ('model',))
        for name in validation_local_model_dict
    ]
    self.required_tensorkeys_for_function['validate'] = {
        'apply=local': validate_local_keys,
        'apply=global': validate_global_keys,
    }
def validate(self, col_name, round_num, input_tensor_dict, use_tqdm=False, **kwargs):
    """Validate.

    Rebuild the model from the input tensors, run ``validate_network`` on
    the local validation dataloader and report the loss and metrics.

    Args:
        col_name:            Name of the collaborator
        round_num:           What round is it
        input_tensor_dict:   Required input tensors (for model)
        use_tqdm (bool):     Not read by this implementation (Default=False)
        kwargs:              Key word arguments; ``kwargs['apply']`` must be
                             present and selects the metric tag
                             ('validate_local' when it equals 'local',
                             'validate_agg' otherwise)

    Returns:
        global_output_dict:  Tensors to send back to the aggregator
        local_output_dict:   Tensors to maintain in the local TensorDB
                             (always empty here)
    """
    self.rebuild_model(round_num, input_tensor_dict, validation=True)
    self.model.eval()

    epoch_valid_loss, epoch_valid_metric = validate_network(
        self.model,
        self.data_loader.val_dataloader,
        self.scheduler,
        self.params,
        round_num,
        mode="validation")

    self.logger.info(epoch_valid_loss)
    self.logger.info(epoch_valid_metric)

    if kwargs['apply'] == 'local':
        tags = ('metric', 'validate_local')
    else:
        tags = ('metric', 'validate_agg')

    def make_key(metric_name):
        # Every reported metric shares origin, round, report flag and tags.
        return TensorKey(metric_name, col_name, round_num, True, tags)

    results = {make_key('valid_loss'): np.array(epoch_valid_loss)}
    for metric_name, metric_value in epoch_valid_metric.items():
        if np.array(metric_value).size == 1:
            results[make_key(f'valid_{metric_name}')] = np.array(metric_value)
            continue
        # Multi-valued metric: one entry per position, named after the
        # fixed label list below.
        for position, label in enumerate((0, 1, 2, 4)):
            results[make_key(f'valid_{metric_name}_{label}')] = \
                np.array(metric_value[position])

    return results, {}
def validate(self, col_name, round_num, input_tensor_dict, use_tqdm=False, **kwargs):
    """Validate.

    Rebuild the model from the input tensors and measure the fraction of
    validation samples whose arg-max prediction matches the arg-max target.

    Args:
        col_name:            Name of the collaborator
        round_num:           What round is it
        input_tensor_dict:   Required input tensors (for model)
        use_tqdm (bool):     Use tqdm to print a progress bar (Default=False)
        kwargs:              ``kwargs['apply']`` must be present and selects
                             the metric tag ('validate_local' when it equals
                             'local', 'validate_agg' otherwise)

    Returns:
        global_output_dict:  Tensors to send back to the aggregator
        local_output_dict:   Tensors to maintain in the local TensorDB
    """
    self.rebuild_model(round_num, input_tensor_dict, validation=True)
    self.eval()
    self.to(self.device)

    batches = self.data_loader.get_valid_loader()
    if use_tqdm:
        batches = tqdm.tqdm(batches, desc="validate")

    correct = 0
    seen = 0
    with pt.no_grad():
        for data, target in batches:
            seen += target.shape[0]
            data = pt.tensor(data).to(self.device)
            target = pt.tensor(target).to(self.device, dtype=pt.int64)
            # Compare the arg-max class of the output with that of the target
            predicted = self(data).argmax(dim=1, keepdim=True)
            expected = target.argmax(dim=1, keepdim=True)
            correct += predicted.eq(expected).sum().cpu().numpy()

    scope = '_local' if kwargs['apply'] == 'local' else '_agg'
    tags = ('metric', 'validate' + scope)

    # TODO figure out a better way to pass in metric for this pytorch
    #  validate function
    accuracy_key = TensorKey('acc', col_name, round_num, True, tags)
    global_output = {accuracy_key: np.array(correct / seen)}

    # The empty dict represents metrics that should only be stored locally
    return global_output, {}
def _load_initial_tensors(self):
    """
    Load all of the tensors required to begin federated learning.

    The initial model held in ``self.model`` is deconstructed and every
    tensor is cached in the TensorDB under a 'model' tag. If the saved
    model carries a later round number than ``self.round_number``, the
    round number is advanced to match.

    Returns:
        None
    """
    tensor_dict, saved_round = utils.deconstruct_model_proto(
        self.model, compression_pipeline=self.compression_pipeline)

    if saved_round > self.round_number:
        self.logger.info(
            f'Starting training from round {saved_round} of previously saved model')
        self.round_number = saved_round

    # all initial model tensors are loaded here
    initial_tensors = {}
    for name, nparray in tensor_dict.items():
        key = TensorKey(name, self.uuid, self.round_number, False, ('model', ))
        initial_tensors[key] = nparray
    self.tensor_db.cache_tensor(initial_tensors)

    self.logger.debug(f'This is the initial tensor_db: {self.tensor_db}')
def _save_model(self, round_number, file_path):
    """
    Save the best or latest model.

    The layer names are taken from the current ``self.model`` proto; the
    values for ``round_number`` are read from the TensorDB. If any layer is
    missing from the TensorDB nothing is written.

    Args:
        round_number: int
            Model round to be saved
        file_path: str
            Either the best model or latest model file path

    Returns:
        None
    """
    # Only the tensor names of the existing proto are needed here
    layer_names, _ = utils.deconstruct_model_proto(
        self.model, compression_pipeline=self.compression_pipeline)

    tensor_dict = {}
    for name in layer_names:
        lookup_key = TensorKey(name, self.uuid, round_number, False, ('model', ))
        cached = self.tensor_db.get_tensor_from_cache(lookup_key)
        tensor_dict[name] = cached
        if cached is None:
            self.logger.info(
                f'Cannot save model for round {round_number}. Continuing...')
            return

    # Both checks are independent: the two paths are compared separately.
    if file_path == self.best_state_path:
        self.best_tensor_dict = tensor_dict
    if file_path == self.last_state_path:
        self.last_tensor_dict = tensor_dict

    self.model = utils.construct_model_proto(
        tensor_dict, round_number, self.compression_pipeline)
    utils.dump_proto(self.model, file_path)
def apply_delta(tensor_key, delta, base_model_nparray, creates_model=False):
    """
    Add delta to the nparray.

    Args:
        tensor_key:             This is the tensor_key associated with the
                                delta. Should have a tag of 'trained' or
                                'aggregated'
        delta:                  Weight delta between the new model and
                                old model
        base_model_nparray:     The nparray that corresponds to the prior
                                weights
        creates_model:          If flag is set, the tensorkey returned
                                will correspond to the aggregator model

    Returns:
        new_model_tensor_key:   Latest model layer tensorkey
        new_model_nparray:      Latest layer weights
    """
    tensor_name, origin, round_number, report, tags = tensor_key

    # Shapes are only compared for non-scalar base values
    if not np.isscalar(base_model_nparray):
        assert (delta.shape == base_model_nparray.shape), (
            f'Shape of delta ({delta.shape}) is not equal to shape of model'
            f' layer ({base_model_nparray.shape})')

    # Aggregator UUID has the prefix 'aggregator'
    if 'aggregator' not in origin or creates_model:
        result_tags = ('model', )
    else:
        # Keep the existing tags, minus the first 'delta' entry
        remaining = list(tags)
        remaining.remove('delta')
        result_tags = tuple(remaining)

    new_model_tensor_key = TensorKey(
        tensor_name, origin, round_number, report, result_tags)
    return new_model_tensor_key, base_model_nparray + delta
def nparray_to_named_tensor(self, tensor_key, nparray):
    """
    Construct the NamedTensor Protobuf.

    A tensor tagged 'trained' is sent as a compressed delta against the
    cached 'model' tensor when ``self.delta_updates`` is set and that base
    tensor is available; every other tensor is compressed losslessly.
    """
    tensor_name, origin, round_number, report, tags = tensor_key

    if 'trained' in tags and self.delta_updates:
        # Should get the pretrained model to create the delta. If training
        # has happened, the model should already be stored in the TensorDB
        base_key = TensorKey(
            tensor_name, origin, round_number, report, ('model',))
        base_nparray = self.tensor_db.get_tensor_from_cache(base_key)

        # The original model will not be present for the optimizer on the
        # first round.
        if base_nparray is not None:
            delta_key, delta = self.tensor_codec.generate_delta(
                tensor_key, nparray, base_nparray)
            packed_key, packed, metadata = self.tensor_codec.compress(
                delta_key, delta)
            return utils.construct_named_tensor(
                packed_key, packed, metadata, lossless=False)

    # Assume every other tensor requires lossless compression
    packed_key, packed, metadata = self.tensor_codec.compress(
        tensor_key, nparray, require_lossless=True)
    return utils.construct_named_tensor(
        packed_key, packed, metadata, lossless=True)
def validate(self, col_name, round_num, input_tensor_dict, use_tqdm=True, **kwargs):
    """Run validation of the model on the local data.

    The reported 'dice_coef' is the sum of ``soft_dice_coef`` over all
    validation batches divided by the number of samples seen.

    Args:
        col_name:            Name of the collaborator
        round_num:           What round is it
        input_tensor_dict:   Required input tensors (for model)
        use_tqdm:            Use tqdm to print a progress bar (Default=True)
        kwargs:              ``kwargs['apply']`` must be present and selects
                             the metric tag ('validate_local' when it equals
                             'local', 'validate_agg' otherwise)

    Returns:
        global_output_dict:  Tensors to send back to the aggregator
        local_output_dict:   Tensors to maintain in the local TensorDB
    """
    self.rebuild_model(round_num, input_tensor_dict, validation=True)
    self.eval()
    self.to(self.device)

    batches = self.data_loader.get_valid_loader()
    if use_tqdm:
        batches = tqdm.tqdm(batches, desc="validate")

    dice_total = 0
    seen = 0
    with torch.no_grad():
        for data, target in batches:
            seen += target.shape[0]
            data = torch.tensor(data).to(self.device)
            target = torch.tensor(target).to(self.device)
            batch_dice = soft_dice_coef(self(data), target)
            dice_total += batch_dice.sum().cpu().numpy()

    scope = '_local' if kwargs['apply'] == 'local' else '_agg'
    tags = ('metric', 'validate' + scope)

    # TODO figure out a better way to pass in metric for this pytorch
    #  validate function
    dice_key = TensorKey('dice_coef', col_name, round_num, True, tags)
    return {dice_key: np.array(dice_total / seen)}, {}
def validate(self, col_name, round_num, input_tensor_dict, **kwargs):
    """
    Run the trained model on validation data; report results.

    Parameters
    ----------
    col_name : name of the collaborator; used as the origin of the results
    round_num : round the results are reported for
    input_tensor_dict : either the last aggregated or locally trained model
    kwargs : 'metrics' (names to report) and 'apply' are required;
        'batch_size' is optional and defaults to 1

    Returns
    -------
    output_tensor_dict : {TensorKey: nparray} (these correspond to acc,
     precision, f1_score, etc.)
    local_output_dict : always an empty dict

    Raises
    ------
    ValueError
        If a requested metric is not among ``self.model.metrics_names``.
    """
    batch_size = kwargs.get('batch_size', 1)

    self.rebuild_model(round_num, input_tensor_dict, validation=True)
    param_metrics = kwargs['metrics']

    vals = self.model.evaluate(
        self.data_loader.X_valid,
        self.data_loader.y_valid,
        batch_size=batch_size,
        verbose=0
    )
    model_metrics_names = self.model.metrics_names
    # A non-list result is wrapped so it can be zipped with the names
    if type(vals) is not list:
        vals = [vals]
    ret_dict = dict(zip(model_metrics_names, vals))

    # TODO if there are new metrics in the flplan that were not included in
    #  the originally compiled model, that behavior is not currently
    #  handled.
    if any(param not in model_metrics_names for param in param_metrics):
        raise ValueError(
            'KerasTaskRunner does not support specifying new metrics. '
            f'Param_metrics = {param_metrics}, '
            f'model_metrics_names = {model_metrics_names}')

    scope = '_local' if kwargs['apply'] == 'local' else '_agg'
    tags = ('metric', 'validate' + scope)

    output_tensor_dict = {}
    for metric in param_metrics:
        metric_key = TensorKey(metric, col_name, round_num, True, tags)
        output_tensor_dict[metric_key] = np.array(ret_dict[metric])

    return output_tensor_dict, {}
def find_dependencies(self, tensor_key, send_model_deltas):
    """Resolve the tensors required to do the specified operation.

    Dependencies exist only for a tensor tagged 'model' when
    ``send_model_deltas`` is set and the round number is at least 1. In
    that case the result is ``[previous round model key, delta key]``;
    otherwise it is an empty list.
    """
    tensor_name, origin, round_number, report, tags = tensor_key

    wants_delta = 'model' in tags and send_model_deltas
    if not wants_delta or round_number < 1:
        return []

    # The new model can be generated by previous model + delta
    previous_model_key = TensorKey(
        tensor_name, origin, round_number - 1, report, tags)

    if self.compression_pipeline.is_lossy():
        compression_tag = 'lossy_compressed'
    else:
        compression_tag = 'compressed'
    delta_key = TensorKey(
        tensor_name, origin, round_number, report,
        ('aggregated', 'delta', compression_tag))

    return [previous_model_key, delta_key]
def update_tensorkeys_for_functions(self):
    """
    Update the required tensors for the 'train' and 'validate' functions.

    The tensor names are the model weight names followed by the optimizer
    weight names (both obtained through ``self._get_weights_names``). The
    resulting ``TensorKey`` lists are stored in
    ``self.required_tensorkeys_for_function``.
    Custom tensors should be added to this function

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    # TODO complete this function. It is only needed for opt_treatment,
    # and making the model stateless

    # Minimal required tensors for train function
    model_layer_names = self._get_weights_names(self.model)
    opt_names = self._get_weights_names(self.model.optimizer)
    tensor_names = model_layer_names + opt_names
    self.logger.debug(
        'Updating model tensor names: {}'.format(tensor_names))

    # A TensorKey has five fields: (tensor_name, origin, round_number,
    # report, tags). The `report` field used to be omitted here, leaving
    # the key one argument short; these are model tensors rather than
    # reported metrics, so report=False.
    self.required_tensorkeys_for_function['train'] = [
        TensorKey(tensor_name, 'GLOBAL', 0, False, ('model', ))
        for tensor_name in tensor_names
    ]

    # Validation may be performed on local or aggregated (global) model,
    # so there is an extra lookup dimension for kwargs
    # NOTE(review): these lookup keys use the 'local_model=...' form while
    # the validate functions read kwargs['apply']; confirm which form the
    # required-tensorkey lookup expects.
    self.required_tensorkeys_for_function['validate'] = {
        'local_model=True': [
            TensorKey(tensor_name, 'LOCAL', 0, False, ('trained',))
            for tensor_name in tensor_names
        ],
        'local_model=False': [
            TensorKey(tensor_name, 'GLOBAL', 0, False, ('model',))
            for tensor_name in tensor_names
        ],
    }
def construct_model_proto(tensor_dict, round_number, tensor_pipe):
    """Compress every array in tensor_dict and wrap them in a ModelProto.

    Each array is passed through ``tensor_pipe.forward`` and stored as a
    named tensor keyed with origin 'agg', the given round number and the
    'model' tag.
    """
    # TODO: Hold-out tensors from the tensor compression pipeline.

    def to_named_tensor(name, nparray):
        # Compress one array and attach its TensorKey and metadata
        payload, transformer_metadata = tensor_pipe.forward(data=nparray)
        model_key = TensorKey(name, 'agg', round_number, False, ('model', ))
        return construct_named_tensor(
            model_key, payload, transformer_metadata, lossless=True)

    named_tensors = [
        to_named_tensor(name, nparray)
        for name, nparray in tensor_dict.items()
    ]
    return ModelProto(tensors=named_tensors)
def do_task(self, task, round_number):
    """Do the specified task.

    Looks up the task's function and kwargs in ``self.task_config``,
    gathers the tensors that function requires, runs it on the task runner,
    caches both output dicts in the TensorDB and sends the global outputs
    through ``self.send_task_results``.
    """
    # map this task to an actual function name and kwargs
    task_entry = self.task_config[task]
    func_name = task_entry['function']
    kwargs = task_entry['kwargs']

    # models return "relative" tensorkeys of (name, LOCAL|GLOBAL,
    # round_offset, report, tags)
    relative_keys = self.task_runner.get_required_tensorkeys_for_function(
        func_name, **kwargs)

    # Convert the relative keys to their "absolute values": 'GLOBAL' maps
    # to the aggregator, anything else to this collaborator, and the round
    # offset is added to the current round (so -1 means the previous round)
    required_tensorkeys = [
        TensorKey(
            tname,
            self.aggregator_uuid if origin == 'GLOBAL' else self.collaborator_name,
            rnd_num + round_number,
            report,
            tags,
        )
        for tname, origin, rnd_num, report, tags in relative_keys
    ]

    input_tensor_dict = self.get_numpy_dict_for_tensorkeys(required_tensorkeys)

    # now we have whatever the model needs to do the task
    task_function = getattr(self.task_runner, func_name)
    global_outputs, local_outputs = task_function(
        col_name=self.collaborator_name,
        round_num=round_number,
        input_tensor_dict=input_tensor_dict,
        **kwargs)

    # Save global and local output_tensor_dicts to TensorDB
    for outputs in (global_outputs, local_outputs):
        self.tensor_db.cache_tensor(outputs)

    # send the results for this tasks; delta and compression will occur in
    # this function
    self.send_task_results(global_outputs, round_number, task)
def named_tensor_to_nparray(self, named_tensor):
    """Convert named tensor to a numpy array.

    The tensor is decompressed according to its tags ('compressed' or
    'lossy_compressed'), cached in the TensorDB under the resulting key
    and returned. A tensor carrying neither tag is cached as-is.
    """
    # do the stuff we do now for decompression and frombuffer and stuff
    # This should probably be moved back to protoutils
    raw_bytes = named_tensor.data_bytes
    metadata = []
    for proto in named_tensor.transformer_metadata:
        metadata.append({
            'int_to_float': proto.int_to_float,
            'int_list': proto.int_list,
            'bool_list': proto.bool_list,
        })

    # The tensor has already been transfered to collaborator, so
    # the newly constructed tensor should have the collaborator origin
    tensor_key = TensorKey(
        named_tensor.name,
        self.collaborator_name,
        named_tensor.round_number,
        named_tensor.report,
        tuple(named_tensor.tags)
    )
    *_, tags = tensor_key

    # Pick the extra decompress arguments from the compression tag;
    # None means no compression tag was found.
    if 'compressed' in tags:
        decompress_options = {'require_lossless': True}
    elif 'lossy_compressed' in tags:
        decompress_options = {}
    else:
        decompress_options = None

    if decompress_options is None:
        # There could be a case where the compression pipeline is bypassed
        # entirely
        self.logger.warning('Bypassing tensor codec...')
        decompressed_tensor_key = tensor_key
        decompressed_nparray = raw_bytes
    else:
        decompressed_tensor_key, decompressed_nparray = \
            self.tensor_codec.decompress(
                tensor_key,
                data=raw_bytes,
                transformer_metadata=metadata,
                **decompress_options
            )

    self.tensor_db.cache_tensor(
        {decompressed_tensor_key: decompressed_nparray})

    return decompressed_nparray
def _load_initial_tensors_from_dict(self, tensor_dict):
    """
    Load all of the tensors required to begin federated learning.

    Every entry of ``tensor_dict`` is cached in the TensorDB under a
    'model' tag for the current ``self.round_number``.

    Args:
        tensor_dict: mapping of tensor name to the initial model tensor

    Returns:
        None
    """
    # all initial model tensors are loaded here
    initial_tensors = {}
    for name, nparray in tensor_dict.items():
        key = TensorKey(name, self.uuid, self.round_number, False, ('model', ))
        initial_tensors[key] = nparray
    self.tensor_db.cache_tensor(initial_tensors)

    self.logger.debug(f'This is the initial tensor_db: {self.tensor_db}')
def _nparray_to_named_tensor(self, tensor_key, nparray, send_model_deltas,
                             compress_lossless):
    """
    Construct the NamedTensor Protobuf.

    A tensor tagged 'aggregated' is sent as a compressed delta against the
    previous round's 'model' tensor when ``send_model_deltas`` is set;
    every other tensor is compressed with ``require_lossless=True``.
    """
    tensor_name, origin, round_number, report, tags = tensor_key

    if not ('aggregated' in tags and send_model_deltas):
        # Assume every other tensor requires lossless compression
        # NOTE(review): the proto is still flagged with
        # lossless=compress_lossless on this path; confirm that is intended.
        packed_key, packed, metadata = self.tensor_codec.compress(
            tensor_key, nparray, require_lossless=True)
        return utils.construct_named_tensor(
            packed_key, packed, metadata, lossless=compress_lossless)

    # Should get the pretrained model to create the delta. If training
    # has happened, the model should already be stored in the TensorDB
    previous_model_key = TensorKey(
        tensor_name, origin, round_number - 1, report, ('model', ))
    previous_model = self.tensor_db.get_tensor_from_cache(previous_model_key)

    assert (previous_model is not None), (
        "The original model layer should be present if the latest "
        "aggregated model is present")

    delta_key, delta = self.tensor_codec.generate_delta(
        tensor_key, nparray, previous_model)
    # NOTE(review): `lossless` is forwarded as an extra keyword argument to
    # compress rather than as `require_lossless`; confirm that is intended.
    packed_key, packed, metadata = self.tensor_codec.compress(
        delta_key, delta, lossless=compress_lossless)
    return utils.construct_named_tensor(
        packed_key, packed, metadata, lossless=compress_lossless)
def compress(self, tensor_key, data, require_lossless=False, **kwargs):
    """
    Function-wrapper around the tensor_pipeline.forward function.

    It also keeps track of the tensorkeys associated with the compressed
    nparray

    Args:
        tensor_key:             TensorKey of the data; the returned key is
                                this key with a compression tag appended
        data:                   (uncompressed) numpy array associated with
                                the tensor_key
        require_lossless:       boolean. When set, the lossless pipeline is
                                used instead of the compression pipeline
        kwargs:                 forwarded to the pipeline's ``forward``

    Returns:
        compressed_tensor_key:  Tensorkey corresponding to the compressed
                                tensor
        compressed_nparray:     The compressed tensor
        metadata:               metadata associated with compressed tensor
    """
    pipeline = (
        self.lossless_pipeline if require_lossless
        else self.compression_pipeline
    )
    compressed_nparray, metadata = pipeline.forward(data, **kwargs)

    # Define the compressed tensorkey that should be
    # returned ('trained.delta'->'trained.delta.lossy_compressed')
    tensor_name, origin, round_number, report, tags = tensor_key
    if self.compression_pipeline.is_lossy() and not require_lossless:
        compression_tag = 'lossy_compressed'
    else:
        compression_tag = 'compressed'

    compressed_tensor_key = TensorKey(
        tensor_name, origin, round_number, report,
        (*tags, compression_tag))
    return compressed_tensor_key, compressed_nparray, metadata
def validate(self, col_name, round_num, input_tensor_dict,
             use_tqdm=False, **kwargs):
    """
    Run validation.

    The model is rebuilt from ``input_tensor_dict`` and every validation
    batch is scored with ``self.validate_batch``. Each batch score is
    weighted by the batch's share of the validation data and the weighted
    sum is reported under ``self.validation_metric_name``.

    Args:
        col_name:            Name of the collaborator; used as the origin
                             of the reported metric
        round_num:           What round is it
        input_tensor_dict:   Required input tensors (for model)
        use_tqdm (bool):     Use tqdm to print a progress bar
                             (Default=False)
        kwargs:              ``kwargs['apply']`` is required and selects
                             the metric tag ('validate_local' when it
                             equals 'local', 'validate_agg' otherwise).
                             ``kwargs['batch_size']`` is optional.

    Returns:
        global_output_dict:  {TensorKey: nparray} holding the metric
        local_output_dict:   always an empty dict
    """
    # 'batch_size' is an optional override: fall back to the data loader's
    # batch size when it is absent (this used to raise KeyError) or falsy.
    batch_size = self.data_loader.batch_size
    if kwargs.get('batch_size'):
        batch_size = kwargs['batch_size']

    self.rebuild_model(round_num, input_tensor_dict, validation=True)
    tf.keras.backend.set_learning_phase(False)

    score = 0

    gen = self.data_loader.get_valid_loader(batch_size)
    if use_tqdm:
        gen = tqdm.tqdm(gen, desc="validating")

    for X, y in gen:
        # Weight each batch by its share of the validation data
        weight = X.shape[0] / self.data_loader.get_valid_data_size()
        _, s = self.validate_batch(X, y)
        score += s * weight

    origin = col_name
    suffix = 'validate'
    if kwargs['apply'] == 'local':
        suffix += '_local'
    else:
        suffix += '_agg'
    tags = ('metric', suffix)
    output_tensor_dict = {
        TensorKey(
            self.validation_metric_name, origin, round_num, True, tags
        ): np.array(score)
    }

    # return empty dict for local metrics
    return output_tensor_dict, {}
def generate_delta(tensor_key, nparray, base_model_nparray):
    """
    Create delta from the updated layer and base layer.

    Args:
        tensor_key:         This is the tensor_key associated with the
                            nparray.
                            Should have a tag of 'trained' or 'aggregated'
        nparray:            The nparray that corresponds to the tensorkey
        base_model_nparray: The base model tensor that will be subtracted
                            from the new weights

    Returns:
        delta_tensor_key:   Tensorkey that corresponds to the delta weight
                            array (the input key with 'delta' appended to
                            its tags)
        delta:              Difference between the provided tensors

    Raises:
        AssertionError: if the shapes of non-scalar inputs differ, or if
            the tensor_key carries the 'model' tag.
    """
    tensor_name, origin, round_number, report, tags = tensor_key

    # Shapes are only compared for non-scalar values
    if not np.isscalar(nparray):
        assert nparray.shape == base_model_nparray.shape, (
            'Shape of updated layer ({}) is not equal to base '
            'layer shape of ({})'.format(
                nparray.shape, base_model_nparray.shape))

    assert 'model' not in tags, (
        'The tensorkey should be provided '
        'from the layer with new weights, not the base model')

    if isinstance(tags, str):
        # A bare string is treated as a single tag. This used to read
        # tensor_key[3], which is the `report` field rather than the tags.
        new_tags = (tags, 'delta')
    else:
        new_tags = tuple(tags) + ('delta',)

    delta_tensor_key = TensorKey(
        tensor_name, origin, round_number, report, new_tags)
    return delta_tensor_key, nparray - base_model_nparray
def validate(self, col_name, round_num, input_tensor_dict, **kwargs):
    """
    Run the trained model on validation data; report results.

    Parameters
    ----------
    col_name : name of the collaborator; used as the origin of the results
    round_num : round the results are reported for
    input_tensor_dict : either the last aggregated or locally trained model
    kwargs : 'metrics' (names to report) and 'apply' are required

    Returns
    -------
    output_tensor_dict : {TensorKey: nparray} (these correspond to acc,
     precision, f1_score, etc.)
    local_output_dict : always an empty dict
    """
    self.rebuild_model(round_num, input_tensor_dict, validation=True)
    param_metrics = kwargs['metrics']

    results = self.estimator.test('experiment')
    test_history = results.history['test']
    # Report the last recorded value of every requested metric
    latest_values = {}
    for metric in param_metrics:
        recorded = list(test_history[metric].values())
        latest_values[metric] = recorded[-1]

    scope = '_local' if kwargs['apply'] == 'local' else '_agg'
    tags = ('metric', 'validate' + scope)

    output_tensor_dict = {}
    for metric in param_metrics:
        metric_key = TensorKey(metric, col_name, round_num, True, tags)
        output_tensor_dict[metric_key] = np.array(latest_values[metric])

    return output_tensor_dict, {}
def get_data_for_tensorkey(self, tensor_key):
    """
    Resolve the tensor corresponding to the requested tensorkey.

    Lookup order: the local TensorDB for the exact key; for tensors whose
    origin is this collaborator, the local TensorDB for earlier rounds;
    then the aggregator, either as a delta applied to a locally cached
    prior model layer or as the full tensor.

    Args
    ----
    tensor_key:         Tensorkey that will be resolved locally or
                        remotely. May be the product of other tensors

    Returns
    -------
    The resolved numpy array, or None when the tensor is not cached
    locally, has no dependencies and is not tagged 'model'.
    """
    tensor_name, origin, round_number, report, tags = tensor_key

    # try to get from the store
    self.logger.debug(
        'Attempting to retrieve tensor {} from local store'.format(
            tensor_key))
    cached = self.tensor_db.get_tensor_from_cache(tensor_key)
    if cached is not None:
        self.logger.debug(
            'Found tensor {} in local TensorDB'.format(tensor_key))
        return cached

    # Not cached: for our own tensors, walk back through earlier rounds
    if origin == self.collaborator_name:
        self.logger.info(
            'Attempting to find locally stored {} tensor from prior'
            ' round...'.format(tensor_name))
        for prior_round in range(round_number - 1, -1, -1):
            earlier = self.tensor_db.get_tensor_from_cache(
                TensorKey(tensor_name, origin, prior_round, report, tags))
            if earlier is not None:
                self.logger.debug(
                    'Found tensor {} in local TensorDB for round'
                    ' {}'.format(tensor_name, prior_round))
                return earlier
        self.logger.info(
            'Cannot find any prior version of tensor {}'
            ' locally...'.format(tensor_name))

    self.logger.debug('Unable to get tensor from local store...'
                      'attempting to retrieve from client')

    # Determine whether there are additional compression related
    # dependencies.
    # Typically, dependencies are only relevant to model layers
    tensor_dependencies = self.tensor_codec.find_dependencies(
        tensor_key, self.delta_updates)

    if len(tensor_dependencies) > 0:
        # tensor_dependencies[0] corresponds to the prior version
        # of the model.
        # If it exists locally, should pull the remote delta because
        # this is the least costly path
        prior_model_layer = self.tensor_db.get_tensor_from_cache(
            tensor_dependencies[0])
        if prior_model_layer is None:
            # The original model tensor should be fetched from client
            return self.get_aggregated_tensor_from_aggregator(tensor_key)

        delta_key = tensor_dependencies[1]
        uncompressed_delta = self.get_aggregated_tensor_from_aggregator(
            delta_key)
        _, rebuilt = self.tensor_codec.apply_delta(
            delta_key, uncompressed_delta, prior_model_layer)
        self.logger.debug(
            'Applied delta to tensor {}'.format(tensor_dependencies[0][0]))
        return rebuilt

    if 'model' in tags:
        # Pulling the model for the first time
        return self.get_aggregated_tensor_from_aggregator(
            tensor_key, require_lossless=True)

    return None
def _compute_validation_related_task_metrics(self, task_name):
    """
    Compute all validation related metrics.

    For every tensor the first collaborator of the task reported, the
    aggregated tensor is looked up in the TensorDB. Reported tensors are
    logged; a reported 'validate_agg' tensor that beats
    ``self.best_model_score`` triggers saving the best model; tensors
    tagged 'trained' are handed to ``self._prepare_trained``.

    Args:
        task_name : str
            The task name to compute
    """
    self.logger.info('{} task metrics...'.format(task_name))
    # By default, print out all of the metrics that the validation
    # task sent
    # This handles getting the subset of collaborators that may be
    # part of the validation task
    collaborators_for_task = self.assigner.get_collaborators_for_task(
        task_name, self.round_number)
    # The collaborator data sizes for that task
    collaborator_weights_unnormalized = {
        c: self.collaborator_task_weight[
            TaskResultKey(task_name, c, self.round_number)]
        for c in collaborators_for_task
    }
    weight_total = sum(collaborator_weights_unnormalized.values())
    collaborator_weight_dict = {
        k: v / weight_total
        for k, v in collaborator_weights_unnormalized.items()
    }

    # The validation task should have just a couple tensors (i.e.
    # metrics) associated with it. Because each collaborator should
    # have sent the same tensor list, we can use the first
    # collaborator in our subset, and apply the correct
    # transformations to the tensorkey to resolve the aggregated
    # tensor for that round
    agg_functions = self.assigner.get_aggregation_type_for_task(task_name)
    task_key = TaskResultKey(
        task_name, collaborators_for_task[0], self.round_number)
    for tensor_key in self.collaborator_tasks_results[task_key]:
        tensor_name, origin, round_number, report, tags = tensor_key
        assert (tags[-1] == collaborators_for_task[0]), \
            'Tensor {} in task {} has not been processed' \
            ' correctly'.format(tensor_key, task_name)
        # Strip the collaborator label, and lookup aggregated tensor
        new_tags = tuple(tags[:-1])
        agg_tensor_key = TensorKey(
            tensor_name, origin, round_number, report, new_tags)
        agg_tensor_name, *_ = agg_tensor_key
        agg_results, agg_metadata_dict = \
            self.tensor_db.get_aggregated_tensor(
                agg_tensor_key, collaborator_weight_dict, agg_functions)

        if report:
            # Print the aggregated metric
            if agg_results is None:
                self.logger.warning(
                    'Aggregated metric {} could not be collected for round {}. '
                    'Skipping reporting for this round'.format(
                        agg_tensor_name, self.round_number))
                # Actually skip: formatting None with '.4f' or comparing
                # it with the best score below would raise TypeError.
                continue
            if agg_functions is not None:
                self.logger.info('{0} {1}:\t{2:.4f}'.format(
                    agg_functions[0], agg_tensor_name, agg_results))
            else:
                self.logger.info('{0}:\t{1:.4f}'.format(
                    agg_tensor_name, agg_results))
            for met in agg_metadata_dict:
                self.logger.info('{0} {1}:\t{2:.4f}'.format(
                    met, agg_tensor_name, agg_metadata_dict[met]))
            # TODO Add all of the logic for saving the model based
            #  on best accuracy, lowest loss, etc.
            if 'validate_agg' in tags:
                # Compare the accuracy of the model, and
                # potentially save it
                if (self.best_model_score is None
                        or self.best_model_score < agg_results):
                    self.logger.info(
                        'Saved the best model with score {:f}'.format(
                            agg_results))
                    self.best_model_score = agg_results
                    self._save_model(round_number, self.best_state_path)

        if 'trained' in tags:
            self._prepare_trained(
                tensor_name, origin, round_number, report, agg_results)
def _prepare_trained(self, tensor_name, origin, round_number, report,
                     agg_results):
    """
    Prepare aggregated tensorkey tags.

    The aggregated result is cached under an 'aggregated' tag for the next
    round, turned into a delta against the current round's 'model' tensor
    (when that tensor is cached), passed through compress/decompress, and
    finally cached under a 'model' tag.

    Args:
        tensor_name : str
        origin:
        round_number: int
        report: bool
        agg_results: np.array
    """
    codec = self.tensor_codec
    tensor_db = self.tensor_db

    # The aggregated tensorkey tags should have the form of
    # 'trained' or 'trained.lossy_decompressed'
    # They need to be relabeled to 'aggregated' and
    # reinserted. Then delta performed, compressed, etc.
    # then reinserted to TensorDB with 'model' tag

    # First insert the aggregated model layer with the
    # correct tensorkey
    aggregated_key = TensorKey(
        tensor_name, origin, round_number + 1, report, ('aggregated', ))
    tensor_db.cache_tensor({aggregated_key: agg_results})

    # Create delta and save it in TensorDB
    base_key = TensorKey(
        tensor_name, origin, round_number, report, ('model', ))
    base_layer = tensor_db.get_tensor_from_cache(base_key)
    has_base = base_layer is not None

    if has_base:
        delta_key, delta = codec.generate_delta(
            aggregated_key, agg_results, base_layer)
        tensor_db.cache_tensor({delta_key: delta})
    else:
        # This condition is possible for base model
        # optimizer states (i.e. Adam/iter:0, SGD, etc.)
        # These values couldn't be present for the base
        # model because no training occurs on the aggregator
        delta_key, delta = aggregated_key, agg_results

    # Compress lossless/lossy
    packed_key, packed, metadata = codec.compress(delta_key, delta)

    # TODO extend the TensorDB so that compressed data is
    #  supported. Once that is in place
    # the compressed delta can just be stored here instead
    # of recreating it for every request

    # Decompress lossless/lossy
    unpacked_key, unpacked = codec.decompress(packed_key, packed, metadata)

    # Apply delta (unless delta couldn't be created)
    if has_base:
        rebuilt_key, rebuilt = codec.apply_delta(
            unpacked_key, unpacked, base_layer)
    else:
        rebuilt_key, rebuilt = unpacked_key, unpacked

    # Now that the model has been compressed/decompressed
    # with delta operations,
    # Relabel the tags to 'model'
    name, key_origin, key_round, key_report, _ = rebuilt_key
    final_model_key = TensorKey(
        name, key_origin, key_round, key_report, ('model', ))

    # Finally, cache the updated model tensor
    tensor_db.cache_tensor({final_model_key: rebuilt})
def _process_named_tensor(self, named_tensor, collaborator_name):
    """
    Extract the named tensor fields.

    Performs decompression, delta application, and inserts the result
    into the TensorDB.

    Args:
        named_tensor: NamedTensor (protobuf)
            protobuf that will be extracted from and processed
        collaborator_name: str
            Collaborator name is needed for proper tagging of resulting
            tensorkeys

    Returns:
        tensor_key : TensorKey (named_tuple)
            The tensorkey extracted from the protobuf
        nparray : np.array
            The numpy array associated with the returned tensorkey

    Raises:
        AssertionError: if the tensor carries neither the 'compressed' nor
            the 'lossy_compressed' tag, or if no array could be produced.
        ValueError: if the tensor is a delta and its base model is not in
            the TensorDB.
    """
    raw_bytes = named_tensor.data_bytes
    metadata = [{
        'int_to_float': proto.int_to_float,
        'int_list': proto.int_list,
        'bool_list': proto.bool_list
    } for proto in named_tensor.transformer_metadata]

    # The tensor has already been transferred to the aggregator,
    # so the newly constructed tensor should have the aggregator origin
    tensor_key = TensorKey(named_tensor.name, self.uuid,
                           named_tensor.round_number, named_tensor.report,
                           tuple(named_tensor.tags))
    tensor_name, origin, round_number, report, tags = tensor_key

    # The guard must accept exactly the tags handled below. It previously
    # tested for 'lossy_decompressed', which rejected lossy-compressed
    # tensors and let already-decompressed ones through to a NameError.
    assert ('compressed' in tags or 'lossy_compressed' in tags), (
        'Named tensor {} is not compressed'.format(tensor_key))

    # Each entry is (tag, extra decompress kwargs). 'lossy_compressed' is
    # processed last so that, as before, its result wins if both tags are
    # present.
    decompress_plan = (
        ('compressed', {'require_lossless': True}),
        ('lossy_compressed', {}),
    )
    for tag, extra_kwargs in decompress_plan:
        if tag not in tags:
            continue
        dec_tk, decompressed_nparray = self.tensor_codec.decompress(
            tensor_key, data=raw_bytes, transformer_metadata=metadata,
            **extra_kwargs)
        dec_name, dec_origin, dec_round_num, dec_report, dec_tags = dec_tk
        # Need to add the collaborator tag to the resulting tensor. A bare
        # string is treated as a single tag rather than split into chars.
        if isinstance(dec_tags, str):
            new_tags = (dec_tags, collaborator_name)
        else:
            new_tags = tuple(dec_tags) + (collaborator_name, )
        # e.g. layer.agg.n.trained.delta[.lossy_decompressed].col_i
        decompressed_tensor_key = TensorKey(dec_name, dec_origin,
                                            dec_round_num, dec_report,
                                            new_tags)

    if 'delta' in tags:
        # A delta is only meaningful relative to the model of this round.
        base_model_tensor_key = TensorKey(tensor_name, origin, round_number,
                                          report, ('model', ))
        base_model_nparray = self.tensor_db.get_tensor_from_cache(
            base_model_tensor_key)
        if base_model_nparray is None:
            raise ValueError('Base model {} not present in'
                             ' TensorDB'.format(base_model_tensor_key))
        final_tensor_key, final_nparray = self.tensor_codec.apply_delta(
            decompressed_tensor_key, decompressed_nparray,
            base_model_nparray)
    else:
        final_tensor_key = decompressed_tensor_key
        final_nparray = decompressed_nparray

    assert (final_nparray is not None), (
        'Could not create tensorkey {}'.format(final_tensor_key))
    self.tensor_db.cache_tensor({final_tensor_key: final_nparray})
    self.logger.debug('Created TensorKey: {}'.format(final_tensor_key))

    return final_tensor_key, final_nparray
def get_aggregated_tensor(self, collaborator_name, tensor_name, round_number,
                          report, tags, require_lossless):
    """
    RPC called by collaborator.

    Performs local lookup to determine if there is an aggregated tensor
    available that matches the request.

    Args:
        collaborator_name : str
            Requested tensor key collaborator name
        tensor_name: str
        require_lossless: bool
        round_number: int
        report: bool
        tags: list[str]
            Not modified by this call.

    Returns:
        named_tensor : protobuf NamedTensor
            the tensor requested by the collaborator

    Raises:
        ValueError: if no matching tensor is cached in the TensorDB.
    """
    self.logger.debug(
        'Retrieving aggregated tensor {} for collaborator {}'.format(
            tensor_name, collaborator_name))

    # Decide this once, before the 'compressed' tag is stripped. It used to
    # be overwritten with False further down, which silently ignored both
    # the tag and ``require_lossless``.
    compress_lossless = bool('compressed' in tags or require_lossless)

    # TODO the TensorDB doesn't support compressed data yet.
    # The returned tensor will be recompressed anyway.
    # A filtered copy is built so the caller's ``tags`` is left untouched.
    lookup_tags = tuple(tag for tag in tags if tag != 'compressed')

    tensor_key = TensorKey(tensor_name, self.uuid, round_number, report,
                           lookup_tags)
    tensor_name, origin, round_number, report, tags = tensor_key

    if 'aggregated' in tags and 'delta' in tags and round_number != 0:
        # A delta of the aggregated model was requested: fetch the plain
        # aggregated tensor and let _nparray_to_named_tensor handle the
        # delta (send_model_deltas=True below).
        agg_tensor_key = TensorKey(tensor_name, origin, round_number, report,
                                   ('aggregated', ))
    else:
        agg_tensor_key = tensor_key

    # Look up the same key that is handed on with the array, so that key
    # and array always describe the same tensor.
    nparray = self.tensor_db.get_tensor_from_cache(agg_tensor_key)

    if nparray is None:
        raise ValueError("Aggregator does not have an aggregated tensor"
                         " for {}".format(tensor_key))

    # quite a bit happens in here, including compression, delta handling,
    # etc...
    # we might want to cache these as well
    named_tensor = self._nparray_to_named_tensor(
        agg_tensor_key,
        nparray,
        send_model_deltas=True,
        compress_lossless=compress_lossless)

    return named_tensor
def train(self, col_name, round_num, input_tensor_dict, epochs, **kwargs):
    """
    Train the Keras model for ``epochs`` epochs and package the results.

    The model is rebuilt from ``input_tensor_dict``, fitted on the data
    loader's training set, and the requested metrics plus the resulting
    model tensors are returned keyed by TensorKey.

    Parameters
    ----------
    col_name : collaborator name, used as the origin of every TensorKey
    round_num : current round number
    input_tensor_dict : tensors used to rebuild the model
    epochs : number of epochs passed to ``model.fit``
    **kwargs : must contain ``metrics``, the metric names to report

    Returns
    -------
    tuple of two dicts 'TensorKey: nparray'
        (tensors for the aggregator, tensors that stay local)

    Raises
    ------
    KeyError
        if ``metrics`` is not given in kwargs
    ValueError
        if a requested metric is not among ``model.metrics_names``
    """
    if 'metrics' not in kwargs:
        raise KeyError('metrics must be included in kwargs')
    requested_metrics = kwargs['metrics']

    # rebuild model with updated weights
    self.rebuild_model(round_num, input_tensor_dict)

    fit_result = self.model.fit(
        self.data_loader.X_train,
        self.data_loader.y_train,
        batch_size=self.data_loader.batch_size,
        epochs=epochs,
        verbose=0,
    )

    # TODO Currently assuming that all metrics are defined at
    # initialization (build_model). If metrics are added (i.e. not a subset
    # of what was originally defined) then the model must be recompiled;
    # that case is not handled and is rejected here.
    compiled_metrics = self.model.metrics_names
    if any(name not in compiled_metrics for name in requested_metrics):
        raise ValueError(
            'KerasTaskRunner does not support specifying new metrics. '
            'Param_metrics = {}, model_metrics_names = {}'.format(
                requested_metrics, compiled_metrics))

    def _keyed(tensors, key_round, key_tags):
        # Wrap plain tensor names into non-report TensorKeys of this
        # collaborator.
        return {
            TensorKey(name, col_name, key_round, False, key_tags): array
            for name, array in tensors.items()
        }

    # output metric tensors (scalar)
    metric_tensors = {}
    for metric in requested_metrics:
        metric_key = TensorKey(metric, col_name, round_num, True,
                               ('metric', ))
        metric_tensors[metric_key] = np.array(
            np.mean([fit_result.history[metric]]))

    # output model tensors (Doesn't include TensorKey)
    trained_tensors = self.get_tensor_dict(with_opt_vars=True)
    shared_tensors, held_out_tensors = split_tensor_dict_for_holdouts(
        self.logger, trained_tensors, **self.tensor_dict_split_fn_kwargs)

    trained_tags = ('trained', )
    shared_keyed = _keyed(shared_tensors, round_num, trained_tags)
    held_out_keyed = _keyed(held_out_tensors, round_num, trained_tags)
    # the train/validate aggregated function of the next round will look
    # for the updated model parameters.
    # this ensures they will be resolved locally
    held_out_next_round = _keyed(held_out_tensors, round_num + 1,
                                 ('model', ))

    global_tensor_dict = dict(metric_tensors)
    global_tensor_dict.update(shared_keyed)
    local_tensor_dict = dict(held_out_keyed)
    local_tensor_dict.update(held_out_next_round)

    # update the required tensors if they need to be pulled from the
    # aggregator
    # TODO this logic can break if different collaborators have different
    # roles between rounds.
    # for example, if a collaborator only performs validation in the first
    # round but training in the second, it has no way of knowing the
    # optimizer state tensor names to request from the aggregator because
    # these are only created after training occurs. A work around could
    # involve doing a single epoch of training on random data to get the
    # optimizer names, and then throwing away the model.
    if self.opt_treatment == 'CONTINUE_GLOBAL':
        self.initialize_tensorkeys_for_functions(with_opt_vars=True)

    return global_tensor_dict, local_tensor_dict
def decompress(self, tensor_key, data, transformer_metadata,
               require_lossless=False, **kwargs):
    """
    Function-wrapper around the tensor_pipeline.backward function.

    Besides running the backward pass of the matching pipeline, it derives
    the TensorKey that describes the decompressed array.

    Args:
        tensor_key: TensorKey of the compressed tensor; its tags decide
            which pipeline is used and how the returned key is tagged
        data: (compressed) numpy array associated with the tensor_key
        transformer_metadata: metadata associated with the compressed
            tensor; must not be empty
        require_lossless: boolean, does data require lossless
            decompression
        **kwargs: forwarded to the pipeline's ``backward``

    Returns:
        decompressed_tensor_key: TensorKey corresponding to the
            decompressed tensor ('lossy_compressed' becomes
            'lossy_decompressed'; 'compressed' is dropped)
        decompressed_nparray: The decompressed tensor
    """
    name, source, round_no, is_report, tags = tensor_key

    assert (len(transformer_metadata) > 0), (
        'metadata must be included for decompression')

    is_lossless = 'compressed' in tags
    is_lossy = 'lossy_compressed' in tags
    assert (is_lossless or is_lossy), (
        "Cannot decompress an uncompressed tensor")
    assert (not require_lossless or is_lossless), (
        "Cannot losslessly decompress lossy tensor")

    # A tensor tagged 'compressed' always goes through the lossless
    # pipeline, whether or not the caller asked for it explicitly.
    if require_lossless or is_lossless:
        pipeline = self.lossless_pipeline
    else:
        pipeline = self.compression_pipeline
    decompressed_nparray = pipeline.backward(data, transformer_metadata,
                                             **kwargs)

    # Define the decompressed tensorkey that should be returned
    updated_tags = list(tags)
    if is_lossy:
        # Swap the tag in place so the tag order is preserved.
        updated_tags[tags.index('lossy_compressed')] = 'lossy_decompressed'
    elif is_lossless:
        # 'compressed' == lossless compression; no need for a
        # compression related tag after decompression
        updated_tags.remove('compressed')
    else:
        raise NotImplementedError(
            "Decompression is only supported on compressed data")

    decompressed_tensor_key = TensorKey(name, source, round_no, is_report,
                                        tuple(updated_tags))
    return decompressed_tensor_key, decompressed_nparray
def train_batches(self, col_name, round_num, input_tensor_dict,
                  num_batches=None, use_tqdm=True, **kwargs):
    """Train batches.

    Rebuild the model from the input tensors, run one pass over the
    requested number of training batches, and package the mean loss and
    the resulting model tensors keyed by TensorKey.

    Args:
        col_name:            Name of the collaborator
        round_num:           What round is it
        input_tensor_dict:   Required input tensors (for model)
        num_batches:         The number of batches to train on before
                             returning
        use_tqdm (bool):     Use tqdm to print a progress bar (Default=True)

    Returns:
        global_output_dict:  Tensors to send back to the aggregator
        local_output_dict:   Tensors to maintain in the local TensorDB
    """
    self.rebuild_model(round_num, input_tensor_dict)
    # set to "training" mode
    self.train()

    batches = self.data_loader.get_train_loader(num_batches=num_batches)
    if use_tqdm:
        batches = tqdm.tqdm(batches, desc="train epoch")

    batch_losses = []
    # shuffling occurs every time this loader is used as an iterator
    for features, labels in batches:
        features = torch.tensor(features).to(self.device)
        labels = torch.tensor(labels).to(self.device)
        self.optimizer.zero_grad()
        prediction = self(features)
        batch_loss = self.loss_fn(prediction, labels)
        batch_loss.backward()
        self.optimizer.step()
        batch_losses.append(batch_loss.detach().cpu().numpy())

    def _keyed(tensors, key_round, key_tags):
        # Wrap plain tensor names into non-report TensorKeys of this
        # collaborator.
        return {
            TensorKey(name, col_name, key_round, False, key_tags): array
            for name, array in tensors.items()
        }

    # output metric tensors (scalar); the metric is named after the class
    # of the loss function
    loss_key = TensorKey(self.loss_fn.__class__.__name__, col_name,
                         round_num, True, ('metric', ))
    metric_tensors = {loss_key: np.array(np.mean(batch_losses))}

    # output model tensors (Doesn't include TensorKey)
    trained_tensors = self.get_tensor_dict(with_opt_vars=True)
    shared_tensors, held_out_tensors = split_tensor_dict_for_holdouts(
        self.logger, trained_tensors, **self.tensor_dict_split_fn_kwargs)

    trained_tags = ('trained', )
    shared_keyed = _keyed(shared_tensors, round_num, trained_tags)
    held_out_keyed = _keyed(held_out_tensors, round_num, trained_tags)
    # the train/validate aggregated function of the next round will look
    # for the updated model parameters
    # this ensures they will be resolved locally
    held_out_next_round = _keyed(held_out_tensors, round_num + 1,
                                 ('model', ))

    global_tensor_dict = dict(metric_tensors)
    global_tensor_dict.update(shared_keyed)
    local_tensor_dict = dict(held_out_keyed)
    local_tensor_dict.update(held_out_next_round)

    # update the required tensors if they need to be pulled
    # from the aggregator
    # TODO this logic can break if different collaborators have different
    # roles between rounds.
    # for example, if a collaborator only performs validation in the first
    # round but training in the second, it has no way of knowing the
    # optimizer state tensor names to request from the aggregator
    # because these are only created after training occurs. A work
    # around could involve doing a single epoch of training
    # on random data to get the optimizer names, and then throwing away
    # the model.
    if self.opt_treatment == 'CONTINUE_GLOBAL':
        self.initialize_tensorkeys_for_functions(with_opt_vars=True)

    # this will signal that the optimizer values are now present, and can
    # be loaded when the model is rebuilt
    self.train_round_completed = True

    return global_tensor_dict, local_tensor_dict
def train(self, col_name, round_num, input_tensor_dict, metrics,
          num_batches=None, **kwargs):
    """
    Train on the data loader's training batches and package the results.

    The model is rebuilt from ``input_tensor_dict``, ``train_iteration`` is
    run on the train loader, and the reported metrics plus the resulting
    model tensors are returned keyed by TensorKey.

    Parameters
    ----------
    col_name : collaborator name, used as the origin of every TensorKey
    round_num : current round number
    input_tensor_dict : tensors used to rebuild the model
    metrics : forwarded to ``train_iteration``; must not be None
    num_batches : passed to the data loader's ``get_train_loader``
    **kwargs : forwarded to ``train_iteration``

    Returns
    -------
    tuple of two dicts 'TensorKey: nparray'
        (tensors for the aggregator, tensors that stay local)

    Raises
    ------
    KeyError
        if ``metrics`` is None
    """
    if metrics is None:
        raise KeyError('metrics must be defined')

    # rebuild model with updated weights
    self.rebuild_model(round_num, input_tensor_dict)
    train_loader = self.data_loader.get_train_loader(num_batches)
    results = self.train_iteration(train_loader, metrics=metrics, **kwargs)

    def _keyed(tensors, key_round, key_tags):
        # Wrap plain tensor names into non-report TensorKeys of this
        # collaborator.
        return {
            TensorKey(name, col_name, key_round, False, key_tags): array
            for name, array in tensors.items()
        }

    # output metric tensors (scalar); ``results`` is consumed as
    # (name, value) pairs
    metric_tensors = {}
    for metric_name, metric_value in results:
        metric_key = TensorKey(metric_name, col_name, round_num, True,
                               ('metric', ))
        metric_tensors[metric_key] = metric_value

    # output model tensors (Doesn't include TensorKey)
    trained_tensors = self.get_tensor_dict(with_opt_vars=True)
    shared_tensors, held_out_tensors = split_tensor_dict_for_holdouts(
        self.logger, trained_tensors, **self.tensor_dict_split_fn_kwargs)

    trained_tags = ('trained', )
    shared_keyed = _keyed(shared_tensors, round_num, trained_tags)
    held_out_keyed = _keyed(held_out_tensors, round_num, trained_tags)
    # the train/validate aggregated function of the next round will look
    # for the updated model parameters.
    # this ensures they will be resolved locally
    held_out_next_round = _keyed(held_out_tensors, round_num + 1,
                                 ('model', ))

    global_tensor_dict = dict(metric_tensors)
    global_tensor_dict.update(shared_keyed)
    local_tensor_dict = dict(held_out_keyed)
    local_tensor_dict.update(held_out_next_round)

    # update the required tensors if they need to be pulled from the
    # aggregator
    # TODO this logic can break if different collaborators have different
    # roles between rounds.
    # for example, if a collaborator only performs validation in the first
    # round but training in the second, it has no way of knowing the
    # optimizer state tensor names to request from the aggregator because
    # these are only created after training occurs. A work around could
    # involve doing a single epoch of training on random data to get the
    # optimizer names, and then throwing away the model.
    if self.opt_treatment == 'CONTINUE_GLOBAL':
        self.initialize_tensorkeys_for_functions(with_opt_vars=True)

    return global_tensor_dict, local_tensor_dict
def run_challenge_experiment(aggregation_function,
                             choose_training_collaborators,
                             training_hyper_parameters_for_round,
                             institution_split_csv_filename,
                             brats_training_data_parent_dir,
                             db_store_rounds=5,
                             rounds_to_train=5,
                             device='cpu',
                             save_checkpoints=True,
                             restore_from_checkpoint_folder=None,
                             include_validation_with_hausdorff=True,
                             use_pretrained_model=True):
    """
    Run a simulated federated experiment and collect per-round results.

    A plan is created with the given overrides, one data loader per
    collaborator is built, a single task runner is shared by all
    collaborators, and the rounds are then executed sequentially in this
    process. After every round the validation metrics are read back from
    the aggregator's tensor_db, a projected convergence score is computed,
    and (optionally) a checkpoint is written.

    Args:
        aggregation_function: wrapped in ``CustomAggregationWrapper`` and
            installed as the plan's ``tasks.train.aggregation_type``.
        choose_training_collaborators: callable invoked every round with
            ``(collaborator_names, aggregator.tensor_db._iterate(),
            round_num, collaborators_chosen_each_round,
            collaborator_times_per_round)``; its result is used as the
            collaborators that train in that round.
        training_hyper_parameters_for_round: callable invoked every round
            with the same arguments; must return
            ``(learning_rate, epochs_per_round, batches_per_round)`` where
            exactly one of the last two is ``None``.
        institution_split_csv_filename: passed to ``construct_fedsim_csv``.
        brats_training_data_parent_dir: passed to ``construct_fedsim_csv``.
        db_store_rounds: plan override
            ``aggregator.settings.db_store_rounds``.
        rounds_to_train: plan override
            ``aggregator.settings.rounds_to_train``; also the upper bound
            of the round loop.
        device: plan override ``task_runner.settings.device``; ``'cpu'``
            additionally maps the pretrained checkpoint onto the CPU.
        save_checkpoints: when true, ``save_checkpoint`` is called after
            every round.
        restore_from_checkpoint_folder: name of a folder below
            ``checkpoint/`` to resume from; ``None`` starts a new
            experiment in a freshly created folder.
        include_validation_with_hausdorff: when false, the task runner's
            metrics are restricted to ``['dice', 'dice_per_label']`` and
            no Hausdorff columns are recorded.
        use_pretrained_model: when true, model and optimizer state are
            loaded from ``pretrained_model/resunet_pretrained.pth`` next
            to this file.

    Returns:
        ``(pandas.DataFrame, checkpoint_folder)`` with one row per
        completed round. NOTE(review): ``None`` is returned instead when
        the hyper-parameter callable violates the "exactly one None" rule.

    Raises:
        ValueError: if a new best model should be saved but
            ``temp_model.pkl`` is missing from the checkpoint folder.
        SystemExit: via ``exit(1)`` when the checkpoint folder to restore
            does not exist or its collaborator names differ from the
            current ones.
    """
    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    # make both the package directory and the working directory importable
    path.append(str(root))
    path.insert(0, str(work))

    # create gandlf_csv and get collaborator names
    gandlf_csv_path = os.path.join(work, 'gandlf_paths.csv')
    # split_csv_path = os.path.join(work, institution_split_csv_filename)
    # NOTE(review): 0.8 is presumably the train fraction of the
    # train/val split - confirm against construct_fedsim_csv
    collaborator_names = construct_fedsim_csv(brats_training_data_parent_dir,
                                              institution_split_csv_filename,
                                              0.8,
                                              gandlf_csv_path)

    aggregation_wrapper = CustomAggregationWrapper(aggregation_function)

    overrides = {
        'aggregator.settings.rounds_to_train': rounds_to_train,
        'aggregator.settings.db_store_rounds': db_store_rounds,
        'tasks.train.aggregation_type': aggregation_wrapper,
        'task_runner.settings.device': device,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)

    if not include_validation_with_hausdorff:
        plan.config['task_runner']['settings']['fets_config_dict'][
            'metrics'] = ['dice', 'dice_per_label']

    # Overwrite collaborator names
    plan.authorized_cols = collaborator_names
    # overwrite datapath values with the collaborator name itself
    for col in collaborator_names:
        plan.cols_data_paths[col] = col

    # get the data loaders for each collaborator
    collaborator_data_loaders = {
        col: copy(plan).get_data_loader(col)
        for col in collaborator_names
    }

    transformed_csv_dict = extract_csv_partitions(
        os.path.join(work, 'gandlf_paths.csv'))
    # get the task runner, passing the first data loader
    # (the same two CSV paths are rewritten and ``task_runner`` is rebound
    # on every iteration, so the values of the last collaborator remain)
    for col in collaborator_data_loaders:
        #Insert logic to serialize train / val CSVs here
        transformed_csv_dict[col]['train'].to_csv(
            os.path.join(work, 'seg_test_train.csv'))
        transformed_csv_dict[col]['val'].to_csv(
            os.path.join(work, 'seg_test_val.csv'))
        task_runner = copy(plan).get_task_runner(
            collaborator_data_loaders[col])

    if use_pretrained_model:
        print('Loading pretrained model...')
        if device == 'cpu':
            # map_location forces the stored tensors onto the CPU
            checkpoint = torch.load(
                f'{root}/pretrained_model/resunet_pretrained.pth',
                map_location=torch.device('cpu'))
            task_runner.model.load_state_dict(checkpoint['model_state_dict'])
            task_runner.optimizer.load_state_dict(
                checkpoint['optimizer_state_dict'])
        else:
            checkpoint = torch.load(
                f'{root}/pretrained_model/resunet_pretrained.pth')
            task_runner.model.load_state_dict(checkpoint['model_state_dict'])
            task_runner.optimizer.load_state_dict(
                checkpoint['optimizer_state_dict'])

    tensor_pipe = plan.get_tensor_pipe()

    # Initialize model weights
    init_state_path = plan.config['aggregator']['settings']['init_state_path']
    tensor_dict, _ = split_tensor_dict_for_holdouts(
        logger, task_runner.get_tensor_dict(False))

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    # get the aggregator, now that we have the initial weights file set up
    logger.info('Creating aggregator...')
    aggregator = plan.get_aggregator()
    # manually override the aggregator UUID (for checkpoint resume when rounds change)
    aggregator.uuid = 'aggregator'
    aggregator._load_initial_tensors()

    # create our collaborators
    # (all of them share the single task_runner and talk to the aggregator
    # object directly)
    logger.info('Creating collaborators...')
    collaborators = {
        col: copy(plan).get_collaborator(col,
                                         task_runner=task_runner,
                                         client=aggregator)
        for col in collaborator_names
    }

    collaborator_time_stats = gen_collaborator_time_stats(plan.authorized_cols)

    collaborators_chosen_each_round = {}
    collaborator_times_per_round = {}

    logger.info('Starting experiment')

    total_simulated_time = 0
    best_dice = -1.0
    best_dice_over_time_auc = 0

    # results dataframe data
    experiment_results = {
        'round': [],
        'time': [],
        'convergence_score': [],
        'round_dice': [],
        'dice_label_0': [],
        'dice_label_1': [],
        'dice_label_2': [],
        'dice_label_4': [],
    }
    if include_validation_with_hausdorff:
        experiment_results.update({
            'hausdorff95_label_0': [],
            'hausdorff95_label_1': [],
            'hausdorff95_label_2': [],
            'hausdorff95_label_4': [],
        })

    if restore_from_checkpoint_folder is None:
        checkpoint_folder = setup_checkpoint_folder()
        logger.info(f'\nCreated experiment folder {checkpoint_folder}...')
        starting_round_num = 0
    else:
        if not Path(f'checkpoint/{restore_from_checkpoint_folder}').exists():
            logger.warning(
                f'Could not find provided checkpoint folder: {restore_from_checkpoint_folder}. Exiting...'
            )
            exit(1)
        else:
            logger.info(
                f'Attempting to load last completed round from {restore_from_checkpoint_folder}'
            )
            state = load_checkpoint(restore_from_checkpoint_folder)
            checkpoint_folder = restore_from_checkpoint_folder

            # the restored values replace the freshly initialised
            # bookkeeping defined above
            [
                loaded_collaborator_names, starting_round_num,
                collaborator_time_stats, total_simulated_time, best_dice,
                best_dice_over_time_auc, collaborators_chosen_each_round,
                collaborator_times_per_round, experiment_results, summary,
                agg_tensor_db
            ] = state

            if loaded_collaborator_names != collaborator_names:
                logger.error(
                    f'Collaborator names found in checkpoint ({loaded_collaborator_names}) '
                    f'do not match provided collaborators ({collaborator_names})'
                )
                exit(1)

            logger.info(f'Previous summary for round {starting_round_num}')
            logger.info(summary)

            # the checkpoint holds the last completed round; resume with
            # the one after it
            starting_round_num += 1
            aggregator.tensor_db.tensor_db = agg_tensor_db
            aggregator.round_number = starting_round_num

    for round_num in range(starting_round_num, rounds_to_train):
        # pick collaborators to train for the round
        training_collaborators = choose_training_collaborators(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        logger.info('Collaborators chosen to train for round {}:\n\t{}'.format(
            round_num, training_collaborators))

        # save the collaborators chosen this round
        collaborators_chosen_each_round[round_num] = training_collaborators

        # get the hyper-parameters from the competitor
        hparams = training_hyper_parameters_for_round(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        learning_rate, epochs_per_round, batches_per_round = hparams

        # exactly one of the two must be None: equal "is None" flags mean
        # that both or neither were given
        if (epochs_per_round is None) == (batches_per_round is None):
            logger.error(
                'Hyper-parameter function error: function must return "None" for either "epochs_per_round" or "batches_per_round" but not both.'
            )
            return

        hparam_message = "\n\tlearning rate: {}".format(learning_rate)

        # None gets mapped to -1 in the tensor_db
        if epochs_per_round is None:
            epochs_per_round = -1
            hparam_message += "\n\tbatches_per_round: {}".format(
                batches_per_round)
        elif batches_per_round is None:
            batches_per_round = -1
            hparam_message += "\n\tepochs_per_round: {}".format(
                epochs_per_round)

        logger.info("Hyper-parameters for round {}:{}".format(
            round_num, hparam_message))

        # cache each tensor in the aggregator tensor_db
        hparam_dict = {}
        tk = TensorKey(tensor_name='learning_rate',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(learning_rate)
        tk = TensorKey(tensor_name='epochs_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(epochs_per_round)
        tk = TensorKey(tensor_name='batches_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(batches_per_round)
        aggregator.tensor_db.cache_tensor(hparam_dict)

        # pre-compute the times for each collaborator
        times_per_collaborator = compute_times_per_collaborator(
            collaborator_names, training_collaborators, batches_per_round,
            epochs_per_round, collaborator_data_loaders,
            collaborator_time_stats, round_num)
        collaborator_times_per_round[round_num] = times_per_collaborator

        aggregator.assigner.set_training_collaborators(training_collaborators)

        # update the state in the aggregation wrapper
        aggregation_wrapper.set_state_data_for_round(
            collaborators_chosen_each_round, collaborator_times_per_round)

        # turn the times list into a list of tuples and sort it
        times_list = [(t, col) for col, t in times_per_collaborator.items()]
        times_list = sorted(times_list)

        # now call each collaborator in order of time
        # FIXME: this doesn't break up each task. We need this if we're doing straggler handling
        for t, col in times_list:
            # set the task_runner data loader
            task_runner.data_loader = collaborator_data_loaders[col]

            # run the collaborator
            collaborators[col].run_simulation()

            logger.info(
                "Collaborator {} took simulated time: {} minutes".format(
                    col, round(t / 60, 2)))

        # the round time is the max of the times_list
        round_time = max([t for t, _ in times_list])
        total_simulated_time += round_time

        # get the performance validation scores for the round
        round_dice = get_metric('valid_dice', round_num, aggregator.tensor_db)
        dice_label_0 = get_metric('valid_dice_per_label_0', round_num,
                                  aggregator.tensor_db)
        dice_label_1 = get_metric('valid_dice_per_label_1', round_num,
                                  aggregator.tensor_db)
        dice_label_2 = get_metric('valid_dice_per_label_2', round_num,
                                  aggregator.tensor_db)
        dice_label_4 = get_metric('valid_dice_per_label_4', round_num,
                                  aggregator.tensor_db)
        if include_validation_with_hausdorff:
            hausdorff95_label_0 = get_metric('valid_hd95_per_label_0',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_1 = get_metric('valid_hd95_per_label_1',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_2 = get_metric('valid_hd95_per_label_2',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_4 = get_metric('valid_hd95_per_label_4',
                                             round_num, aggregator.tensor_db)

        # update best score
        if best_dice < round_dice:
            best_dice = round_dice
            # Set the weights for the final model
            if round_num == 0:
                # here the initial model was validated (temp model does not exist)
                logger.info(
                    f'Skipping best model saving to disk as it is a random initialization.'
                )
            elif not os.path.exists(
                    f'checkpoint/{checkpoint_folder}/temp_model.pkl'):
                raise ValueError(
                    f'Expected temporary model at: checkpoint/{checkpoint_folder}/temp_model.pkl to exist but it was not found.'
                )
            else:
                # here the temp model was the one validated
                # (temp_model.pkl is written at the end of the previous
                # round, see the bottom of this loop)
                shutil.copyfile(
                    src=f'checkpoint/{checkpoint_folder}/temp_model.pkl',
                    dst=f'checkpoint/{checkpoint_folder}/best_model.pkl')
                logger.info(
                    f'Saved model with best average binary DICE: {best_dice} to ~/.local/workspace/checkpoint/{checkpoint_folder}/best_model.pkl'
                )

        ## RUN VALIDATION ON INTERMEDIATE CONSENSUS MODEL
        # set the task_runner data loader
        # task_runner.data_loader = collaborator_data_loaders[col]

        ### DELETE THIS LINE ###
        # print(f'Collaborator {col} training data count = {task_runner.data_loader.get_train_data_size()}')

        # run the collaborator
        #collaborators[col].run_simulation()

        ## CONVERGENCE METRIC COMPUTATION
        # update the auc score
        best_dice_over_time_auc += best_dice * round_time

        # project the auc score as remaining time * best dice
        # this projection assumes that the current best score is carried forward for the entire week
        projected_auc = (MAX_SIMULATION_TIME - total_simulated_time
                         ) * best_dice + best_dice_over_time_auc
        projected_auc /= MAX_SIMULATION_TIME

        # End of round summary
        summary = '"**** END OF ROUND {} SUMMARY *****"'.format(round_num)
        summary += "\n\tSimulation Time: {} minutes".format(
            round(total_simulated_time / 60, 2))
        summary += "\n\t(Projected) Convergence Score: {}".format(
            projected_auc)
        summary += "\n\tDICE Label 0: {}".format(dice_label_0)
        summary += "\n\tDICE Label 1: {}".format(dice_label_1)
        summary += "\n\tDICE Label 2: {}".format(dice_label_2)
        summary += "\n\tDICE Label 4: {}".format(dice_label_4)
        if include_validation_with_hausdorff:
            summary += "\n\tHausdorff95 Label 0: {}".format(
                hausdorff95_label_0)
            summary += "\n\tHausdorff95 Label 1: {}".format(
                hausdorff95_label_1)
            summary += "\n\tHausdorff95 Label 2: {}".format(
                hausdorff95_label_2)
            summary += "\n\tHausdorff95 Label 4: {}".format(
                hausdorff95_label_4)

        # one entry per column for this round; these lists become the
        # returned DataFrame
        experiment_results['round'].append(round_num)
        experiment_results['time'].append(total_simulated_time)
        experiment_results['convergence_score'].append(projected_auc)
        experiment_results['round_dice'].append(round_dice)
        experiment_results['dice_label_0'].append(dice_label_0)
        experiment_results['dice_label_1'].append(dice_label_1)
        experiment_results['dice_label_2'].append(dice_label_2)
        experiment_results['dice_label_4'].append(dice_label_4)
        if include_validation_with_hausdorff:
            experiment_results['hausdorff95_label_0'].append(
                hausdorff95_label_0)
            experiment_results['hausdorff95_label_1'].append(
                hausdorff95_label_1)
            experiment_results['hausdorff95_label_2'].append(
                hausdorff95_label_2)
            experiment_results['hausdorff95_label_4'].append(
                hausdorff95_label_4)
        logger.info(summary)

        if save_checkpoints:
            logger.info(f'Saving checkpoint for round {round_num}')
            logger.info(
                f'To resume from this checkpoint, set the restore_from_checkpoint_folder parameter to \'{checkpoint_folder}\''
            )
            save_checkpoint(checkpoint_folder, aggregator, collaborator_names,
                            collaborators, round_num, collaborator_time_stats,
                            total_simulated_time, best_dice,
                            best_dice_over_time_auc,
                            collaborators_chosen_each_round,
                            collaborator_times_per_round, experiment_results,
                            summary)

        # if the total_simulated_time has exceeded the maximum time, we break
        # in practice, this means that the previous round's model is the last model scored,
        # so a long final round should not actually benefit the competitor, since that final
        # model is never globally validated
        if total_simulated_time > MAX_SIMULATION_TIME:
            logger.info("Simulation time exceeded. Ending Experiment")
            break

        # save the most recent aggregated model in native format to be copied over as best when appropriate
        # (note this model has not been validated by the collaborators yet)
        task_runner.rebuild_model(round_num,
                                  aggregator.last_tensor_dict,
                                  validation=True)
        task_runner.save_native(
            f'checkpoint/{checkpoint_folder}/temp_model.pkl')

    return pd.DataFrame.from_dict(experiment_results), checkpoint_folder