def _check_state(self, state):
    state_attributes = [
        'manager',
        'manager.epoch',
        'manager.batch_step',
        'manager.global_iter',
        'manager.logs',
        'manager.metric_windows',
        'callbacks',
        'trainer'
    ]

    for attr in state_attributes:
        v = get_value_at(attr, state, warn_on_failure=False)
        if v is None:
            if attr == "manager.metric_windows":
                v = get_value_at('manager.training_loss_window', state, warn_on_failure=False)
                if v is not None:
                    self._log.debug("Legacy state detected (training_loss_window) ... ")
                    continue

            self._log.error(f"Given state does not have a value for {attr}, state is invalid")
            return False

    return True
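# `get_value_at` is used throughout these methods to read dotted key paths
# (e.g. "manager.epoch") from nested dicts. Below is a minimal sketch of such a
# helper, assuming that behavior; the real implementation (including how
# `warn_on_failure` is reported) may differ.
import logging


def get_value_at(key_path, obj, warn_on_failure=True):
    value = obj
    for key in key_path.split('.'):
        if not isinstance(value, dict) or key not in value:
            if warn_on_failure:
                logging.getLogger(__name__).warning(f"No value found at {key_path}")
            return None
        value = value[key]

    return value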
def _calc_gradients(self, batch_data, training_settings=None):
    """
    :param batch_data: batch of training samples to evaluate the loss on
    :param training_settings: optional settings passed to the loss evaluation

    :return: loss, auxiliary_results, gradients

    :raises LossNotAvailableException
    """
    if not self.batch_chunk_size:
        with tf.GradientTape() as tape:
            results = self.evaluate_loss(batch_data,
                                         inference_mode=False,
                                         evaluate_settings=training_settings)

        if 'loss' not in results:
            raise LossNotAvailableException()

        if self.trainable_variables is None:
            # We have now evaluated the model, so the trainable variables should be available
            self._retrieve_trainable_variables()

        loss = results['loss']
        auxiliary_results = get_value_at('auxiliary_results', results, warn_on_failure=False)

        gradients = self._back_propagate_from(loss, tape)
    else:
        loss, auxiliary_results, gradients = self._calc_gradients_chunked(batch_data,
                                                                          training_settings)

    return loss, auxiliary_results, gradients
def _evaluate_loss(self, batch_data, evaluate_settings=None, inference_mode=None):
    use_teacher_forcing = get_value_at('use_teacher_forcing', evaluate_settings)
    if use_teacher_forcing is None:
        use_teacher_forcing = True

    padded_input_batch, input_lengths, output_batch, output_mask, max_output_len = batch_data

    padded_input_batch = padded_input_batch.to(device)
    input_lengths = input_lengths.to(device)
    output_batch = output_batch.to(device)
    output_mask = output_mask.to(device)

    batch_size = padded_input_batch.size(1)
    init_decoder_input = torch.tensor([[SOS_token for _ in range(batch_size)]],
                                      dtype=torch.long,
                                      device=device)

    results = self.training_model(padded_input_batch,
                                  input_lengths,
                                  init_decoder_input,
                                  max_output_len,
                                  output_batch,
                                  use_teacher_forcing)

    per_sample_loss = results["loss"]

    # Returns loss, loss_sum, num_samples
    return masked_loss(per_sample_loss, output_mask)
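# A minimal sketch of what the `masked_loss` helper used above could look like. The actual
# implementation is not shown here, so this is an assumption based on the
# "loss, loss_sum, num_samples" comment: the per-sample loss is averaged over the positions
# selected by the output mask, and the summed loss and number of masked samples are also
# returned.
def masked_loss(per_sample_loss, output_mask):
    # output_mask is assumed to be a boolean/0-1 tensor broadcastable to per_sample_loss
    mask = output_mask.float()

    num_samples = mask.sum()
    loss_sum = (per_sample_loss * mask).sum()
    loss = loss_sum / num_samples

    return loss, loss_sum, num_samples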
def _get_model_quality(self, current_logs):
    model_quality = get_value_at(self._metric_to_monitor,
                                 current_logs,
                                 warn_on_failure=self._warn_on_model_quality_not_available)

    if type(model_quality) is tuple:
        # Use the first value as the metric value; the other values are
        # auxiliary results meant for other purposes
        model_quality = model_quality[0]

    return model_quality
def _get_current_evaluate_settings(self, logs):
    # Use custom settings if available, else use the default settings
    evaluate_settings = get_value_at('evaluate_settings', logs, warn_on_failure=False)
    if evaluate_settings is None:
        evaluate_settings = self._evaluate_settings

    return evaluate_settings
def _get_model_quality(self, current_logs):
    if not self._metric_to_monitor:
        return None

    model_quality = get_value_at(self._metric_to_monitor, current_logs)

    if type(model_quality) is tuple:
        # Use the first value as the metric value; the other values are
        # auxiliary results meant for other purposes
        model_quality = model_quality[0]

    return model_quality
def _back_propagate_from(self, loss, tape, last_chunk=False):
    gradients = {}
    for optimizer_name in self.optimizers.keys():
        trainable_variables = get_value_at(optimizer_name,
                                           self.trainable_variables,
                                           warn_on_failure=False)

        gradients[optimizer_name] = tape.gradient(loss, trainable_variables)

    return gradients
def _update_model_parameters(self, gradients):
    for optimizer_name, optimizer in self.get_optimizers().items():
        trainable_variables = get_value_at(optimizer_name, self.trainable_variables)
        if trainable_variables is None:
            raise MLPugException("Unexpected state: trainable variables not found. "
                                 "Please file an issue.")

        optimizer.apply_gradients(zip(gradients[optimizer_name], trainable_variables))
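# A minimal sketch (not the library's actual training-step implementation) of how
# `_calc_gradients` and `_update_model_parameters` could be combined; the method name
# `_train_on` is a hypothetical example used for illustration only.
def _train_on(self, batch_data, training_settings=None):
    loss, auxiliary_results, gradients = self._calc_gradients(batch_data, training_settings)

    # `gradients` maps each optimizer name to the gradient list for that optimizer's
    # trainable variables, which is the structure `_update_model_parameters` expects
    self._update_model_parameters(gradients)

    return loss, auxiliary_results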
def _calc_whole_dataset_metrics(self, logs, log_path):
    current = self._get_logs_base(logs)
    metrics_log = get_value_at(log_path, current)

    evaluate_settings = self._get_current_evaluate_settings(logs)

    return self._metric_evaluator.calc_dataset_metrics_for(self._dataset,
                                                           metrics_log,
                                                           evaluate_settings=evaluate_settings,
                                                           dataset_name=self._dataset_name)
def _check_state(self, state):
    state_attributes = ['model_components', 'optimizers']

    for attr in state_attributes:
        v = get_value_at(attr, state, warn_on_failure=False)
        if v is None:
            self._log.error(f"Given state does not have a value for {attr}, state is invalid")
            return False

    return True
def on_batch_training_completed(self, dataset_batch, logs):
    if not self.instance_valid():
        self._log.error(f"{self} is not valid, skipping this hook ... ")
        return False

    if not self._batch_level:
        return True

    if not self._log_condition_func(logs=logs, dataset_batch=dataset_batch):
        return True

    self._init_logs(logs)

    if self._logging_mode is MetricsLoggingMode.WHOLE_DATASET_METRICS:
        return self._calc_whole_dataset_metrics(logs, f"{self._dataset_name}.dataset")
    else:
        current = self._get_logs_base(logs)

        batch_metrics = {}
        if not self._calc_batch_metric_data_from(dataset_batch, batch_metrics, logs):
            return False

        base_path = f"{self._dataset_name}.batch"
        dataset_batch_logs = get_value_at(base_path, current)

        # Merge in new batch level results
        dataset_batch_logs = {**dataset_batch_logs, **batch_metrics}

        if self._logging_mode is MetricsLoggingMode.BATCH_AND_WINDOW_AVERAGE_METRICS:
            set_value_at(base_path, current, dataset_batch_logs)

            metric_names = self._metric_evaluator.get_metric_names()
            metric_paths = get_key_paths(dataset_batch_logs,
                                         keys_to_consider=metric_names,
                                         keys_not_to_consider=["auxiliary_results"])

            self._update_metrics_windows_for(metric_paths,
                                             dataset_batch_logs,
                                             base_path=base_path)

            # Gather all window data
            batch_metrics_lists = {p: s.window for p, s in self._metric_windows.items()}

            if not self._reduce(batch_metrics_lists,
                                current[self._dataset_name]['window_average']):
                return False

    return True
def _calc_batch_metric_data_from(self, batch, batch_metrics, logs):
    evaluate_settings = self._get_current_evaluate_settings(logs)

    model_output = None
    if self._dataset is None:
        current = self._get_logs_base(logs)

        loss = get_value_at(f"{self._dataset_name}.batch.loss", current)
        auxiliary_results = get_value_at(f"{self._dataset_name}.batch.auxiliary_results",
                                         current,
                                         warn_on_failure=False)

        model_output = {
            'loss': loss,
            'auxiliary_results': auxiliary_results
        }

    return self._metric_evaluator.calc_batch_metrics_for(batch,
                                                         batch_metrics,
                                                         evaluate_settings=evaluate_settings,
                                                         model_output=model_output)
def _update_window(self, metric_path):
    try:
        window = self._metric_windows[metric_path]
        if window is None:
            return

        value = get_value_at(metric_path, self.logs["current"])
        window.slide(value)
    except Exception as e:
        _.log_exception(self._log,
                        f"Exception occurred updating sliding window {metric_path}, skipped ...",
                        e)
def _retrieve_trainable_variables(self):
    if len(self.optimizers) > 1:
        return

    # This only needs to be done once.
    # Further, this situation only occurs when there is only one optimizer.
    optimizer_name = next(iter(self.optimizers))

    trainable_variables = get_value_at(optimizer_name,
                                       self.trainable_variables,
                                       warn_on_failure=False)
    if trainable_variables is None:
        trainable_variables = self.training_model.trainable_variables
        self.trainable_variables = {optimizer_name: trainable_variables}
def _update_metrics_windows_for(self, metric_paths, batch_metrics, base_path):
    for metric_path in metric_paths:
        metric_value = get_value_at(metric_path, batch_metrics)

        full_metric_path = f"{base_path}.{metric_path}"

        sliding_window = self._metric_windows[metric_path] \
            if metric_path in self._metric_windows else None
        if sliding_window is None:
            self._log.debug(f"Creating sliding window for {full_metric_path}")

            sliding_window = self._sliding_window_factory(length=self._batch_averaging_window,
                                                          name=full_metric_path)
            self._metric_windows[metric_path] = sliding_window

        sliding_window.slide(metric_value)
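# A minimal sketch of a sliding window object compatible with how the windows are used above
# and in `_update_window` and `on_batch_training_completed` (a `slide` method and a `window`
# attribute, created with a `length` and a `name`). The real object returned by
# `_sliding_window_factory` may differ; this is an assumption for illustration only.
class SimpleSlidingWindow:

    def __init__(self, length, name=None):
        self.length = length
        self.name = name
        self.window = []

    def slide(self, value):
        # Append the newest value and drop the oldest one when the window is full
        self.window.append(value)
        if len(self.window) > self.length:
            self.window.pop(0)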
def _set_metric_windows_states(self, state):
    success = True
    try:
        metric_windows_state = get_value_at("manager.metric_windows", state)
        if metric_windows_state is None:
            return success

        for metric_path, window_state in metric_windows_state.items():
            success &= self._set_metric_window_state(metric_path, window_state)
    except Exception as e:
        _.log_exception(self._log, "Unable to set metric windows state, skipped ...", e)
        success = False

    return success
def _update_logs(self, logs):
    current = self._get_logs_base(logs)
    schedule_level = self._get_schedule_level()

    ctp = current['training_params'][schedule_level]
    try:
        current_lr = self._get_current_lr()

        lr = get_value_at('lr', ctp, warn_on_failure=False) or {}
        ctp['lr'] = {**lr, **current_lr}

        return True
    except Exception as e:
        _.log_exception(self._log,
                        "An unexpected error occurred, "
                        "unable to add current learning rate values to the logs object",
                        e)

    return False
def on_batch_training_completed(self, training_batch, logs):
    if not self.batch_level:
        return True

    success = True

    current = self._get_logs_base(logs)
    batch_step = current["batch_step"]

    has_dataset_level_metrics = False
    for set_name in self.set_names:
        dataset_metrics = get_value_at(f"{set_name}.dataset", current, warn_on_failure=False)
        has_dataset_level_metrics |= type(dataset_metrics) is dict and len(dataset_metrics) > 0
        if has_dataset_level_metrics:
            break

    if batch_step == 0 or batch_step % self.log_period == 0 or has_dataset_level_metrics:
        eta = self._calc_eta(logs)
        average_duration = self._get_average_batch_duration(logs)

        self._write('\nEpoch {:d}/{:d} - ETA: {:s}\tBatch {:d}/{:d} '
                    'Average batch training time {:s}\n'.format(current["epoch"],
                                                                logs["final_epoch"],
                                                                eta,
                                                                current["batch_step"],
                                                                logs["final_batch_step"],
                                                                average_duration))

        for metric_level in ['batch', 'window_average', 'dataset', 'epoch']:
            self._write_metric_logs(metric_level, logs)
            self._write('\n')

        self._write('\n')

    return success
def _calc_gradients_chunked(self, batch_data, training_settings=None):
    """
    See `train_on` method.

    This method slices the `batch_data` in chunks of size `self.batch_chunk_size`.
    For each chunk the loss is calculated and the gradients are accumulated through
    back propagation.

    :return: loss, auxiliary_results, accumulated_grads

        loss: weighted average of the chunk losses

        auxiliary_results: list of dicts:
            [
                ...
                {
                    "results": chunk auxiliary results,
                    "num_samples": number of samples in the chunk
                }
                ...
            ]

        accumulated_grads: weighted average of the chunk gradients
    """
    if not is_chunkable(batch_data):
        raise BatchNotChunkableException()

    auxiliary_results = BatchChunkingResults()

    loss = 0
    # Will be set when we have the trainable variables
    accumulated_grads = None

    batch_size = len(batch_data)
    num_chunks = math.ceil(batch_size / self.batch_chunk_size)
    for chunk_idx in range(num_chunks):
        chunk_start = chunk_idx * self.batch_chunk_size
        chunk_end = min((chunk_idx + 1) * self.batch_chunk_size, batch_size)
        chunk_len = chunk_end - chunk_start

        chunk = batch_data[chunk_start:chunk_end]

        with tf.GradientTape() as tape:
            results = self.evaluate_loss(chunk,
                                         inference_mode=False,
                                         evaluate_settings=training_settings)

        if 'loss' not in results:
            raise LossNotAvailableException()

        if self.trainable_variables is None:
            # We have now evaluated the model, so the trainable variables should be available
            self._retrieve_trainable_variables()

        if accumulated_grads is None:
            if self.trainable_variables is None:
                raise MLPugException("Unexpected state: trainable variables not found. "
                                     "Please file an issue.")

            accumulated_grads = {}
            for optimizer_name, tvs in self.trainable_variables.items():
                accumulated_grads[optimizer_name] = [tf.zeros_like(tv) for tv in tvs]

        chunk_loss = results['loss']
        aux_results = get_value_at('auxiliary_results', results, warn_on_failure=False)

        # The chunk loss is assumed to be the average over the sample losses of the chunk.
        # Weight it by chunk_len / batch_size to factor in that this chunk is part of a
        # larger batch, so the accumulated values form a batch-level weighted average.
        last_chunk = chunk_idx == (num_chunks - 1)
        chunk_loss = chunk_len * chunk_loss / batch_size
        chunk_gradients = self._back_propagate_from(chunk_loss, tape, last_chunk=last_chunk)

        loss += chunk_loss

        for optimizer_name, chunk_grads in chunk_gradients.items():
            accu_grads = accumulated_grads[optimizer_name]
            accumulated_grads[optimizer_name] = [
                accu_grad + chunk_grad
                for accu_grad, chunk_grad in zip(accu_grads, chunk_grads)
            ]

        auxiliary_results += [{
            "results": aux_results,
            "num_samples": chunk_len
        }]

    return loss, auxiliary_results, accumulated_grads
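# The chunked gradient calculation above assumes that `batch_data` supports `len()` and
# slicing. A minimal sketch of what the `is_chunkable` check could look like under that
# assumption follows; the actual helper in the library may differ, this only illustrates
# the assumed protocol.
def is_chunkable(batch_data):
    # A batch is considered chunkable when its length can be determined and
    # it can be sliced into smaller chunks
    return (batch_data is not None
            and hasattr(batch_data, '__len__')
            and hasattr(batch_data, '__getitem__'))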
def create_optimizer_weights():
    for optimizer_name, optimizer in self.optimizers.items():
        trainable_variables = get_value_at(optimizer_name,
                                           self.trainable_variables,
                                           warn_on_failure=False)

        optimizer._create_all_weights(trainable_variables)
def get_model_component(self, name):
    return get_value_at(name, self.get_model_components())
def get_optimizer(self, name):
    return get_value_at(name, self.get_optimizers())
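# A short usage sketch of the two getters above; the names "encoder" and "decoder" are
# hypothetical examples, the actual names depend on how the trainer was configured.
def _example_usage(trainer):
    encoder = trainer.get_model_component("encoder")
    decoder_optimizer = trainer.get_optimizer("decoder")

    return encoder, decoder_optimizer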
def _create_set_metrics_log_for(self, set_name, metric_level, logs):
    current = self._get_logs_base(logs)

    key_path = f"{set_name}.{metric_level}"
    metrics = get_value_at(key_path, current, warn_on_failure=False)

    return self._create_log_for(metrics)