def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
    """Log basic model and optimizer parameters at the start of training.

    Args:
        logs (None, optional): Unused; supplied by Keras.
    """
    try_mlflow_log(log_param, 'num_layers', len(self.model.layers))
    try_mlflow_log(log_param, 'optimizer_name',
                   type(self.model.optimizer).__name__)

    if hasattr(self.model.optimizer, 'lr'):
        # The learning rate may be a plain float or a backend tensor.
        lr = self.model.optimizer.lr \
            if isinstance(self.model.optimizer.lr, float) \
            else keras.backend.eval(self.model.optimizer.lr)
        try_mlflow_log(log_param, 'learning_rate', lr)

    if hasattr(self.model.optimizer, 'epsilon'):
        epsilon = self.model.optimizer.epsilon \
            if isinstance(self.model.optimizer.epsilon, float) \
            else keras.backend.eval(self.model.optimizer.epsilon)
        try_mlflow_log(log_param, 'epsilon', epsilon)

    # Capture the textual model summary and upload it as an artifact.
    sum_list = []
    self.model.summary(print_fn=sum_list.append)
    summary = '\n'.join(sum_list)
    tempdir = tempfile.mkdtemp()
    try:
        summary_file = os.path.join(tempdir, 'model_summary.txt')
        with open(summary_file, 'w') as f:
            f.write(summary)
        try_mlflow_log(log_artifact, key='model_summary.txt',
                       path=summary_file)
    finally:
        shutil.rmtree(tempdir)
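# try_mlflow_log is defined elsewhere in this package. A minimal sketch of
# the assumed contract is given below: invoke a tracking function defensively
# so that logging failures never interrupt training. The signature and
# warning text here are assumptions, not the package's verbatim code.
import logging

def try_mlflow_log(fn, *args, **kwargs):
    """Call a tracking function, warning instead of raising on failure."""
    try:
        return fn(*args, **kwargs)
    except Exception as exc:  # tracking must never crash the training loop
        logging.warning('Failed to log to the tracking server: %s', exc)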
def on_epoch_end(self, epoch, logs=None):
    """Log the metrics Keras reports at the end of each epoch.

    Args:
        epoch (int): Index of the epoch that just finished.
        logs (dict, optional): Metric results for this epoch.
    """
    self.current_epoch = epoch
    if not logs:
        return
    logs_copy = copy.deepcopy(logs)
    # System metrics (GPU/CPU) are intentionally not logged here for now,
    # as they are not frequently used at epoch granularity.
    try_mlflow_log(log_metrics,
                   logs_copy,
                   step=self.num_step,
                   epoch=self.current_epoch)
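# A hypothetical usage sketch for the Keras logging callback; the enclosing
# class name (KerasCallback here) is an assumption:
#
#   model.compile(optimizer='adam', loss='mse')
#   model.fit(x_train, y_train, epochs=10, callbacks=[KerasCallback()])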
def _save_model(self, epoch, logs):
    super(ModelCheckpointAndUpload, self)._save_model(epoch, logs)
    filepath = self._get_file_path(epoch, logs)
    if os.path.exists(filepath):
        output_filename = os.path.join(tempfile.gettempdir(),
                                       'checkpoint_segmind_track')
        if os.path.isdir(filepath):
            # The checkpoint is a directory: zip it, then upload the archive.
            # shutil.make_archive appends the '.zip' extension itself, so
            # remove any stale archive from a previous epoch first.
            archive_path = output_filename + '.zip'
            if os.path.isfile(archive_path):
                os.remove(archive_path)
            shutil.make_archive(output_filename, 'zip', filepath)
            print(f'Uploading checkpoint {archive_path} ...')
            try_mlflow_log(log_artifact,
                           key=os.path.basename(filepath) + '.zip',
                           path=archive_path)
        else:
            # The checkpoint is a single file: upload it as-is.
            print(f'Uploading checkpoint {filepath} ...')
            try_mlflow_log(log_artifact,
                           key=os.path.basename(filepath),
                           path=filepath)
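# A hypothetical usage sketch; ModelCheckpointAndUpload extends
# keras.callbacks.ModelCheckpoint, so the constructor arguments below follow
# that API:
#
#   checkpoint_cb = ModelCheckpointAndUpload(
#       filepath='checkpoints/model_{epoch:02d}.h5',
#       save_best_only=True)
#   model.fit(x_train, y_train, callbacks=[checkpoint_cb])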
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx,
                       dataloader_idx):
    """Log training metrics every `log_evry_n_step` steps, together with
    GPU and CPU system metrics.

    Args:
        trainer (pytorch_lightning.Trainer): The running trainer.
        pl_module (pytorch_lightning.LightningModule): The module being trained.
        outputs: Outputs of the training step.
        batch: The current batch.
        batch_idx (int): Index of the current batch.
        dataloader_idx (int): Index of the current dataloader.
    """
    self.num_step += 1
    logs = trainer.logger_connector.callback_metrics
    if self.step_logging and self.num_step % self.log_evry_n_step == 0:
        logs_copy = copy.deepcopy(logs)
        # Attach GPU and CPU utilization snapshots to the metric payload.
        logs_copy.update(gpu_metrics())
        logs_copy.update(system_metrics())
        try_mlflow_log(log_metrics, logs_copy, step=self.num_step)
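# gpu_metrics() and system_metrics() are defined elsewhere in this package.
# A minimal sketch of the assumed contract for system_metrics(), built on
# psutil (the real implementation and metric names may differ):
import psutil

def system_metrics():
    """Return a flat dict of CPU and RAM utilization percentages."""
    return {
        'cpu_utilization': psutil.cpu_percent(),
        'ram_utilization': psutil.virtual_memory().percent,
    }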
def _save_model(self, filepath: str, trainer, pl_module):
    super(PytorchModelCheckpointAndUpload,
          self)._save_model(filepath, trainer, pl_module)
    trainer.dev_debugger.track_checkpointing_history(filepath)
    if trainer.is_global_zero:
        self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)
    if self.save_function is not None:
        self.save_function(filepath, self.save_weights_only)
    if os.path.exists(filepath):
        output_filename = os.path.join(tempfile.gettempdir(),
                                       'checkpoint_segmind_track')
        if os.path.isdir(filepath):
            # The checkpoint is a directory: zip it, then upload the archive.
            # shutil.make_archive appends the '.zip' extension itself, so
            # remove any stale archive from a previous save first.
            archive_path = output_filename + '.zip'
            if os.path.isfile(archive_path):
                os.remove(archive_path)
            shutil.make_archive(output_filename, 'zip', filepath)
            print(f'Uploading checkpoint {archive_path} ...')
            try_mlflow_log(log_artifact,
                           key=os.path.basename(filepath) + '.zip',
                           path=archive_path)
        else:
            # The checkpoint is a single file: upload it as-is.
            print(f'Uploading checkpoint {filepath} ...')
            try_mlflow_log(log_artifact,
                           key=os.path.basename(filepath),
                           path=filepath)
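# A hypothetical usage sketch; PytorchModelCheckpointAndUpload extends
# pytorch_lightning.callbacks.ModelCheckpoint, so the constructor arguments
# below follow that API:
#
#   checkpoint_cb = PytorchModelCheckpointAndUpload(
#       dirpath='checkpoints/', save_top_k=1, monitor='val_loss')
#   trainer = pytorch_lightning.Trainer(callbacks=[checkpoint_cb])
#   trainer.fit(model)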
def on_epoch_end(self, trainer, pl_module):
    """Log the trainer's callback metrics at the end of each epoch.

    Args:
        trainer (pytorch_lightning.Trainer): The running trainer.
        pl_module (pytorch_lightning.LightningModule): The module being trained.
    """
    logs = trainer.logger_connector.callback_metrics
    self.current_epoch += 1
    # System metrics (GPU/CPU) are intentionally not logged here for now,
    # as they are not frequently used at epoch granularity.
    try_mlflow_log(log_metrics,
                   logs,
                   step=self.num_step,
                   epoch=self.current_epoch)
def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx,
                      dataloader_idx):
    """Log test metrics every `log_evry_n_step` test steps.

    Args:
        trainer (pytorch_lightning.Trainer): The running trainer.
        pl_module (pytorch_lightning.LightningModule): The module under test.
        outputs: Outputs of the test step.
        batch: The current batch.
        batch_idx (int): Index of the current batch.
        dataloader_idx (int): Index of the current dataloader.
    """
    self.num_test_step += 1
    logs = trainer.logger_connector.callback_metrics
    if self.step_logging and self.num_test_step % self.log_evry_n_step == 0:
        # System metrics (GPU/CPU) are intentionally not logged here for now.
        try_mlflow_log(log_metrics,
                       logs,
                       step=self.num_step,
                       epoch=self.current_epoch)
def on_test_end(self, logs=None):
    """Log the final test metrics once evaluation finishes."""
    self.num_test_epoch += 1
    if logs:
        try_mlflow_log(log_metrics,
                       logs,
                       step=self.num_test_step,
                       epoch=self.num_test_epoch)
def on_test_end(self, trainer, pl_module):
    """Log the trainer's callback metrics once testing finishes.

    Args:
        trainer (pytorch_lightning.Trainer): The running trainer.
        pl_module (pytorch_lightning.LightningModule): The module under test.
    """
    logs = trainer.logger_connector.callback_metrics
    try_mlflow_log(log_metrics, logs, step=self.num_step)
def callback(env):
    """Internal function: log evaluation results every `period` iterations."""
    if env.rank != 0 or (not env.evaluation_result_list) \
            or period is False or period == 0:
        return
    step = env.iteration
    if step % period == 0 or step + 1 == env.begin_iteration \
            or step + 1 == env.end_iteration:
        results = {}
        # Attach GPU and CPU utilization snapshots to the metric payload.
        results.update(gpu_metrics())
        results.update(system_metrics())
        for x in env.evaluation_result_list:
            results[x[0]] = x[1]
        try_mlflow_log(log_metrics, results, step=step)
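# This env-based callback follows the classic XGBoost/LightGBM callback
# protocol: env is a CallbackEnv namedtuple carrying rank, iteration,
# begin_iteration, end_iteration and evaluation_result_list. A hypothetical
# usage sketch, assuming the enclosing factory is named mlflow_callback:
#
#   import xgboost as xgb
#   bst = xgb.train(params, dtrain, num_boost_round=100,
#                   evals=[(dval, 'val')],
#                   callbacks=[mlflow_callback(period=5)])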
def on_test_batch_end(self, batch, logs=None):
    """Log test metrics every `log_evry_n_step` test steps, together with
    GPU and CPU system metrics.

    Args:
        batch (int): Index of the current batch.
        logs (dict, optional): Metric results for this batch.
    """
    self.num_test_step += 1
    if not logs:
        return
    if self.step_logging and self.num_test_step % self.log_evry_n_step == 0:
        logs_copy = copy.deepcopy(logs)
        # Attach GPU and CPU utilization snapshots to the metric payload.
        logs_copy.update(gpu_metrics())
        logs_copy.update(system_metrics())
        try_mlflow_log(log_metrics, logs_copy, step=self.num_step)
def on_epoch_end(self, epoch, logs=None):
    """Log epoch-end metrics together with GPU and CPU system metrics.

    Args:
        epoch (int): Index of the epoch that just finished.
        logs (dict, optional): Metric results for this epoch.
    """
    self.current_epoch = epoch
    if not logs:
        return
    logs_copy = copy.deepcopy(logs)
    # Attach GPU and CPU utilization snapshots to the metric payload.
    logs_copy.update(gpu_metrics())
    logs_copy.update(system_metrics())
    try_mlflow_log(log_metrics, logs_copy, step=self.num_step)
def on_epoch_end(self, trainer, pl_module):
    """Log the trainer's epoch-end callback metrics together with GPU and
    CPU system metrics.

    Args:
        trainer (pytorch_lightning.Trainer): The running trainer.
        pl_module (pytorch_lightning.LightningModule): The module being trained.
    """
    logs = trainer.logger_connector.callback_metrics
    logs_copy = copy.deepcopy(logs)
    # Attach GPU and CPU utilization snapshots to the metric payload.
    logs_copy.update(gpu_metrics())
    logs_copy.update(system_metrics())
    try_mlflow_log(log_metrics, logs_copy, step=self.num_step)
def on_train_start(self, trainer, pl_module):  # pylint: disable=unused-argument
    """Log optimizer parameters and the model summary at the start of training.

    Args:
        trainer (pytorch_lightning.Trainer): The running trainer.
        pl_module (pytorch_lightning.LightningModule): The module being trained.
    """
    optimizer = pl_module.configure_optimizers()
    try_mlflow_log(log_param, 'optimizer_name',
                   optimizer.__class__.__name__)

    lr = optimizer.param_groups[0]['lr']
    try_mlflow_log(log_param, 'learning_rate', lr)

    # Not every optimizer exposes an epsilon (e.g. SGD does not), so guard
    # the lookup instead of assuming an Adam-style parameter group.
    epsilon = optimizer.param_groups[0].get('eps')
    if epsilon is not None:
        try_mlflow_log(log_param, 'epsilon', epsilon)

    # Capture the textual model summary and upload it as an artifact.
    summary = str(pl_module.summarize())
    tempdir = tempfile.mkdtemp()
    try:
        summary_file = os.path.join(tempdir, 'model_summary.txt')
        with open(summary_file, 'w') as f:
            f.write(summary)
        try_mlflow_log(log_artifact,
                       key='model_summary.txt',
                       path=summary_file)
    finally:
        shutil.rmtree(tempdir)
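# configure_optimizers() may also return a list or tuple of optimizers (and
# schedulers); the method above assumes a single optimizer instance. A hedged
# sketch for normalizing the return value, should that assumption not hold:
def _first_optimizer(configured):
    """Return the first optimizer from a configure_optimizers() result."""
    if isinstance(configured, (list, tuple)):
        first = configured[0]
        # PyTorch Lightning allows ([optimizers], [schedulers]) pairs.
        return first[0] if isinstance(first, (list, tuple)) else first
    return configured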
def on_test_end(self, logs=None):
    """Log the final test metrics once evaluation finishes."""
    if logs:
        try_mlflow_log(log_metrics, logs, step=self.num_step)