Code example #1
0
def test_tensorboard_log_metrics(tmpdir, step_idx):
    """Smoke-test that TensorBoardLogger accepts scalar metrics of mixed
    numeric types (python float/int and 0-d torch tensors)."""
    tb_logger = TensorBoardLogger(tmpdir)
    mixed_metrics = {
        "float": 0.3,
        "int": 1,
        "FloatTensor": torch.tensor(0.1),
        "IntTensor": torch.tensor(1),
    }
    tb_logger.log_metrics(mixed_metrics, step_idx)
Code example #2
0
class CustomLogger(LightningLoggerBase):
    """Lightning logger that writes metrics to the CLI (via the module-level
    loguru ``logger`` routed through ``tqdm.write``), optionally to a log file,
    and optionally to TensorBoard.

    Handlers are created lazily on first access of :attr:`experiment`.

    :param config: object providing ``model_name``, ``log_path``, ``test``,
        ``disable_logfile``, and ``tb_log`` attributes.
    """

    def __init__(self, config):
        super().__init__()
        # Module-level loguru logger instance shared by every CustomLogger.
        self._logger = logger
        self.config = config
        self._name = self.config.model_name
        self._save_dir = config.log_path
        self._version = None
        if config.test:
            # Keep test runs separated from real training logs.
            self._save_dir = "test_" + config.log_path
        self._experiment = None
        # Fix: initialise the TensorBoard state eagerly. Previously these
        # attributes were created only inside _create_logger(), so calling
        # log_metrics() before the first access of `experiment` raised
        # AttributeError on `self.tb_log`.
        self.tb_log = config.tb_log
        self.tb_logger = None

    # @rank_zero_only
    def _create_logger(self):
        """Configure the CLI handler, the optional file handler, and the
        optional TensorBoard logger. Called once, lazily, via `experiment`."""
        # CLI logger: route through tqdm.write so log lines do not break
        # progress bars.
        self._logger.remove()
        self._logger.configure(handlers=[
            dict(
                sink=lambda msg: tqdm.write(msg, end=''),
                level='DEBUG',
                colorize=True,
                format=
                "<green>{time: MM-DD at HH:mm}</green>  <level>{message}</level>",
                enqueue=True),
        ])

        # Add a file handler for training mode.
        # NOTE(review): disable_logfile is compared against the string
        # 'false' — presumably it comes from CLI/config as a string; confirm.
        if self.config.disable_logfile == 'false':
            os.makedirs(self.log_dir, exist_ok=True)
            logfile = os.path.join(self.log_dir, "log.txt")
            self._logger.info(f"Log to file {logfile}")
            self._logger.add(sink=logfile,
                             mode='w',
                             format="{time: MM-DD at HH:mm} | {message}",
                             level="DEBUG",
                             enqueue=True)

        # Enable the TensorBoard logger if requested.
        self.tb_log = self.config.tb_log
        if self.config.tb_log:
            self.tb_logger = TensorBoardLogger(os.path.join(
                self.log_dir, "tb_logs"),
                                               name=self.name)

    @property
    def log_dir(self):
        """Directory for this run: ``<save_dir>/<name>/version_<n>``."""
        version = self.version if isinstance(
            self.version, str) else f"version_{self.version}"
        log_dir = os.path.join(self.root_dir, version)
        return log_dir

    @property
    @rank_zero_experiment
    def experiment(self):
        """The underlying loguru logger; handlers are created on first use."""
        if self._experiment:
            return self._experiment
        self._create_logger()
        self._experiment = self._logger
        return self._experiment

    @staticmethod
    def _handle_value(value):
        """Convert a 0-d tensor to a python scalar; reduce multi-element
        tensors to their mean. Non-tensors pass through unchanged."""
        if isinstance(value, torch.Tensor):
            try:
                return value.item()
            except ValueError:
                # .item() raises ValueError for tensors with >1 element.
                return value.mean().item()
        return value

    @rank_zero_only
    def log_metrics(self, metrics, step=None):
        """Log a dict of scalar metrics to the CLI/file and (optionally)
        TensorBoard. The 'epoch' key, if present, is rendered as a prefix.

        :param metrics: mapping of metric name -> float/int/0-d tensor.
        :param step: global step used for the "step: ..." prefix.
        """
        if len(metrics) == 0:
            return
        # Fix: resolve `experiment` *before* the TensorBoard branch so that
        # _create_logger() has run (and self.tb_logger exists) even when
        # log_metrics is the first method called on this logger.
        experiment = self.experiment
        if self.tb_log:
            self.tb_logger.log_metrics(metrics, step)

        metrics_str = "  ".join([
            f"{k}: {self._handle_value(v):<4.4f}" for k, v in metrics.items()
            if k != 'epoch'
        ])

        # Nothing left to print once 'epoch' is filtered out.
        if metrics_str.strip() == '':
            return

        if step is not None:
            metrics_str = f"step: {step:<6d} :: " + metrics_str
        if 'epoch' in metrics:
            metrics_str = f"epoch: {int(metrics['epoch']):<4d}  " + metrics_str
        experiment.info(metrics_str)

    @rank_zero_only
    def info_metrics(self, metrics, epoch=None, step=None, level='INFO'):
        """Log metrics (or a plain message string) at the given level with an
        "epoch .. step .." prefix."""
        if isinstance(metrics, str):
            self.experiment.info(metrics)
            return

        _str = ""
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            _str += f"{k}= {v:<4.4f}  "
        self.experiment.log(level,
                            f"epoch {epoch: <4d}: step {step:<6d}:: {_str}")

    @rank_zero_only
    def log(self, msg, level='DEBUG'):
        """Log an arbitrary message at the given level."""
        self.experiment.log(level, msg)

    @property
    def name(self):
        """Experiment name (taken from ``config.model_name``)."""
        return self._name

    @property
    def version(self) -> int:
        """Current version number, computed once and cached."""
        if self._version is None:
            self._version = self._get_next_version()
        return self._version

    def _get_next_version(self):
        """Scan ``<save_dir>/<name>`` for ``version_<n>`` folders and return
        the next unused version number (0 when none exist)."""
        root_dir = os.path.join(self._save_dir, self.name)

        if not os.path.isdir(root_dir):
            # NOTE(review): this deliberately(?) uses the module-level
            # `logger`, not self._logger — confirm this is intended.
            logger.warning('Missing logger folder: %s', root_dir)
            return 0

        existing_versions = []
        for d in os.listdir(root_dir):
            if os.path.isdir(os.path.join(root_dir,
                                          d)) and d.startswith("version_"):
                existing_versions.append(int(d.split("_")[1]))

        if len(existing_versions) == 0:
            return 0

        return max(existing_versions) + 1

    @rank_zero_only
    def log_hyperparams(self, params):
        """Pretty-print the hyper-parameter dict (sorted by key, colorized
        with colorama ``Fore`` codes)."""
        _str = ""
        for k in sorted(params):
            v = params[k]
            _str += Fore.LIGHTCYAN_EX + str(k) + "="
            _str += Fore.WHITE + str(v) + ", "
        self.experiment.info("\nhyper-parameters:\n" + _str)
        return

    @property
    def root_dir(self) -> str:
        """``<save_dir>/<name>``, or just ``<save_dir>`` when name is empty."""
        if not self.name:
            return self.save_dir
        return os.path.join(self.save_dir, self.name)

    @property
    def save_dir(self):
        """Base directory where logs are written."""
        return self._save_dir
Code example #3
0
class TrainerLoggingMixin(ABC):
    """Mixin collecting the Trainer's logging responsibilities: configuring
    the logger, converting metric tensors to scalars, reducing DP/DDP2
    outputs, and splitting a training-step output dict into loss /
    progress-bar / log / callback metrics."""

    def __init__(self):
        # this is just a summary on variables used in this abstract class,
        #  the proper values/initialisation should be done in child class
        self.current_epoch = None
        self.on_gpu = None
        self.log_gpu_memory = None
        self.logger = None
        self.tqdm_metrics = None
        self.global_step = None
        self.proc_rank = None
        self.use_dp = None
        self.use_ddp2 = None
        self.num_gpus = None

    def configure_logger(self, logger):
        """Resolve the user-supplied `logger` flag/instance.

        True -> default TensorBoardLogger; False -> no logging;
        anything else is assumed to be a logger instance and used as-is.
        In every case rank is pinned to 0 so only one process logs.
        """
        if logger is True:
            # default logger
            self.logger = TensorBoardLogger(
                save_dir=self.default_save_path,
                version=self.slurm_job_id,
                name='lightning_logs'
            )
            self.logger.rank = 0
        elif logger is False:
            self.logger = None
        else:
            self.logger = logger
            self.logger.rank = 0

    def log_metrics(self, metrics, grad_norm_dic, step=None):
        """Logs the metric dict passed in.
        If `step` parameter is None and `step` key is presented is metrics,
        uses metrics["step"] as a step
        :param metrics (dict): Metric values
        :param grad_norm_dic (dict): Gradient norms
        :param step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
        """
        # add gpu memory
        if self.on_gpu and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        # NOTE: this mutates the caller's `metrics` dict in place.
        metrics.update(grad_norm_dic)

        # turn all tensors to scalars
        scalar_metrics = self.metrics_to_scalars(metrics)

        if "step" in scalar_metrics and step is None:
            # user supplied an explicit step inside the metrics dict
            step = scalar_metrics.pop("step")
        else:
            # added metrics by Lightning for convenience
            metrics['epoch'] = self.current_epoch
            step = step if step is not None else self.global_step
        # log actual metrics
        if self.proc_rank == 0 and self.logger is not None:
            self.logger.log_metrics(scalar_metrics, step=step)
            self.logger.save()

    def add_tqdm_metrics(self, metrics):
        """Merge `metrics` into the progress-bar dict, converting any
        tensors to python scalars first."""
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()

            self.tqdm_metrics[k] = v

    def metrics_to_scalars(self, metrics):
        """Return a copy of `metrics` with every tensor converted to a
        python scalar; nested dicts are converted recursively."""
        new_metrics = {}
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()

            if isinstance(v, dict):
                v = self.metrics_to_scalars(v)

            new_metrics[k] = v

        return new_metrics

    def process_output(self, output, train=False):
        """Reduces output according to the training mode.

        Separates loss from logging and tqdm metrics
        :param output: dict returned by training/validation step (may contain
            'loss', 'progress_bar', 'log', 'hiddens' keys), or a bare tensor.
        :param train: when True, extract 'loss' and reduce DP/DDP2 outputs.
        :return: tuple (loss, progress_bar_metrics, log_metrics,
            callback_metrics, hiddens)
        """
        # ---------------
        # EXTRACT CALLBACK KEYS
        # ---------------
        # all keys not progress_bar or log are candidates for callbacks
        callback_metrics = {}
        for k, v in output.items():
            if k not in ['progress_bar', 'log', 'hiddens']:
                callback_metrics[k] = v

        if train and (self.use_dp or self.use_ddp2):
            num_gpus = self.num_gpus
            callback_metrics = self.reduce_distributed_output(callback_metrics, num_gpus)

        for k, v in callback_metrics.items():
            if isinstance(v, torch.Tensor):
                callback_metrics[k] = v.item()

        # ---------------
        # EXTRACT PROGRESS BAR KEYS
        # ---------------
        # EAFP: missing 'progress_bar' key (or non-dict output) falls back to {}
        try:
            progress_output = output['progress_bar']

            # reduce progress metrics for tqdm when using dp
            if train and (self.use_dp or self.use_ddp2):
                num_gpus = self.num_gpus
                progress_output = self.reduce_distributed_output(progress_output, num_gpus)

            progress_bar_metrics = progress_output
        except Exception:
            progress_bar_metrics = {}

        # ---------------
        # EXTRACT LOGGING KEYS
        # ---------------
        # extract metrics to log to experiment
        try:
            log_output = output['log']

            # reduce progress metrics for tqdm when using dp
            if train and (self.use_dp or self.use_ddp2):
                num_gpus = self.num_gpus
                log_output = self.reduce_distributed_output(log_output, num_gpus)

            log_metrics = log_output
        except Exception:
            log_metrics = {}

        # ---------------
        # EXTRACT LOSS
        # ---------------
        # if output dict doesn't have the keyword loss
        # then assume the output=loss if scalar
        loss = None
        if train:
            try:
                loss = output['loss']
            except Exception:
                if isinstance(output, torch.Tensor):
                    loss = output
                else:
                    raise RuntimeError(
                        'No `loss` value in the dictionary returned from `model.training_step()`.'
                    )

            # when using dp need to reduce the loss
            if self.use_dp or self.use_ddp2:
                loss = self.reduce_distributed_output(loss, self.num_gpus)

        # ---------------
        # EXTRACT HIDDEN
        # ---------------
        hiddens = output.get('hiddens')

        # use every metric passed in as a candidate for callback
        callback_metrics.update(progress_bar_metrics)
        callback_metrics.update(log_metrics)

        # convert tensors to numpy
        for k, v in callback_metrics.items():
            if isinstance(v, torch.Tensor):
                callback_metrics[k] = v.item()

        return loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens

    def reduce_distributed_output(self, output, num_gpus):
        """Average a per-GPU DP/DDP2 output (tensor or possibly-nested dict
        of tensors) down to a single value; no-op when num_gpus <= 1.
        Dicts are modified in place and returned."""
        if num_gpus <= 1:
            return output

        # when using DP, we get one output per gpu
        # average outputs and return
        if isinstance(output, torch.Tensor):
            return output.mean()

        for k, v in output.items():
            # recurse on nested dics
            if isinstance(output[k], dict):
                output[k] = self.reduce_distributed_output(output[k], num_gpus)

            # do nothing when there's a scalar
            elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0:
                pass

            # reduce only metrics that have the same number of gpus
            elif output[k].size(0) == num_gpus:
                reduced = torch.mean(output[k])
                output[k] = reduced
        return output
Code example #4
0
File: baseline.py  Project: abhinavg97/ABSA_GNN
    val_class_recall_scores = literal_eval(val_class_recall_scores_list[epoch])

    val_class_f1_scores_dict = {
        label_id_to_label_text[i]: val_class_f1_scores[i]
        for i in range(len(label_id_to_label_text))
    }
    val_class_recall_scores_dict = {
        label_id_to_label_text[i]: val_class_recall_scores[i]
        for i in range(len(label_id_to_label_text))
    }
    val_class_precision_scores_dict = {
        label_id_to_label_text[i]: val_class_precision_scores[i]
        for i in range(len(label_id_to_label_text))
    }

    logger.log_metrics(metrics={'avg_train_loss': avg_train_loss[epoch]},
                       step=epoch)
    logger.log_metrics(metrics={'avg_val_loss': avg_val_loss[epoch]},
                       step=epoch)
    logger.log_metrics(metrics={'avg_val_f1_score': avg_val_f1_score[epoch]},
                       step=epoch)
    logger.log_metrics(
        metrics={'avg_val_precision_score': avg_val_precision_score[epoch]},
        step=epoch)
    logger.log_metrics(
        metrics={'avg_val_recall_score': avg_val_recall_score[epoch]},
        step=epoch)
    logger.log_metrics(
        metrics={'avg_val_accuracy_score': avg_val_accuracy_score[epoch]},
        step=epoch)
    logger.experiment.add_scalars('val_class_f1_scores',
                                  val_class_f1_scores_dict,