def report_realtime_metric(self):
    """Print a snapshot of the current realtime throughput averages.

    Reads the running ``avg`` of the ``"tps"`` and ``"ups"`` meters in
    ``self.realtime_meters``. When both are truthy, a ``RealtimeMetrics``
    record is built (with ``samples`` set to ``self.n_batches + 1``) and
    printed with an immediate flush. If either average is falsy (e.g. ``0``
    or ``None``) nothing is printed.
    """
    meters = self.realtime_meters
    avg_tps = meters["tps"].avg
    avg_ups = meters["ups"].avg

    # Only report once both meters have produced a usable average.
    if avg_tps and avg_ups:
        snapshot = RealtimeMetrics(
            samples=self.n_batches + 1,
            tps=avg_tps,
            ups=avg_ups,
        )
        print(snapshot, flush=True)
def report_realtime_metric(self, stage):
    """Print realtime throughput for the training stage.

    Rates are computed as the ``n`` counter of the ``"tps"`` and ``"ups"``
    meters divided by the ``"tps"`` meter's ``elapsed_time``. When
    ``cuda.DISTRIBUTED_WORLD_SIZE > 1`` the sample count and both counters
    are first summed across ranks with ``torch.distributed.all_reduce``, so
    the printed figures are aggregates over all workers (the elapsed time
    used is this rank's own).

    Args:
        stage: Current stage; anything other than ``Stage.TRAIN`` is ignored.

    Returns:
        None. Nothing is printed when the stage is not training, when no
        time has elapsed yet, or when either computed rate is zero.
    """
    if stage != Stage.TRAIN:
        return

    tps_meter = self.realtime_meters["tps"]
    samples_total = self.n_batches + 1
    tps_total = tps_meter.n
    ups_total = self.realtime_meters["ups"].n
    elapsed_time = tps_meter.elapsed_time

    if cuda.DISTRIBUTED_WORLD_SIZE > 1:
        # Use 64-bit integers: the summed counters can exceed the int32
        # range on long multi-worker runs and would otherwise wrap around.
        tensor = torch.tensor(
            [samples_total, tps_total, ups_total],
            dtype=torch.int64,
            device="cuda",
        )
        torch.distributed.all_reduce(tensor)
        samples_total, tps_total, ups_total = tensor.tolist()

    # Guard against dividing by zero before any time has been recorded.
    # This check deliberately comes after the all_reduce so that every rank
    # still takes part in the collective call.
    if not elapsed_time:
        return

    tps = tps_total / elapsed_time
    ups = ups_total / elapsed_time
    if not tps or not ups:
        return

    metrics = RealtimeMetrics(samples=samples_total, tps=tps, ups=ups)
    print(metrics, flush=True)