Example #1
0
    def report_realtime_metric(self):
        """Print a snapshot of the realtime meters to stdout.

        Reads the running ``avg`` of the "tps" and "ups" entries in
        ``self.realtime_meters`` and prints a ``RealtimeMetrics`` record built
        from them, with ``samples`` set to ``self.n_batches + 1``. The output
        is flushed immediately. Nothing is printed when either average is
        falsy (for example ``0`` or ``None``).
        """
        meters = self.realtime_meters
        tps_avg, ups_avg = meters["tps"].avg, meters["ups"].avg

        # Only emit a record once both meters have a non-zero average.
        if tps_avg and ups_avg:
            snapshot = RealtimeMetrics(
                samples=self.n_batches + 1, tps=tps_avg, ups=ups_avg
            )
            print(snapshot, flush=True)
Example #2
0
    def report_realtime_metric(self, stage):
        """Print aggregate realtime throughput while training.

        Does nothing unless ``stage`` is ``Stage.TRAIN``. Otherwise it reads
        the running totals (``n``) of the "tps" and "ups" meters plus the "tps"
        meter's ``elapsed_time``. When more than one distributed worker is
        active, the totals are summed across workers. The totals are then
        divided by the local elapsed time and printed as a ``RealtimeMetrics``
        record, flushed immediately.

        Nothing is printed when the elapsed time is zero or when either
        resulting rate is zero.

        Args:
            stage: Current stage; only ``Stage.TRAIN`` produces output.
        """
        if stage != Stage.TRAIN:
            return

        samples_total = self.n_batches + 1
        tps_total = self.realtime_meters["tps"].n
        ups_total = self.realtime_meters["ups"].n
        elapsed_time = self.realtime_meters["tps"].elapsed_time

        if cuda.DISTRIBUTED_WORLD_SIZE > 1:
            # Use a 64-bit tensor: these are cumulative counts that get summed
            # over every worker, which can exceed the int32 range and wrap.
            tensor = torch.cuda.LongTensor([samples_total, tps_total, ups_total])
            # all_reduce sums element-wise across workers by default.
            torch.distributed.all_reduce(tensor)
            samples_total, tps_total, ups_total = tensor.tolist()

        # Checked only after the collective above so that no worker skips
        # all_reduce and leaves its peers waiting. A zero elapsed time would
        # otherwise raise ZeroDivisionError below.
        if not elapsed_time:
            return

        tps = tps_total / elapsed_time
        ups = ups_total / elapsed_time

        if not tps or not ups:
            return
        metrics = RealtimeMetrics(samples=samples_total, tps=tps, ups=ups)
        print(metrics, flush=True)