Example #1
def benchmark_data(cfg: AttrDict, split: str = "train"):
    split = split.upper()
    total_images = MAX_ITERS * cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    timer = Timer()
    dataset = build_dataset(cfg, split)

    try:
        device = torch.device("cuda" if cfg.MACHINE.DEVICE == "gpu" else "cpu")
    except AttributeError:
        device = torch.device("cuda")

    # Give the sampler the same seed for the entire distributed group, as per the PyTorch documentation.
    sampler_seed = cfg.SEED_VALUE
    dataloader = get_loader(
        dataset=dataset,
        dataset_config=cfg["DATA"][split],
        num_dataloader_workers=cfg.DATA.NUM_DATALOADER_WORKERS,
        pin_memory=False,
        multi_processing_method=cfg.MULTI_PROCESSING_METHOD,
        device=device,
        sampler_seed=sampler_seed,
    )

    # The Fairstore data sampler requires setting the start iteration before it can start.
    if hasattr(dataloader.sampler, "set_start_iter"):
        dataloader.sampler.set_start_iter(0)

    # Initial warmup, measured separately so it does not pollute the
    # steady-state throughput numbers below.
    timer.reset()
    data_iterator = iter(dataloader)
    for _ in range(WARMUP_ITERS):  # warmup
        next(data_iterator)
    # the total number of seconds since the start/reset of the timer
    warmup_time = timer.seconds()
    logging.info(f"Warmup time {WARMUP_ITERS} batches: {warmup_time} seconds")

    # measure the number of images per second over MAX_ITERS iterations.
    timer = Timer()
    for _ in tqdm.trange(MAX_ITERS):
        next(data_iterator)
    time_elapsed = timer.seconds()
    logging.info(
        f"iters: {MAX_ITERS}; images: {total_images}; time: {time_elapsed} seconds; "
        f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
        f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")

    # run benchmark for a few more rounds to catch fluctuations
    for round_idx in range(BENCHMARK_ROUNDS):
        timer = Timer()
        for _ in tqdm.trange(MAX_ITERS):
            next(data_iterator)
        time_elapsed = timer.seconds()
        logging.info(
            f"round: {round_idx}: iters: {MAX_ITERS}; images: {total_images}; "
            f"time: {time_elapsed} seconds; "
            f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
            f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")
    del data_iterator
    del dataloader
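
A minimal sketch of the module-level constants that benchmark_data() above assumes; the names come from the code, but the values here are illustrative assumptions rather than the source's settings:

# Hypothetical values for the constants used by Example #1.
MAX_ITERS = 1000        # iterations per timed measurement round
WARMUP_ITERS = 10       # batches consumed before timing starts
BENCHMARK_ROUNDS = 5    # extra timed rounds to catch fluctuations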
Example #2
def benchmark_data(args):
    cfg = setup(args)

    timer = Timer()
    dataloader = build_detection_train_loader(cfg)
    logger.info("Initialize loader using {} seconds.".format(timer.seconds()))

    timer.reset()
    itr = iter(dataloader)
    for i in range(10):  # warmup
        next(itr)
        if i == 0:
            startup_time = timer.seconds()
    timer = Timer()
    max_iter = 1000
    for _ in tqdm.trange(max_iter):
        next(itr)
    logger.info("{} iters ({} images) in {} seconds.".format(
        max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()))
    logger.info("Startup time: {} seconds".format(startup_time))
    vram = psutil.virtual_memory()
    logger.info("RAM Usage: {:.2f}/{:.2f} GB".format(
        (vram.total - vram.available) / 1024**3, vram.total / 1024**3))

    # test for a few more rounds
    for _ in range(10):
        timer = Timer()
        max_iter = 1000
        for _ in tqdm.trange(max_iter):
            next(itr)
        logger.info("{} iters ({} images) in {} seconds.".format(
            max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()))
Example #3
def benchmark_data(args):
    cfg = setup(args)

    logger.info("After spawning " + RAM_msg())
    timer = Timer()
    dataloader = build_detection_train_loader(cfg)
    logger.info("Initialize loader using {} seconds.".format(timer.seconds()))

    timer.reset()
    itr = iter(dataloader)
    for i in range(10):  # warmup
        next(itr)
        if i == 0:
            startup_time = timer.seconds()
    logger.info("Startup time: {} seconds".format(startup_time))
    timer = Timer()
    max_iter = 1000
    for _ in tqdm.trange(max_iter):
        next(itr)
    logger.info("{} iters ({} images) in {} seconds.".format(
        max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()))

    # test for a few more rounds
    for k in range(10):
        logger.info(f"Iteration {k} " + RAM_msg())
        timer = Timer()
        max_iter = 1000
        for _ in tqdm.trange(max_iter):
            next(itr)
        logger.info("{} iters ({} images) in {} seconds.".format(
            max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()))
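
Example #3 calls a RAM_msg() helper that is not shown. A plausible sketch, assuming it formats psutil.virtual_memory() the same way Example #2 logs RAM usage (the helper's name is from the source; its body and format are assumptions):

import psutil

def RAM_msg():
    # Format current system RAM usage as "used/total GB".
    ram = psutil.virtual_memory()
    return "RAM Usage: {:.2f}/{:.2f} GB".format(
        (ram.total - ram.available) / 1024 ** 3, ram.total / 1024 ** 3)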
Example #4
class TestMeter(object):
    def __init__(self, cfg):
        self.cfg = cfg
        self.forward_timer = Timer()
        self.total_time = 0
        self.cnt = 0
        self.score = dict()
        self.output_dir = Join(cfg.TEST.OUTPUT_DIR, cfg.TEST.DATASET)
        self.save_img = cfg.TEST.SAVE_IMG
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.score_csv = open(Join(self.output_dir, "score.csv"), 'w')
        self.score_csv.write("vid,image_id,psnr,ssim\n")

    def forward_tic(self):
        """
        Start to record time.
        """
        self.forward_timer.reset()

    def forward_toc(self):
        """
        Stop recording time.
        """
        self.forward_timer.pause()
        self.total_time += self.forward_timer.seconds()
        self.cnt += 1

    def log_img_result(self, img_out, vid, img_id, psnr, ssim):
        if vid not in self.score.keys():
            self.score[vid] = {}

        # log score
        self.score[vid][img_id] = (psnr, ssim)
        self.score_csv.write("{},{},{},{}\n".format(vid, img_id, psnr, ssim))

        # save img
        if self.save_img:
            # if not os.path.exists(Join(self.output_dir, vid)):
            #     os.makedirs(Join(self.output_dir, vid))
            img_out = cv2.cvtColor(img_out, cv2.COLOR_RGB2BGR)
            cv2.imwrite(Join(self.output_dir, img_id), img_out)

    def log_average_score(self):
        score_per_vid = {}
        for vid in self.score.keys():
            psnrs = [x[0] for x in self.score[vid].values()]
            ssims = [x[1] for x in self.score[vid].values()]
            score_per_vid[vid] = (np.mean(psnrs), np.mean(ssims))

        with open(Join(self.output_dir, 'videos_scores.csv'), 'w') as f:
            f.write('video_id, psnr, ssim\n')
            for vid in self.score.keys():
                f.write("{},{},{}\n".format(vid, score_per_vid[vid][0],
                                            score_per_vid[vid][1]))
        return score_per_vid

    def speed(self):
        return self.total_time, self.total_time / self.cnt
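
A sketch of how this meter might drive an evaluation loop. The model, the loader fields, and the psnr/ssim functions are placeholders, not part of the source:

meter = TestMeter(cfg)
for vid, img_id, lr_img, hr_img in test_loader:  # hypothetical loader fields
    meter.forward_tic()
    img_out = model(lr_img)                      # placeholder model call
    meter.forward_toc()
    # The caller computes PSNR/SSIM; the meter only records and saves them.
    meter.log_img_result(img_out, vid, img_id,
                         psnr(img_out, hr_img), ssim(img_out, hr_img))
meter.log_average_score()
total_time, time_per_image = meter.speed()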
Example #5
    def test_timer(self):
        timer = Timer()
        time.sleep(0.5)
        self.assertTrue(0.99 > timer.seconds() >= 0.5)

        timer.pause()
        time.sleep(0.5)

        self.assertTrue(0.99 > timer.seconds() >= 0.5)

        timer.resume()
        time.sleep(0.5)
        self.assertTrue(1.49 > timer.seconds() >= 1.0)

        timer.reset()
        self.assertTrue(0.49 > timer.seconds() >= 0)
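
The semantics this test exercises (and that every example here relies on) match fvcore's Timer: the clock starts running on construction, pause() freezes the accumulated time, resume() continues it, and reset() zeroes it and leaves it running. A minimal sketch of such a timer, not the library implementation:

import time

class Timer:
    def __init__(self):
        self.reset()

    def reset(self):
        # Zero the accumulated time and mark the timer as running.
        self._total = 0.0
        self._start = time.perf_counter()
        self._paused = False

    def pause(self):
        # Freeze the accumulated time; no-op if already paused.
        if not self._paused:
            self._total += time.perf_counter() - self._start
            self._paused = True

    def resume(self):
        # Continue accumulating time; no-op if already running.
        if self._paused:
            self._start = time.perf_counter()
            self._paused = False

    def seconds(self):
        # Total time spent running since the last reset.
        if self._paused:
            return self._total
        return self._total + (time.perf_counter() - self._start)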
Example #6
    def test_timer(self) -> None:
        """
        Test basic timer functions (pause, resume, and reset).
        """
        timer = Timer()
        time.sleep(0.5)
        self.assertTrue(0.99 > timer.seconds() >= 0.5)

        timer.pause()
        time.sleep(0.5)

        self.assertTrue(0.99 > timer.seconds() >= 0.5)

        timer.resume()
        time.sleep(0.5)
        self.assertTrue(1.49 > timer.seconds() >= 1.0)

        timer.reset()
        self.assertTrue(0.49 > timer.seconds() >= 0)
Example #7
        def test_model(epoch):
            """ Evaluate the model on the test set """
            model.eval()
            test_metrics = {"loss": [], "acc": []}
            timer = Timer()
            for batch_i, (X, y) in enumerate(test_dataloader):
                batch_i += 1
                image_sequences = Variable(X.to(device), requires_grad=False)
                labels = Variable(y, requires_grad=False).to(device)

                with torch.no_grad():
                    # Reset LSTM hidden state
                    model.lstm.reset_hidden_state()
                    # Get sequence predictions
                    predictions = model(image_sequences)

                # Compute metrics
                loss = criterion(predictions, labels)
                acc = (predictions.detach().argmax(1) == labels
                       ).cpu().numpy().mean()

                # Keep track of loss and accuracy
                test_metrics["loss"].append(loss.item())
                test_metrics["acc"].append(acc)

                # Determine approximate time left
                batches_done = batch_i - 1
                batches_left = len(test_dataloader) - batches_done
                time_left = datetime.timedelta(seconds=batches_left *
                                               timer.seconds())
                time_iter = round(timer.seconds(), 3)
                timer.reset()

                # Log test performance
                logger.info(
                    f'Testing - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(test_dataloader)}] [Loss: {np.mean(test_metrics["loss"]):.3f}] [Acc: {np.mean(test_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
                )

            writer.add_scalar("test/loss", np.mean(test_metrics["loss"]),
                              epoch)
            writer.add_scalar("test/acc", np.mean(test_metrics["acc"]), epoch)

            model.train()
Example #8
class EpochTimer:
    """
    A timer which computes the epoch time.
    """

    def __init__(self) -> None:
        self.timer = Timer()
        self.timer.reset()
        self.epoch_times = []

    def reset(self) -> None:
        """
        Reset the epoch timer.
        """
        self.timer.reset()
        self.epoch_times = []

    def epoch_tic(self):
        """
        Start to record time.
        """
        self.timer.reset()

    def epoch_toc(self):
        """
        Stop recording time.
        """
        self.timer.pause()
        self.epoch_times.append(self.timer.seconds())

    def last_epoch_time(self):
        """
        Get the time for the last epoch.
        """
        assert len(self.epoch_times) > 0, "No epoch time has been recorded!"

        return self.epoch_times[-1]

    def avg_epoch_time(self):
        """
        Calculate the average epoch time among the recorded epochs.
        """
        assert len(self.epoch_times) > 0, "No epoch time has been recorded!"

        return np.mean(self.epoch_times)

    def median_epoch_time(self):
        """
        Calculate the median epoch time among the recorded epochs.
        """
        assert len(self.epoch_times) > 0, "No epoch time has been recorded!"

        return np.median(self.epoch_times)
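
A usage sketch for EpochTimer, with time.sleep standing in for the work of one epoch:

import time

epoch_timer = EpochTimer()
for epoch in range(3):
    epoch_timer.epoch_tic()
    time.sleep(0.1)  # stand-in for one epoch of training
    epoch_timer.epoch_toc()
print("avg: {:.2f}s, median: {:.2f}s".format(
    epoch_timer.avg_epoch_time(), epoch_timer.median_epoch_time()))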
Example #9
class IterationTimer(HookBase):
    def __init__(self, warmup_iter=3):
        self._warmup_iter = warmup_iter
        self._step_timer = Timer()
        self._start_time = time.perf_counter()
        self._total_timer = Timer()

    def before_train(self):
        self._start_time = time.perf_counter()
        self._total_timer.reset()
        self._total_timer.pause()

    def after_train(self):
        logger = logging.getLogger(__name__)
        total_time = time.perf_counter() - self._start_time
        total_time_minus_hooks = self._total_timer.seconds()
        hook_time = total_time - total_time_minus_hooks

        num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter

        if num_iter > 0 and total_time_minus_hooks > 0:
            logger.info(
                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
                    num_iter,
                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
                    total_time_minus_hooks / num_iter,
                )
            )

        logger.info(
            "Total training time: {} ({} on hooks)".format(
                str(datetime.timedelta(seconds=int(total_time))),
                str(datetime.timedelta(seconds=int(hook_time))),
            )
        )

    def before_step(self):
        self._step_timer.reset()
        self._total_timer.resume()

    def after_step(self):
        iter_done = self.trainer.iter - self.trainer.start_iter + 1
        if iter_done >= self._warmup_iter:
            sec = self._step_timer.seconds()
            self.trainer.storage.put_scalars(time=sec)
        else:
            self._start_time = time.perf_counter()
            self._total_timer.reset()

        self._total_timer.pause()
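
This hook comes from detectron2, where DefaultTrainer already installs it via build_hooks(); with a bare trainer it would be registered explicitly. A sketch, assuming model, data_loader, and optimizer are built elsewhere:

from detectron2.engine import SimpleTrainer

trainer = SimpleTrainer(model, data_loader, optimizer)
trainer.register_hooks([IterationTimer(warmup_iter=5)])
trainer.train(start_iter=0, max_iter=1000)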
Example #10
    #     learning_rate = 0.00075
    # if epoch == 3:
    #     learning_rate = 0.001
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    # optimizer = torch.optim.SGD(net.parameters(),lr=learning_rate*0.1,momentum=0.9,weight_decay=1e-4)
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    total_loss = 0.
    tt.reset()
    for i, (images, target) in enumerate(train_loader):
        # a,b=next(imgiter);images = Variable(a) ;target = Variable(b)
        # images = Variable(images) #torch.Size([4, 3, 448, 448])
        # target = Variable(target) #torch.Size([4, 14, 14, 30])

        if CHANNEL_LAST:
            images = images.to(memory_format=torch.channels_last)
            target = target.to(memory_format=torch.channels_last)
        if use_gpu:
            images, target = images.cuda(), target.cuda()

        pred = net(images)  # torch.Size([4, 14, 14, 30])
        loss = criterion(pred, target)  # torch.Size([])
        tloss = loss.item()
        # print(tloss)
Example #11
class TrainMeter(object):
    """
    Measure training stats.
    """
    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_total = 0.0
        self.lr = None
        # Current minibatch errors (smoothed over a window).
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Number of misclassified examples.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.output_dir = cfg.OUTPUT_DIR

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.loss_total = 0.0
        self.lr = None
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop recording time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        self.data_timer.pause()
        self.net_timer.reset()

    def update_stats(self, top1_err, top5_err, loss, lr, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss.add_value(loss)
        self.lr = lr
        self.loss_total += loss * mb_size
        self.num_samples += mb_size

        if not self._cfg.DATA.MULTI_LABEL:
            # Current minibatch stats
            self.mb_top1_err.add_value(top1_err)
            self.mb_top5_err.add_value(top5_err)
            # Aggregate stats
            self.num_top1_mis += top1_err * mb_size
            self.num_top5_mis += top5_err * mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "dt": self.iter_timer.seconds(),
            "dt_data": self.data_timer.seconds(),
            "dt_net": self.net_timer.seconds(),
            "eta": eta,
            "loss": self.loss.get_win_median(),
            "lr": self.lr,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
        }
        if not self._cfg.DATA.MULTI_LABEL:
            stats["top1_err"] = self.mb_top1_err.get_win_median()
            stats["top5_err"] = self.mb_top5_err.get_win_median()
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "dt": self.iter_timer.seconds(),
            "dt_data": self.data_timer.seconds(),
            "dt_net": self.net_timer.seconds(),
            "eta": eta,
            "lr": self.lr,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
        }
        if not self._cfg.DATA.MULTI_LABEL:
            top1_err = self.num_top1_mis / self.num_samples
            top5_err = self.num_top5_mis / self.num_samples
            avg_loss = self.loss_total / self.num_samples
            stats["top1_err"] = top1_err
            stats["top5_err"] = top5_err
            stats["loss"] = avg_loss
        logging.log_json_stats(stats)
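
The three timers nest: iter_tic() restarts the iteration and data timers, data_toc() ends the data-loading measurement and starts the network timer, and iter_toc() pauses the iteration and network timers. A sketch of the driving loop in the SlowFast style (train_loader, cfg, cur_epoch, and train_step are placeholders):

train_meter = TrainMeter(len(train_loader), cfg)
train_meter.iter_tic()
for cur_iter, (inputs, labels) in enumerate(train_loader):
    train_meter.data_toc()  # data loading ends; network timing begins
    top1_err, top5_err, loss, lr = train_step(inputs, labels)  # placeholder
    train_meter.update_stats(top1_err, top5_err, loss, lr, inputs.size(0))
    train_meter.iter_toc()  # iteration and network timers pause
    train_meter.log_iter_stats(cur_epoch, cur_iter)
    train_meter.iter_tic()
train_meter.log_epoch_stats(cur_epoch)
train_meter.reset()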
Example #12
class TestMeter(object):
    """
    Perform the multi-view ensemble for testing: each video with a unique index
    will be sampled with multiple clips, and the predictions of the clips will
    be aggregated to produce the final prediction for the video.
    The accuracy is calculated with the given ground truth labels.
    """
    def __init__(
        self,
        num_videos,
        num_clips,
        num_cls,
        overall_iters,
        multi_label=False,
        ensemble_method="sum",
    ):
        """
        Construct tensors to store the predictions and labels. Expect to get
        num_clips predictions from each video, and calculate the metrics on
        num_videos videos.
        Args:
            num_videos (int): number of videos to test.
            num_clips (int): number of clips sampled from each video for
                aggregating the final prediction for the video.
            num_cls (int): number of classes for each prediction.
            overall_iters (int): overall iterations for testing.
            multi_label (bool): if True, use map as the metric.
            ensemble_method (str): method to perform the ensemble, options
                include "sum", and "max".
        """

        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        self.num_clips = num_clips
        self.overall_iters = overall_iters
        self.multi_label = multi_label
        self.ensemble_method = ensemble_method
        # Initialize tensors.
        self.video_preds = torch.zeros((num_videos, num_cls))
        if multi_label:
            self.video_preds -= 1e10

        self.video_labels = (torch.zeros(
            (num_videos, num_cls)) if multi_label else torch.zeros(
                (num_videos)).long())
        self.clip_count = torch.zeros((num_videos)).long()
        self.topk_accs = []
        self.stats = {}

        # Reset metric.
        self.reset()

    def reset(self):
        """
        Reset the metric.
        """
        self.clip_count.zero_()
        self.video_preds.zero_()
        if self.multi_label:
            self.video_preds -= 1e10
        self.video_labels.zero_()

    def update_stats(self, preds, labels, clip_ids):
        """
        Collect the predictions from the current batch and perform on-the-fly
        summation as the ensemble.
        Args:
            preds (tensor): predictions from the current batch. Dimension is
                N x C where N is the batch size and C is the channel size
                (num_cls).
            labels (tensor): the corresponding labels of the current batch.
                Dimension is N.
            clip_ids (tensor): clip indexes of the current batch, dimension is
                N.
        """
        for ind in range(preds.shape[0]):
            vid_id = int(clip_ids[ind]) // self.num_clips
            if self.video_labels[vid_id].sum() > 0:
                assert torch.equal(
                    self.video_labels[vid_id].type(torch.FloatTensor),
                    labels[ind].type(torch.FloatTensor),
                )
            self.video_labels[vid_id] = labels[ind]
            if self.ensemble_method == "sum":
                self.video_preds[vid_id] += preds[ind]
            elif self.ensemble_method == "max":
                self.video_preds[vid_id] = torch.max(self.video_preds[vid_id],
                                                     preds[ind])
            else:
                raise NotImplementedError(
                    "Ensemble Method {} is not supported".format(
                        self.ensemble_method))
            self.clip_count[vid_id] += 1

    def log_iter_stats(self, cur_iter):
        """
        Log the stats.
        Args:
            cur_iter (int): the current iteration of testing.
        """
        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "split": "test_iter",
            "cur_iter": "{}".format(cur_iter + 1),
            "eta": eta,
            "time_diff": self.iter_timer.seconds(),
        }
        logging.log_json_stats(stats)

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop recording time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        self.data_timer.pause()
        self.net_timer.reset()

    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if not all(self.clip_count == self.num_clips):
            logger.warning("clip count {} ~= num clips {}".format(
                ", ".join([
                    "{}: {}".format(i, k)
                    for i, k in enumerate(self.clip_count.tolist())
                ]),
                self.num_clips,
            ))

        self.stats = {"split": "test_final"}
        if self.multi_label:
            map = get_map(self.video_preds.cpu().numpy(),
                          self.video_labels.cpu().numpy())
            self.stats["map"] = map
        else:
            num_topks_correct = metrics.topks_correct(self.video_preds,
                                                      self.video_labels, ks)
            topks = [(x / self.video_preds.size(0)) * 100.0
                     for x in num_topks_correct]
            assert len({len(ks), len(topks)}) == 1
            for k, topk in zip(ks, topks):
                self.stats["top{}_acc".format(k)] = "{:.{prec}f}".format(
                    topk, prec=2)
        logging.log_json_stats(self.stats)
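
A sketch of the multi-view test loop this meter expects: each batch carries clip indices, and integer division by num_clips maps a clip back to its video slot (test_loader and model are placeholders):

test_meter.iter_tic()
for cur_iter, (inputs, labels, clip_ids) in enumerate(test_loader):
    test_meter.data_toc()
    preds = model(inputs)  # placeholder model call
    test_meter.iter_toc()
    test_meter.update_stats(preds.cpu(), labels.cpu(), clip_ids.cpu())
    test_meter.log_iter_stats(cur_iter)
    test_meter.iter_tic()
test_meter.finalize_metrics()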
Example #13
class ValMeter(object):
    """
    Measures validation stats.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        # Current minibatch errors (smoothed over a window).
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Min errors (over the full val set).
        self.min_top1_err = 100.0
        self.min_top5_err = 100.0
        # Number of misclassified examples.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def reset(self):
        """
        Reset the Meter.
        """
        self.iter_timer.reset()
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop recording time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_err, top5_err, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            mb_size (int): mini batch size.
        """
        self.mb_top1_err.add_value(top1_err)
        self.mb_top5_err.add_value(top5_err)
        self.num_top1_mis += top1_err * mb_size
        self.num_top5_mis += top5_err * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "top1_err": self.mb_top1_err.get_win_median(),
            "top5_err": self.mb_top5_err.get_win_median(),
            "mem": int(np.ceil(mem_usage)),
        }
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        top1_err = self.num_top1_mis / self.num_samples
        top5_err = self.num_top5_mis / self.num_samples
        self.min_top1_err = min(self.min_top1_err, top1_err)
        self.min_top5_err = min(self.min_top5_err, top5_err)
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "top1_err": top1_err,
            "top5_err": top5_err,
            "min_top1_err": self.min_top1_err,
            "min_top5_err": self.min_top5_err,
            "mem": int(np.ceil(mem_usage)),
        }
        logging.log_json_stats(stats)

        return self.min_top1_err
Example #14
class EPICTrainMeter(object):
    """
    Measure training stats.
    """
    def __init__(self, summary_writer, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_total = 0.0
        self.loss_verb = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_verb_total = 0.0
        self.loss_noun = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_noun_total = 0.0
        self.lr = None
        # Current minibatch accuracies (smoothed over a window).
        self.mb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        # Number of correctly classified examples.
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0

        self.tb_writer: SummaryWriter = summary_writer

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.loss_total = 0.0
        self.loss_verb.reset()
        self.loss_verb_total = 0.0
        self.loss_noun.reset()
        self.loss_noun_total = 0.0
        self.lr = None
        self.mb_top1_acc.reset()
        self.mb_top5_acc.reset()
        self.mb_verb_top1_acc.reset()
        self.mb_verb_top5_acc.reset()
        self.mb_noun_top1_acc.reset()
        self.mb_noun_top5_acc.reset()
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop recording time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_acc, top5_acc, loss, lr, mb_size):
        """
        Update the current stats.
        Args:
            top1_acc (float): top1 accuracy rate.
            top5_acc (float): top5 accuracy rate.
            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        # Current minibatch stats
        self.mb_verb_top1_acc.add_value(top1_acc[0])
        self.mb_verb_top5_acc.add_value(top5_acc[0])
        self.mb_noun_top1_acc.add_value(top1_acc[1])
        self.mb_noun_top5_acc.add_value(top5_acc[1])
        self.mb_top1_acc.add_value(top1_acc[2])
        self.mb_top5_acc.add_value(top5_acc[2])
        self.loss_verb.add_value(loss[0])
        self.loss_noun.add_value(loss[1])
        self.loss.add_value(loss[2])
        self.lr = lr
        # Aggregate stats
        self.num_verb_top1_cor += top1_acc[0] * mb_size
        self.num_verb_top5_cor += top5_acc[0] * mb_size
        self.num_noun_top1_cor += top1_acc[1] * mb_size
        self.num_noun_top5_cor += top5_acc[1] * mb_size
        self.num_top1_cor += top1_acc[2] * mb_size
        self.num_top5_cor += top5_acc[2] * mb_size
        self.loss_verb_total += loss[0] * mb_size
        self.loss_noun_total += loss[1] * mb_size
        self.loss_total += loss[2] * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": self.mb_verb_top1_acc.get_win_median(),
            "verb_top5_acc": self.mb_verb_top5_acc.get_win_median(),
            "noun_top1_acc": self.mb_noun_top1_acc.get_win_median(),
            "noun_top5_acc": self.mb_noun_top5_acc.get_win_median(),
            "top1_acc": self.mb_top1_acc.get_win_median(),
            "top5_acc": self.mb_top5_acc.get_win_median(),
            "verb_loss": self.loss_verb.get_win_median(),
            "noun_loss": self.loss_noun.get_win_median(),
            "loss": self.loss.get_win_median(),
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats)
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        verb_top1_acc = self.num_verb_top1_cor / self.num_samples
        verb_top5_acc = self.num_verb_top5_cor / self.num_samples
        noun_top1_acc = self.num_noun_top1_cor / self.num_samples
        noun_top5_acc = self.num_noun_top5_cor / self.num_samples
        top1_acc = self.num_top1_cor / self.num_samples
        top5_acc = self.num_top5_cor / self.num_samples
        avg_loss_verb = self.loss_verb_total / self.num_samples
        avg_loss_noun = self.loss_noun_total / self.num_samples
        avg_loss = self.loss_total / self.num_samples
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": verb_top1_acc,
            "verb_top5_acc": verb_top5_acc,
            "noun_top1_acc": noun_top1_acc,
            "noun_top5_acc": noun_top5_acc,
            "top1_acc": top1_acc,
            "top5_acc": top5_acc,
            "verb_loss": avg_loss_verb,
            "noun_loss": avg_loss_noun,
            "loss": avg_loss,
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats, False)
        logging.log_json_stats(stats)
Example #15
def benchmark_data(cfg):
    # Set up environment.
    setup_environment()
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Benchmark data loading with config:")
    logger.info(pprint.pformat(cfg))

    timer = Timer()
    dataloader = loader.construct_loader(cfg, "train")
    logger.info("Initialize loader using {:.2f} seconds.".format(
        timer.seconds()))
    batch_size = cfg.TRAIN.BATCH_SIZE
    log_period = cfg.BENCHMARK.LOG_PERIOD
    epoch_times = []
    # Test for a few epochs.
    for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS):
        timer = Timer()
        timer_epoch = Timer()
        iter_times = []
        for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)):
            if cur_iter > 0 and cur_iter % log_period == 0:
                iter_times.append(timer.seconds())
                vram = psutil.virtual_memory()
                logger.info(
                    "Epoch {}: {} iters ({} videos) in {:.2f} seconds. "
                    "RAM Usage: {:.2f}/{:.2f} GB.".format(
                        cur_epoch,
                        log_period,
                        log_period * batch_size,
                        iter_times[-1],
                        (vram.total - vram.available) / 1024**3,
                        vram.total / 1024**3,
                    ))
                timer.reset()
        epoch_times.append(timer_epoch.seconds())
        vram = psutil.virtual_memory()
        logger.info(
            "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. "
            "RAM Usage: {:.2f}/{:.2f} GB.".format(
                cur_epoch,
                len(dataloader),
                len(dataloader) * batch_size,
                epoch_times[-1],
                (vram.total - vram.available) / 1024**3,
                vram.total / 1024**3,
            ))
        logger.info(
            "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} "
            "(avg/std) seconds.".format(
                cur_epoch,
                log_period,
                log_period * batch_size,
                np.mean(iter_times),
                np.std(iter_times),
            ))
    logger.info("On average every epoch ({} videos) takes {:.2f}/{:.2f} "
                "(avg/std) seconds.".format(
                    len(dataloader) * batch_size,
                    np.mean(epoch_times),
                    np.std(epoch_times),
                ))
Example #16
class TrainMeter(object):
    """
    Measures training stats.
    """
    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.log_period = cfg.LOG_PERIOD

        self.infos = None
        self.num_samples = 0

    def init(self, keys):
        self.infos = {}
        for key in keys:
            self.infos[key] = ScalarMeter(self.log_period)

    def reset(self):
        """
        Reset the Meter.
        """
        # infos is lazily created on the first update_stats() call.
        if self.infos is not None:
            for v in self.infos.values():
                v.reset()

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop recording time.
        """
        self.iter_timer.pause()

    def update_stats(self, info_dict):
        """
        Update the current stats.
        Args:
            info_dict (dict): maps stat names (e.g. loss, psnr, lr) to scalar
                tensors for the current minibatch.
        """
        # Current minibatch stats
        if self.infos is None:
            self.init(info_dict.keys())
        # reduce from all gpus
        if self._cfg.NUM_GPUS > 1:
            for k, v in info_dict.items():
                info_dict[k] = du.all_reduce([v])[0]
        # synchronize from gpu to cpu
        info_dict = {k: v.item() for k, v in info_dict.items()}
        # log value into scalar meter
        for k, v in info_dict.items():
            self.infos[k].add_value(v)

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "mem": int(np.ceil(mem_usage)),
        }
        infos = {k: v.get_win_avg() for k, v in self.infos.items()}
        stats.update(infos)
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "mem": int(np.ceil(mem_usage)),
        }
        infos = {k: v.get_global_avg() for k, v in self.infos.items()}
        stats.update(infos)
        logging.log_json_stats(stats)
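
Unlike the fixed-field meters above, this variant lazily creates one ScalarMeter per key on the first update_stats() call. A usage sketch (train_loader, cfg, cur_epoch, and train_step are placeholders; values must be scalar tensors because update_stats() calls .item() on them):

meter = TrainMeter(len(train_loader), cfg)
meter.iter_tic()
for cur_iter, batch in enumerate(train_loader):
    loss, psnr = train_step(batch)  # placeholder step returning tensors
    meter.update_stats({"loss": loss, "psnr": psnr})
    meter.iter_toc()
    meter.log_iter_stats(cur_epoch, cur_iter)
    meter.iter_tic()
meter.log_epoch_stats(cur_epoch)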
Example #17
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS,
        resume=resume,
    ).get("iteration", -1) + 1)
    if cfg.SOLVER.RESET_ITER:
        logger.info('Reset loaded iteration. Start training from iteration 0.')
        start_iter = 0
    max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])


    mapper = DatasetMapper(cfg, True) if cfg.INPUT.CUSTOM_AUG == '' else \
        DatasetMapper(cfg, True, augmentations=build_custom_augmentation(cfg, True))
    if cfg.DATALOADER.SAMPLER_TRAIN in [
            'TrainingSampler', 'RepeatFactorTrainingSampler'
    ]:
        data_loader = build_detection_train_loader(cfg, mapper=mapper)
    else:
        from centernet.data.custom_dataset_dataloader import build_custom_train_loader
        data_loader = build_custom_train_loader(cfg, mapper=mapper)

    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        step_timer = Timer()
        data_timer = Timer()
        start_time = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            data_time = data_timer.seconds()
            storage.put_scalars(data_time=data_time)
            step_timer.reset()
            iteration = iteration + 1
            storage.step()
            loss_dict = model(data)

            losses = sum(loss for k, loss in loss_dict.items())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() \
                for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)

            step_time = step_timer.seconds()
            storage.put_scalars(time=step_time)
            data_timer.reset()
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and \
                (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

        total_time = time.perf_counter() - start_time
        logger.info("Total training time: {}".format(
            str(datetime.timedelta(seconds=int(total_time)))))
Example #18
def main(cfg: DictConfig) -> None:

    if "experiments" in cfg.keys():
        cfg = OmegaConf.merge(cfg, cfg.experiments)

    if "debug" in cfg.keys():
        logger.info(f"Run script in debug")
        cfg = OmegaConf.merge(cfg, cfg.debug)

    # A logger for this file
    logger = logging.getLogger(__name__)

    # NOTE: hydra causes the python file to run in hydra.run.dir by default
    logger.info(f"Run script in {HydraConfig.get().run.dir}")

    writer = SummaryWriter(log_dir=cfg.train.tensorboard_dir)

    checkpoints_dir = Path(cfg.train.checkpoints_dir)
    if not checkpoints_dir.exists():
        checkpoints_dir.mkdir(parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    image_shape = (cfg.train.channels, cfg.train.image_height,
                   cfg.train.image_width)

    # NOTE: With hydra, the python file runs in hydra.run.dir by default, so set the dataset path to a full path or an appropriate relative path
    dataset_path = Path(cfg.dataset.root) / cfg.dataset.frames
    split_path = Path(cfg.dataset.root) / cfg.dataset.split_file
    assert dataset_path.exists(), "Video image folder not found"
    assert (split_path.exists()
            ), "The file that describes the train/test split was not found."

    # Define training set
    train_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=True,
    )

    # Define train dataloader
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=True,
        num_workers=cfg.train.num_workers,
    )

    # Define test set
    test_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=False,
    )

    # Define test dataloader
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=False,
        num_workers=cfg.train.num_workers,
    )

    # Classification criterion
    criterion = nn.CrossEntropyLoss().to(device)

    # Define network
    model = CNNLSTM(
        num_classes=train_dataset.num_classes,
        latent_dim=cfg.train.latent_dim,
        lstm_layers=cfg.train.lstm_layers,
        hidden_dim=cfg.train.hidden_dim,
        bidirectional=cfg.train.bidirectional,
        attention=cfg.train.attention,
    )
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    checkpointer = Checkpointer(
        model,
        optimizer=optimizer,
        # scheduler=scheduler,
        save_dir=cfg.train.checkpoints_dir,
        save_to_disk=True,
    )

    if cfg.train.resume:
        if not checkpointer.has_checkpoint():
            start_epoch = 0
        else:
            ckpt = checkpointer.resume_or_load("", resume=True)
            start_epoch = ckpt["epoch"]
            model.to(device)
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)
    elif cfg.train.checkpoint_model != "":
        ckpt = torch.load(cfg.train.checkpoint_model, map_location="cpu")
        model.load_state_dict(ckpt["model"])
        model.to(device)
        start_epoch = 0
    else:
        start_epoch = 0

    for epoch in range(start_epoch, cfg.train.num_epochs):
        epoch += 1
        epoch_metrics = {"loss": [], "acc": []}
        timer = Timer()
        for batch_i, (X, y) in enumerate(train_dataloader):
            batch_i += 1
            if X.size(0) == 1:
                continue

            image_sequences = Variable(X.to(device), requires_grad=True)
            labels = Variable(y.to(device), requires_grad=False)

            optimizer.zero_grad()

            # Reset LSTM hidden state
            model.lstm.reset_hidden_state()

            # Get sequence predictions
            predictions = model(image_sequences)

            # Compute metrics
            loss = criterion(predictions, labels)
            acc = (
                predictions.detach().argmax(1) == labels).cpu().numpy().mean()

            loss.backward()
            optimizer.step()

            # Keep track of epoch metrics
            epoch_metrics["loss"].append(loss.item())
            epoch_metrics["acc"].append(acc)

            # Determine approximate time left
            batches_done = (epoch - 1) * len(train_dataloader) + (batch_i - 1)
            batches_left = cfg.train.num_epochs * len(
                train_dataloader) - batches_done
            time_left = datetime.timedelta(seconds=batches_left *
                                           timer.seconds())
            time_iter = round(timer.seconds(), 3)
            timer.reset()

            logger.info(
                f'Training - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(train_dataloader)}] [Loss: {np.mean(epoch_metrics["loss"]):.3f}] [Acc: {np.mean(epoch_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
            )

            # Empty cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        writer.add_scalar("train/loss", np.mean(epoch_metrics["loss"]), epoch)
        writer.add_scalar("train/acc", np.mean(epoch_metrics["acc"]), epoch)

        def test_model(epoch):
            """ Evaluate the model on the test set """
            model.eval()
            test_metrics = {"loss": [], "acc": []}
            timer = Timer()
            for batch_i, (X, y) in enumerate(test_dataloader):
                batch_i += 1
                image_sequences = Variable(X.to(device), requires_grad=False)
                labels = Variable(y, requires_grad=False).to(device)

                with torch.no_grad():
                    # Reset LSTM hidden state
                    model.lstm.reset_hidden_state()
                    # Get sequence predictions
                    predictions = model(image_sequences)

                # Compute metrics
                loss = criterion(predictions, labels)
                acc = (predictions.detach().argmax(1) == labels
                       ).cpu().numpy().mean()

                # Keep track of loss and accuracy
                test_metrics["loss"].append(loss.item())
                test_metrics["acc"].append(acc)

                # Determine approximate time left
                batches_done = batch_i - 1
                batches_left = len(test_dataloader) - batches_done
                time_left = datetime.timedelta(seconds=batches_left *
                                               timer.seconds())
                time_iter = round(timer.seconds(), 3)
                timer.reset()

                # Log test performance
                logger.info(
                    f'Testing - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(test_dataloader)}] [Loss: {np.mean(test_metrics["loss"]):.3f}] [Acc: {np.mean(test_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
                )

            writer.add_scalar("test/loss", np.mean(test_metrics["loss"]),
                              epoch)
            writer.add_scalar("test/acc", np.mean(test_metrics["acc"]), epoch)

            model.train()

        # Evaluate the model on the test set
        test_model(epoch)

        # Save model checkpoint
        if epoch % cfg.train.checkpoint_interval == 0:
            checkpointer.save(f"checkpoint_{epoch:04}", epoch=epoch)

    writer.close()
Example #19
class EPICValMeter(object):
    """
    Measures validation stats.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        # Current minibatch accuracies (smoothed over a window).
        self.mb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        # Max accuracies (over the full val set).
        self.max_top1_acc = 0.0
        self.max_top5_acc = 0.0
        self.max_verb_top1_acc = 0.0
        self.max_verb_top5_acc = 0.0
        self.max_noun_top1_acc = 0.0
        self.max_noun_top5_acc = 0.0
        # Number of correctly classified examples.
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0
        self.all_verb_preds = []
        self.all_verb_labels = []
        self.all_noun_preds = []
        self.all_noun_labels = []
        self.output_dir = cfg.OUTPUT_DIR

    def reset(self):
        """
        Reset the Meter.
        """
        self.iter_timer.reset()
        self.mb_top1_acc.reset()
        self.mb_top5_acc.reset()
        self.mb_verb_top1_acc.reset()
        self.mb_verb_top5_acc.reset()
        self.mb_noun_top1_acc.reset()
        self.mb_noun_top5_acc.reset()
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0
        self.all_verb_preds = []
        self.all_verb_labels = []
        self.all_noun_preds = []
        self.all_noun_labels = []

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop recording time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        self.data_timer.pause()
        self.net_timer.reset()

    def update_stats(self, top1_acc, top5_acc, mb_size):
        """
        Update the current stats.
        Args:
            top1_acc (float): top1 accuracy rate.
            top5_acc (float): top5 accuracy rate.
            mb_size (int): mini batch size.
        """
        self.mb_verb_top1_acc.add_value(top1_acc[0])
        self.mb_verb_top5_acc.add_value(top5_acc[0])
        self.mb_noun_top1_acc.add_value(top1_acc[1])
        self.mb_noun_top5_acc.add_value(top5_acc[1])
        self.mb_top1_acc.add_value(top1_acc[2])
        self.mb_top5_acc.add_value(top5_acc[2])
        self.num_verb_top1_cor += top1_acc[0] * mb_size
        self.num_verb_top5_cor += top5_acc[0] * mb_size
        self.num_noun_top1_cor += top1_acc[1] * mb_size
        self.num_noun_top5_cor += top5_acc[1] * mb_size
        self.num_top1_cor += top1_acc[2] * mb_size
        self.num_top5_cor += top5_acc[2] * mb_size
        self.num_samples += mb_size

    def update_predictions(self, preds, labels):
        """
        Update predictions and labels.
        Args:
            preds (tuple): (verb, noun) model output predictions.
            labels (tuple): (verb, noun) labels.
        """
        # TODO: merge update_prediction with update_stats.
        self.all_verb_preds.append(preds[0])
        self.all_verb_labels.append(labels[0])
        self.all_noun_preds.append(preds[1])
        self.all_noun_labels.append(labels[1])

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": self.mb_verb_top1_acc.get_win_median(),
            "verb_top5_acc": self.mb_verb_top5_acc.get_win_median(),
            "noun_top1_acc": self.mb_noun_top1_acc.get_win_median(),
            "noun_top5_acc": self.mb_noun_top5_acc.get_win_median(),
            "top1_acc": self.mb_top1_acc.get_win_median(),
            "top5_acc": self.mb_top5_acc.get_win_median(),
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
        }
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        verb_top1_acc = self.num_verb_top1_cor / self.num_samples
        verb_top5_acc = self.num_verb_top5_cor / self.num_samples
        noun_top1_acc = self.num_noun_top1_cor / self.num_samples
        noun_top5_acc = self.num_noun_top5_cor / self.num_samples
        top1_acc = self.num_top1_cor / self.num_samples
        top5_acc = self.num_top5_cor / self.num_samples
        self.max_verb_top1_acc = max(self.max_verb_top1_acc, verb_top1_acc)
        self.max_verb_top5_acc = max(self.max_verb_top5_acc, verb_top5_acc)
        self.max_noun_top1_acc = max(self.max_noun_top1_acc, noun_top1_acc)
        self.max_noun_top5_acc = max(self.max_noun_top5_acc, noun_top5_acc)
        is_best_epoch = top1_acc > self.max_top1_acc
        self.max_top1_acc = max(self.max_top1_acc, top1_acc)
        self.max_top5_acc = max(self.max_top5_acc, top5_acc)
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "verb_top1_acc": verb_top1_acc,
            "verb_top5_acc": verb_top5_acc,
            "noun_top1_acc": noun_top1_acc,
            "noun_top5_acc": noun_top5_acc,
            "top1_acc": top1_acc,
            "top5_acc": top5_acc,
            "max_verb_top1_acc": self.max_verb_top1_acc,
            "max_verb_top5_acc": self.max_verb_top5_acc,
            "max_noun_top1_acc": self.max_noun_top1_acc,
            "max_noun_top5_acc": self.max_noun_top5_acc,
            "max_top1_acc": self.max_top1_acc,
            "max_top5_acc": self.max_top5_acc,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
        }
        logging.log_json_stats(stats)

        return is_best_epoch, {"top1_acc": top1_acc, "verb_top1_acc": verb_top1_acc, "noun_top1_acc": noun_top1_acc}
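The meters above recover exact epoch-level accuracies from per-minibatch rates by weighting each rate with its batch size before dividing by the total sample count. A minimal standalone sketch of that bookkeeping, with made-up numbers:

# Sketch: recovering an exact epoch accuracy from per-minibatch accuracy
# rates, as update_stats()/log_epoch_stats() do above. Multiplying each
# rate by its minibatch size turns it back into a correct-sample count,
# so unevenly sized batches are handled exactly.
batches = [(75.0, 32), (81.25, 32), (90.0, 20)]  # (top1_acc in %, mb_size)

num_top1_cor = 0.0
num_samples = 0
for acc, mb_size in batches:
    num_top1_cor += acc * mb_size
    num_samples += mb_size

print(num_top1_cor / num_samples)  # ~80.95, exact over all 84 samples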
Example #20
0
class TrainMeter(object):
    """
    Measure training stats.
    """

    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        # self.loss = ScalarMeter(cfg.LOG_PERIOD)
        # self.loss_total = 0.0
        self.lr = None
        
        # Current minibatch errors (smoothed over a window).
        # self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        # self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Number of misclassified examples.
        # self.num_top1_mis = 0
        # self.num_top5_mis = 0
    
        self.num_samples = 0
        self.stats = {}

    def reset(self):
        """
        Reset the Meter.
        """
        # self.loss.reset()
        # self.loss_total = 0.0
        self.lr = None
        # if self.mb_top1_err:
        #     self.mb_top1_err.reset()
        #     self.mb_top5_err.reset()
        #     self.num_top1_mis = 0
        #     self.num_top5_mis = 0
        self.num_samples = 0

        for meter in self.stats.values():
            meter.reset()

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
 
    def update_stats(self, lr, mb_size, **kwargs):
        """
        Update the current stats. Any extra keyword argument is tracked in a
        lazily created ScalarMeter keyed by its name.
        Args:
            lr (float): learning rate.
            mb_size (int): mini batch size.
            **kwargs (float): additional scalar stats to track (e.g. loss).
        """
        # Current minibatch stats
        # if self.mb_top1_err:
        #     self.mb_top1_err.add_value(top1_err)
        #     self.mb_top5_err.add_value(top5_err)
        #     self.num_top1_mis += top1_err * mb_size
        #     self.num_top5_mis += top5_err * mb_size
        
        for k, v in kwargs.items():
            if k not in self.stats:
                self.stats[k] = ScalarMeter(self._cfg.LOG_PERIOD)
            self.stats[k].add_value(v)

        # self.loss.add_value(loss)

        self.lr = lr
        # Aggregate stats
        
        # self.loss_total += loss * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1)
        )
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_diff": self.iter_timer.seconds(),
            "time_left": eta,
            # "top1_err": self.mb_top1_err.get_win_median(),
            # "top5_err": self.mb_top5_err.get_win_median(),
            # "loss": self.loss.get_win_median(),
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
            
        }
        for k, v in self.stats.items():
            stats[k] = v.get_win_median()
        # if self.mb_top1_err:
        #     stats = {**stats, **{"top1_err": self.mb_top1_err.get_win_median(),
        #                         "top5_err": self.mb_top5_err.get_win_median()}}
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters
        )
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        # top1_err = self.num_top1_mis / self.num_samples
        # top5_err = self.num_top5_mis / self.num_samples
        # avg_loss = self.loss_total / self.num_samples
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "time_left": eta,
            # "top1_err": top1_err,
            # "top5_err": top5_err,
            # "loss": avg_loss,
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        for k, v in self.stats.items():
            stats[k] = v.get_global_avg()
        # if self.mb_top1_err:
        #     stats = {**stats, **{"top1_err": top1_err,
        #                 "top5_err": top5_err}}
        logging.log_json_stats(stats)
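A plausible call pattern for the kwargs-driven meter above: any keyword passed to update_stats becomes a ScalarMeter on first use, so callers can log new metrics without touching the class. The cfg object and the metric names here are illustrative:

# Hypothetical usage of the kwargs-driven TrainMeter above (cfg is assumed
# to come from the caller; metric names are illustrative).
meter = TrainMeter(epoch_iters=1000, cfg=cfg)
meter.iter_tic()
# ... forward/backward pass ...
meter.iter_toc()
meter.update_stats(lr=0.1, mb_size=64, loss=1.234, top1_err=42.0)
meter.log_iter_stats(cur_epoch=0, cur_iter=9)  # logs only at LOG_PERIOD boundaries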
Example #21
0
class EPICValMeter(object):
    """
    Measures validation stats.
    """
    def __init__(self, summary_writer, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        # Current minibatch accuracies (smoothed over a window).
        self.mb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        # Max accuracies (over the full val set).
        self.max_top1_acc = 0.0
        self.max_top5_acc = 0.0
        self.max_verb_top1_acc = 0.0
        self.max_verb_top5_acc = 0.0
        self.max_noun_top1_acc = 0.0
        self.max_noun_top5_acc = 0.0
        # Number of correctly classified examples.
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0

        self.tb_writer: SummaryWriter = summary_writer

    def reset(self):
        """
        Reset the Meter.
        """
        self.iter_timer.reset()
        self.mb_top1_acc.reset()
        self.mb_top5_acc.reset()
        self.mb_verb_top1_acc.reset()
        self.mb_verb_top5_acc.reset()
        self.mb_noun_top1_acc.reset()
        self.mb_noun_top5_acc.reset()
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_acc, top5_acc, mb_size):
        """
        Update the current stats.
        Args:
            top1_acc (tuple): (verb, noun, action) top-1 accuracy rates.
            top5_acc (tuple): (verb, noun, action) top-5 accuracy rates.
            mb_size (int): mini batch size.
        """
        self.mb_verb_top1_acc.add_value(top1_acc[0])
        self.mb_verb_top5_acc.add_value(top5_acc[0])
        self.mb_noun_top1_acc.add_value(top1_acc[1])
        self.mb_noun_top5_acc.add_value(top5_acc[1])
        self.mb_top1_acc.add_value(top1_acc[2])
        self.mb_top5_acc.add_value(top5_acc[2])
        self.num_verb_top1_cor += top1_acc[0] * mb_size
        self.num_verb_top5_cor += top5_acc[0] * mb_size
        self.num_noun_top1_cor += top1_acc[1] * mb_size
        self.num_noun_top5_cor += top5_acc[1] * mb_size
        self.num_top1_cor += top1_acc[2] * mb_size
        self.num_top5_cor += top5_acc[2] * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": self.mb_verb_top1_acc.get_win_median(),
            "verb_top5_acc": self.mb_verb_top5_acc.get_win_median(),
            "noun_top1_acc": self.mb_noun_top1_acc.get_win_median(),
            "noun_top5_acc": self.mb_noun_top5_acc.get_win_median(),
            "top1_acc": self.mb_top1_acc.get_win_median(),
            "top5_acc": self.mb_top5_acc.get_win_median(),
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats)
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        verb_top1_acc = self.num_verb_top1_cor / self.num_samples
        verb_top5_acc = self.num_verb_top5_cor / self.num_samples
        noun_top1_acc = self.num_noun_top1_cor / self.num_samples
        noun_top5_acc = self.num_noun_top5_cor / self.num_samples
        top1_acc = self.num_top1_cor / self.num_samples
        top5_acc = self.num_top5_cor / self.num_samples
        self.max_verb_top1_acc = max(self.max_verb_top1_acc, verb_top1_acc)
        self.max_verb_top5_acc = max(self.max_verb_top5_acc, verb_top5_acc)
        self.max_noun_top1_acc = max(self.max_noun_top1_acc, noun_top1_acc)
        self.max_noun_top5_acc = max(self.max_noun_top5_acc, noun_top5_acc)
        is_best_epoch = top1_acc > self.max_top1_acc
        self.max_top1_acc = max(self.max_top1_acc, top1_acc)
        self.max_top5_acc = max(self.max_top5_acc, top5_acc)
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "verb_top1_acc": verb_top1_acc,
            "verb_top5_acc": verb_top5_acc,
            "noun_top1_acc": noun_top1_acc,
            "noun_top5_acc": noun_top5_acc,
            "top1_acc": top1_acc,
            "top5_acc": top5_acc,
            "max_verb_top1_acc": self.max_verb_top1_acc,
            "max_verb_top5_acc": self.max_verb_top5_acc,
            "max_noun_top1_acc": self.max_noun_top1_acc,
            "max_noun_top5_acc": self.max_noun_top5_acc,
            "max_top1_acc": self.max_top1_acc,
            "max_top5_acc": self.max_top5_acc,
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats, False)
        logging.log_json_stats(stats)

        return is_best_epoch
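log_to_tensorboard is called above but not defined in this snippet. A minimal sketch of what such a helper might look like, assuming numeric entries in the stats dict are written as scalars and the trailing boolean at the call sites distinguishes per-iteration from per-epoch logging:

from torch.utils.tensorboard import SummaryWriter

_steps = {"iter": 0, "epoch": 0}  # simple global step counters (assumption)

def log_to_tensorboard(writer: SummaryWriter, stats: dict, is_iter: bool = True):
    # Write every numeric stat as a scalar; skip formatted strings like
    # "epoch": "3/30" or "gpu_mem": "1.23G".
    key = "iter" if is_iter else "epoch"
    prefix = stats.get("_type", "val")
    for name, value in stats.items():
        if isinstance(value, (int, float)):
            writer.add_scalar("{}/{}".format(prefix, name), value, _steps[key])
    _steps[key] += 1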
Example #22
0
class ValMeter(object):
    """
    Measures validation stats.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        self.num_samples = 0
        self.stats = {}

    def reset(self):
        """
        Reset the Meter.
        """
        self.iter_timer.reset()
        self.num_samples = 0
        for meter in self.stats.values():
            meter.reset()

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
 
    def update_stats(self, mb_size, **kwargs):
        """
        Update the current stats. Any extra keyword argument is tracked in a
        lazily created ScalarMeter keyed by its name.
        Args:
            mb_size (int): mini batch size.
            **kwargs (float): additional scalar stats to track (e.g. top1_err).
        """

        self.num_samples += mb_size

        for k, v in kwargs.items():
            if k not in self.stats:
                self.stats[k] = ScalarMeter(self._cfg.LOG_PERIOD)
            self.stats[k].add_value(v)

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "time_left": eta,
            # "top1_err": self.mb_top1_err.get_win_median(),
            # "top5_err": self.mb_top5_err.get_win_median(),
            "mem": int(np.ceil(mem_usage)),
        }
        for k, v in self.stats.items():
            stats[k] = v.get_win_median()

        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """

        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "mem": int(np.ceil(mem_usage)),
        }
        for k, v in self.stats.items():
            stats[k] = v.get_global_avg()
        logging.log_json_stats(stats)
Example #23
0
class TestMeter(object):
    """
    Perform the multi-view ensemble for testing: each video with a unique index
    will be sampled with multiple clips, and the predictions of the clips will
    be aggregated to produce the final prediction for the video.
    The accuracy is calculated with the given ground truth labels.
    """

    def __init__(self, num_videos, num_clips, num_cls, overall_iters):
        """
        Construct tensors to store the predictions and labels. Expect to get
        num_clips predictions from each video, and calculate the metrics on
        num_videos videos.
        Args:
            num_videos (int): number of videos to test.
            num_clips (int): number of clips sampled from each video for
                aggregating the final prediction for the video.
            num_cls (int): number of classes for each prediction.
            overall_iters (int): overall iterations for testing.
        """

        self.iter_timer = Timer()
        self.num_clips = num_clips
        self.overall_iters = overall_iters
        # Initialize tensors.
        self.video_preds = torch.zeros((num_videos, num_cls))
        self.video_labels = torch.zeros((num_videos)).long()
        self.clip_count = torch.zeros((num_videos)).long()
        # Reset metric.
        self.reset()

    def reset(self):
        """
        Reset the metric.
        """
        self.clip_count.zero_()
        self.video_preds.zero_()
        self.video_labels.zero_()

    def update_stats(self, preds, labels, clip_ids):
        """
        Collect the predictions from the current batch and perform on-the-fly
        summation as an ensemble.
        Args:
            preds (tensor): predictions from the current batch. Dimension is
                N x C where N is the batch size and C is the channel size
                (num_cls).
            labels (tensor): the corresponding labels of the current batch.
                Dimension is N.
            clip_ids (tensor): clip indexes of the current batch, dimension is
                N.
        """
        for ind in range(preds.shape[0]):
            vid_id = int(clip_ids[ind]) // self.num_clips
            self.video_labels[vid_id] = labels[ind]
            self.video_preds[vid_id] += preds[ind]
            self.clip_count[vid_id] += 1

    def log_iter_stats(self, cur_iter):
        """
        Log the stats.
        Args:
            cur_iter (int): the current iteration of testing.
        """
        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "split": "test_iter",
            "cur_iter": "{}".format(cur_iter + 1),
            "eta": eta,
            "time_diff": self.iter_timer.seconds(),
        }
        logging.log_json_stats(stats)

    def iter_tic(self):
        self.iter_timer.reset()

    def iter_toc(self):
        self.iter_timer.pause()

    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        Args:
            ks (tuple): list of top-k values for topk_accuracies. For example,
                ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if not all(self.clip_count == self.num_clips):
            logger.warning(
                "clip count {} ~= num clips {}".format(
                    self.clip_count, self.num_clips
                )
            )
            logger.warning(self.clip_count)

        num_topks_correct = metrics.topks_correct(
            self.video_preds, self.video_labels, ks
        )
        topks = [
            (x / self.video_preds.size(0)) * 100.0 for x in num_topks_correct
        ]
        assert len({len(ks), len(topks)}) == 1
        stats = {"split": "test_final"}
        for k, topk in zip(ks, topks):
            stats["top{}_acc".format(k)] = "{:.{prec}f}".format(topk, prec=2)
        logging.log_json_stats(stats)
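The ensemble bookkeeping above hinges on the clip-id convention: clip i of video v arrives with clip_id = v * num_clips + i, so integer division recovers the video index and per-clip scores are summed in place. A standalone sketch with toy shapes:

import torch

# Standalone sketch of TestMeter's multi-view ensemble with toy data.
num_videos, num_clips, num_cls = 2, 3, 4
video_preds = torch.zeros(num_videos, num_cls)

clip_ids = torch.arange(num_videos * num_clips)           # 0..5
clip_preds = torch.rand(num_videos * num_clips, num_cls)  # toy clip logits

for ind in range(clip_preds.shape[0]):
    vid_id = int(clip_ids[ind]) // num_clips  # clip -> video index
    video_preds[vid_id] += clip_preds[ind]    # on-the-fly summation

print(video_preds.argmax(dim=1))  # final per-video predictions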
Example #24
0
class TrainMeter(object):
    def __init__(self, epoch_iters, cfg):
        """

        :param epoch_iters: number of iterations in one epoch
        :param cfg: configs (CfgNode)
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        # self.loss=ScalarMeter(cfg.LOG_PERIOD)
        self.mse_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.entropy_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.combine_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.iter_timer = Timer()
        self.lr = None
        # self.loss_total=0.0
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        # self.num_samples=0

    def reset(self):
        """
        reset meter
        :return:
        """
        self.lr = None
        self.mse_loss.reset()
        self.entropy_loss.reset()
        self.combine_loss.reset()
        # self.loss_total=0.0

    def iter_start(self):
        """
        start to record time
        :return:
        """
        self.iter_timer.reset()

    def iter_stop(self):
        """
        stop recording time
        :return:
        """
        self.iter_timer.pause()

    def update_stats(self, mse_loss, entropy_loss, combine_loss, lr, mb_size):
        """
        update the current stats
        :param mse_loss: MSE loss value
        :param entropy_loss: entropy loss value
        :param combine_loss: combined loss value
        :param lr: learning rate
        :param mb_size: mini batch size (currently unused)
        """
        self.mse_loss.add_value(mse_loss)
        self.entropy_loss.add_value(entropy_loss)
        self.combine_loss.add_value(combine_loss)
        self.lr = lr
        # self.loss_total+=loss*mb_size
        # self.num_samples+=mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats for cur iteration
        :param cur_epoch: current epoch index
        :param cur_iter: current iteration index
        :return:
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))

        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time": self.iter_timer.seconds(),
            "eta": eta,
            "mse_loss": self.mse_loss.get_win_median(),
            "entropy_loss": self.entropy_loss.get_win_median(),
            "combine_loss": self.combine_loss.get_win_median(),
            "lr": self.lr,
            "gpu":
            "{:.2f}GB".format(torch.cuda.max_memory_allocated() / 1024**3)
        }
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """

        :param cur_epoch:
        :return:
        """
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "mse_loss": self.mse_loss.get_win_avg(),
            "entropy_loss": self.entropy_loss.get_win_avg(),
            "combine_loss": self.combine_loss.get_win_avg(),
            "gpu_mem": "{:.2f} GB".format(
                torch.cuda.max_memory_allocated() / 1024**3),
        }
        logging.log_json_stats(stats)
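This meter reports windowed medians in per-iteration logs but windowed averages in the epoch summary. The distinction matters for spiky losses, as a toy comparison shows:

from collections import deque

import numpy as np

# Toy comparison of the two window statistics used above: the median
# (per-iteration logs) is robust to a loss spike that drags the mean
# (epoch summary) upward.
window = deque([0.50, 0.48, 9.99, 0.47, 0.49], maxlen=5)
print("win_median:", np.median(window))  # 0.49
print("win_avg:", np.mean(window))       # ~2.39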
Example #25
0
class IterationTimer(HookBase):
    """
    Track the time spent for each iteration (each run_step call in the trainer).
    Print a summary in the end of training.

    This hook uses the time between the call to its :meth:`before_step`
    and :meth:`after_step` methods.
    Under the convention that :meth:`before_step` of all hooks should only
    take negligible amount of time, the :class:`IterationTimer` hook should be
    placed at the beginning of the list of hooks to obtain accurate timing.
    """

    def __init__(self, warmup_iter=3):
        """
        Args:
            warmup_iter (int): the number of iterations at the beginning to exclude
                from timing.
        """
        self._warmup_iter = warmup_iter
        self._step_timer = Timer()
        self._start_time = time.perf_counter()
        self._total_timer = Timer()

    def before_train(self):
        self._start_time = time.perf_counter()
        self._total_timer.reset()
        self._total_timer.pause()

    def after_train(self):
        logger = logging.getLogger(__name__)
        total_time = time.perf_counter() - self._start_time
        total_time_minus_hooks = self._total_timer.seconds()
        hook_time = total_time - total_time_minus_hooks

        num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter

        if num_iter > 0 and total_time_minus_hooks > 0:
            # Speed is meaningful only after warmup
            # NOTE this format is parsed by grep in some scripts
            logger.info(
                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
                    num_iter,
                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
                    total_time_minus_hooks / num_iter,
                )
            )

        logger.info(
            "Total training time: {} ({} on hooks)".format(
                str(datetime.timedelta(seconds=int(total_time))),
                str(datetime.timedelta(seconds=int(hook_time))),
            )
        )

    def before_step(self):
        self._step_timer.reset()
        self._total_timer.resume()

    def after_step(self):
        # +1 because we're in after_step
        iter_done = self.trainer.iter - self.trainer.start_iter + 1
        if iter_done >= self._warmup_iter:
            sec = self._step_timer.seconds()
            self.trainer.storage.put_scalars(time=sec)
        else:
            self._start_time = time.perf_counter()
            self._total_timer.reset()

        self._total_timer.pause()
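The hook above excludes the first warmup_iter steps by restarting its timers until warmup completes, so one-off startup costs (allocator warmup, cudnn autotune) do not skew seconds-per-iteration. The same idea in standalone form:

import time

# Standalone sketch of warmup-excluded timing as IterationTimer does it:
# keep restarting the clock until `warmup` iterations have passed.
warmup, total_iters = 3, 10
start = time.perf_counter()
for it in range(total_iters):
    time.sleep(0.01)  # stand-in for one training step
    if it + 1 < warmup:
        start = time.perf_counter()  # still warming up: restart the clock

timed_iters = total_iters - warmup + 1
print((time.perf_counter() - start) / timed_iters, "s / it (post-warmup)")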
Example #26
0
class TestMeter(object):
    """
    Perform the multi-view ensemble for testing: each video with a unique index
    will be sampled with multiple clips, and the predictions of the clips will
    be aggregated to produce the final prediction for the video.
    The accuracy is calculated with the given ground truth labels.
    """
    def __init__(self, num_videos, num_clips, num_cls, overall_iters, isDemo):
        """
        Construct tensors to store the predictions and labels. Expect to get
        num_clips predictions from each video, and calculate the metrics on
        num_videos videos.
        Args:
            num_videos (int): number of videos to test.
            num_clips (int): number of clips sampled from each video for
                aggregating the final prediction for the video.
            num_cls (int): number of classes for each prediction.
            overall_iters (int): overall iterations for testing.
            isDemo (bool): if True, dump renormalized jogging-class
                probabilities to tmp/probability.npy in finalize_metrics.
        """

        self.iter_timer = Timer()
        self.num_clips = num_clips
        self.overall_iters = overall_iters
        # Initialize tensors.
        self.video_preds = torch.zeros((num_videos, num_cls))
        self.video_labels = torch.zeros((num_videos)).long()
        self.clip_count = torch.zeros((num_videos)).long()
        # Reset metric.
        self.reset()
        self.isDemo = isDemo

    def reset(self):
        """
        Reset the metric.
        """
        self.clip_count.zero_()
        self.video_preds.zero_()
        self.video_labels.zero_()

    def update_stats(self, preds, labels, clip_ids):
        """
        Collect the predictions from the current batch and perform on-the-fly
        summation as an ensemble.
        Args:
            preds (tensor): predictions from the current batch. Dimension is
                N x C where N is the batch size and C is the channel size
                (num_cls).
            labels (tensor): the corresponding labels of the current batch.
                Dimension is N.
            clip_ids (tensor): clip indexes of the current batch, dimension is
                N.
        """
        for ind in range(preds.shape[0]):
            vid_id = int(clip_ids[ind]) // self.num_clips
            self.video_labels[vid_id] = labels[ind]
            self.video_preds[vid_id] += preds[ind]
            self.clip_count[vid_id] += 1

    def log_iter_stats(self, cur_iter):
        """
        Log the stats.
        Args:
            cur_iter (int): the current iteration of testing.
        """
        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "split": "test_iter",
            "cur_iter": "{}".format(cur_iter + 1),
            #"eta": eta,
            #"time_diff": self.iter_timer.seconds(),
        }
        #logging.log_json_stats(stats)

    def iter_tic(self):
        self.iter_timer.reset()

    def iter_toc(self):
        self.iter_timer.pause()

    def finalize_metrics(self, ks=(1, 2)):
        """
        Calculate and log the final ensembled metrics.
        Args:
            ks (tuple): list of top-k values for topk_accuracies. For example,
                ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if self.isDemo:
            preds_numpy = self.video_preds.clone()
            # Per-video class probabilities, shape (num_videos, num_cls).
            normalize = np.array(softmax(preds_numpy.cpu().numpy()))
            jogging_label = 21
            # Per-video probabilities sorted in descending order.
            sort_p = [sorted(p, reverse=True) for p in normalize]

            # Transpose to (num_cls, num_videos) for per-class indexing.
            probability = np.transpose(normalize)

            for i, v in enumerate(probability[jogging_label]):
                top1_v = sort_p[i][0]
                top2_v = sort_p[i][1]
                if v == top1_v or v == top2_v:
                    # Renormalize the jogging probability by the top-2 mass.
                    probability[jogging_label][i] = v / (top1_v + top2_v)

            cwd = os.getcwd()
            tmp_dir = os.path.join(cwd, "tmp")
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            out_path = os.path.join(tmp_dir, "probability.npy")

            np.save(out_path, probability[jogging_label])
        if not all(self.clip_count == self.num_clips):
            logger.warning("clip count {} ~= num clips {}".format(
                self.clip_count, self.num_clips))
            logger.warning(self.clip_count)

        num_topks_correct = metrics.topks_correct(self.video_preds,
                                                  self.video_labels, ks)
        topks = [(x / self.video_preds.size(0)) * 100.0
                 for x in num_topks_correct]
        #binary = [
        #    (x / self.video_preds.size(0)) * 100.0 for x in binary_correct
        #]
        assert len({len(ks), len(topks)}) == 1
        stats = {"split": "test_final"}

        for k, topk in zip(ks, topks):
            stats["top{}_acc".format(k)] = "{:.{prec}f}".format(topk, prec=2)
Example #27
0
class AVAMeter(object):
    """
    Measure the AVA train, val, and test stats.
    """
    def __init__(self, overall_iters, cfg, mode):
        """
        overall_iters (int): the overall number of iterations of one epoch.
        cfg (CfgNode): configs.
        mode (str): `train`, `val`, or `test` mode.
        """
        self.cfg = cfg
        self.lr = None
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.full_ava_test = cfg.AVA.FULL_TEST_ON_VAL
        self.mode = mode
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        self.all_preds_train = []
        self.all_ori_boxes_train = []
        self.all_metadata_train = []
        self.all_preds = []
        self.all_ori_boxes = []
        self.all_metadata = []
        self.overall_iters = overall_iters
        self.categories, self.class_whitelist = read_labelmap(
            os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.LABEL_MAP_FILE))
        gt_filename = os.path.join(cfg.AVA.ANNOTATION_DIR,
                                   cfg.AVA.GROUNDTRUTH_FILE)
        self.full_groundtruth = read_csv(gt_filename, self.class_whitelist)
        self.mini_groundtruth = get_ava_mini_groundtruth(self.full_groundtruth)

        _, self.video_idx_to_name = ava_helper.load_image_lists(
            cfg, mode == "train")
        self.output_dir = cfg.OUTPUT_DIR

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        Log the stats.
        Args:
            cur_epoch (int): the current epoch.
            cur_iter (int): the current iteration.
        """

        if (cur_iter + 1) % self.cfg.LOG_PERIOD != 0:
            return

        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        if self.mode == "train":
            stats = {
                "_type": "{}_iter".format(self.mode),
                "cur_epoch": "{}".format(cur_epoch + 1),
                "cur_iter": "{}".format(cur_iter + 1),
                "eta": eta,
                "dt": self.iter_timer.seconds(),
                "dt_data": self.data_timer.seconds(),
                "dt_net": self.net_timer.seconds(),
                "mode": self.mode,
                "loss": self.loss.get_win_median(),
                "lr": self.lr,
            }
        elif self.mode == "val":
            stats = {
                "_type": "{}_iter".format(self.mode),
                "cur_epoch": "{}".format(cur_epoch + 1),
                "cur_iter": "{}".format(cur_iter + 1),
                "eta": eta,
                "dt": self.iter_timer.seconds(),
                "dt_data": self.data_timer.seconds(),
                "dt_net": self.net_timer.seconds(),
                "mode": self.mode,
            }
        elif self.mode == "test":
            stats = {
                "_type": "{}_iter".format(self.mode),
                "cur_iter": "{}".format(cur_iter + 1),
                "eta": eta,
                "dt": self.iter_timer.seconds(),
                "dt_data": self.data_timer.seconds(),
                "dt_net": self.net_timer.seconds(),
                "mode": self.mode,
            }
        else:
            raise NotImplementedError("Unknown mode: {}".format(self.mode))

        logging.log_json_stats(stats)

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        """
        Stop the data timer and start the network timer.
        """
        self.data_timer.pause()
        self.net_timer.reset()

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()

        self.all_preds = []
        self.all_ori_boxes = []
        self.all_metadata = []

    def update_stats(self, preds, ori_boxes, metadata, loss=None, lr=None):
        """
        Update the current stats.
        Args:
            preds (tensor): prediction embedding.
            ori_boxes (tensor): original boxes (x1, y1, x2, y2).
            metadata (tensor): metadata of the AVA data.
            loss (float): loss value.
            lr (float): learning rate.
        """
        if self.mode in ["val", "test"]:
            self.all_preds.append(preds)
            self.all_ori_boxes.append(ori_boxes)
            self.all_metadata.append(metadata)
        if self.mode in ["train"]:
            self.all_preds_train.append(preds)
            self.all_ori_boxes_train.append(ori_boxes)
            self.all_metadata_train.append(metadata)

        if loss is not None:
            self.loss.add_value(loss)
        if lr is not None:
            self.lr = lr

    def finalize_metrics(self, log=True):
        """
        Calculate and log the final AVA metrics.
        """
        all_preds = torch.cat(self.all_preds, dim=0)
        all_ori_boxes = torch.cat(self.all_ori_boxes, dim=0)
        all_metadata = torch.cat(self.all_metadata, dim=0)

        if self.mode == "test" or (self.full_ava_test and self.mode == "val"):
            groundtruth = self.full_groundtruth
        else:
            groundtruth = self.mini_groundtruth

        self.full_map = evaluate_ava(
            all_preds,
            all_ori_boxes,
            all_metadata.tolist(),
            self.class_whitelist,
            self.categories,
            groundtruth=groundtruth,
            video_idx_to_name=self.video_idx_to_name,
        )
        if log:
            stats = {"mode": self.mode, "map": self.full_map}
            logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        if self.mode in ["val", "test"]:
            self.finalize_metrics(log=False)
            stats = {
                "_type": "{}_epoch".format(self.mode),
                "cur_epoch": "{}".format(cur_epoch + 1),
                "mode": self.mode,
                "map": self.full_map,
                "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
                "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
            }
            logging.log_json_stats(stats)
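A plausible call pattern for AVAMeter above: detections are buffered batch by batch and the comparatively expensive AVA mAP is computed once at the end. The val_batches iterable and cfg are assumed from the caller:

# Hypothetical usage of AVAMeter above (cfg and val_batches are assumed
# from the caller; val_batches yields (preds, ori_boxes, metadata)).
meter = AVAMeter(overall_iters=200, cfg=cfg, mode="val")
for cur_iter, (preds, ori_boxes, metadata) in enumerate(val_batches):
    meter.iter_tic()
    meter.data_toc()   # data loading done, network time starts
    meter.update_stats(preds, ori_boxes, metadata)
    meter.iter_toc()
    meter.log_iter_stats(cur_epoch=0, cur_iter=cur_iter)
meter.finalize_metrics()  # evaluates AVA mAP over everything buffered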
Example #28
0
def benchmark_data_loading(cfg):
    """
    Benchmark the speed of data loading in PySlowFast.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    setup_environment()
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Benchmark data loading with config:")
    logger.info(pprint.pformat(cfg))

    timer = Timer()
    dataloader = loader.construct_loader(cfg, "train")
    logger.info("Initialize loader using {:.2f} seconds.".format(
        timer.seconds()))
    # Total batch size across different machines.
    batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS
    log_period = cfg.BENCHMARK.LOG_PERIOD
    epoch_times = []
    # Test for a few epochs.
    for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS):
        timer = Timer()
        timer_epoch = Timer()
        iter_times = []
        for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)):
            if cur_iter > 0 and cur_iter % log_period == 0:
                iter_times.append(timer.seconds())
                ram_usage, ram_total = misc.cpu_mem_usage()
                logger.info(
                    "Epoch {}: {} iters ({} videos) in {:.2f} seconds. "
                    "RAM Usage: {:.2f}/{:.2f} GB.".format(
                        cur_epoch,
                        log_period,
                        log_period * batch_size,
                        iter_times[-1],
                        ram_usage,
                        ram_total,
                    ))
                timer.reset()
        epoch_times.append(timer_epoch.seconds())
        ram_usage, ram_total = misc.cpu_mem_usage()
        logger.info(
            "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. "
            "RAM Usage: {:.2f}/{:.2f} GB.".format(
                cur_epoch,
                len(dataloader),
                len(dataloader) * batch_size,
                epoch_times[-1],
                ram_usage,
                ram_total,
            ))
        logger.info(
            "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} "
            "(avg/std) seconds.".format(
                cur_epoch,
                log_period,
                log_period * batch_size,
                np.mean(iter_times),
                np.std(iter_times),
            ))
    logger.info("On average every epoch ({} videos) takes {:.2f}/{:.2f} "
                "(avg/std) seconds.".format(
                    len(dataloader) * batch_size,
                    np.mean(epoch_times),
                    np.std(epoch_times),
                ))
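From the quantities this benchmark logs, throughput falls out directly: each chunk covers log_period iterations of batch_size videos. A small sketch of the derived metric, with made-up timings:

import numpy as np

# Sketch: deriving throughput (videos/sec) from the benchmark's logs.
log_period, batch_size = 50, 64  # illustrative values
iter_times = [12.4, 11.8, 12.1]  # seconds per log_period iterations

videos_per_sec = log_period * batch_size / np.mean(iter_times)
print("{:.1f} videos/sec (chunk std: {:.2f}s)".format(
    videos_per_sec, np.std(iter_times)))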
Example #29
0
class ValMeter(object):
    """
    Measures validation stats.
    """
    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        # Current minibatch errors (smoothed over a window).
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Min errors (over the full val set).
        self.min_top1_err = 100.0
        self.min_top5_err = 100.0
        # Number of misclassified examples.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.all_preds = []
        self.all_labels = []
        self.output_dir = cfg.OUTPUT_DIR

    def reset(self):
        """
        Reset the Meter.
        """
        self.iter_timer.reset()
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.all_preds = []
        self.all_labels = []

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        """
        Stop the data timer and start the network timer.
        """
        self.data_timer.pause()
        self.net_timer.reset()

    def update_stats(self, top1_err, top5_err, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            mb_size (int): mini batch size.
        """
        self.mb_top1_err.add_value(top1_err)
        self.mb_top5_err.add_value(top5_err)
        self.num_top1_mis += top1_err * mb_size
        self.num_top5_mis += top5_err * mb_size
        self.num_samples += mb_size

    def update_predictions(self, preds, labels):
        """
        Update predictions and labels.
        Args:
            preds (tensor): model output predictions.
            labels (tensor): labels.
        """
        # TODO: merge update_prediction with update_stats.
        self.all_preds.append(preds)
        self.all_labels.append(labels)

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
        }
        if not self._cfg.DATA.MULTI_LABEL:
            stats["top1_err"] = self.mb_top1_err.get_win_median()
            stats["top5_err"] = self.mb_top5_err.get_win_median()
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
        }
        if self._cfg.DATA.MULTI_LABEL:
            stats["map"] = get_map(
                torch.cat(self.all_preds).cpu().numpy(),
                torch.cat(self.all_labels).cpu().numpy(),
            )
        else:
            top1_err = self.num_top1_mis / self.num_samples
            top5_err = self.num_top5_mis / self.num_samples
            self.min_top1_err = min(self.min_top1_err, top1_err)
            self.min_top5_err = min(self.min_top5_err, top5_err)

            stats["top1_err"] = top1_err
            stats["top5_err"] = top5_err
            stats["min_top1_err"] = self.min_top1_err
            stats["min_top5_err"] = self.min_top5_err

        logging.log_json_stats(stats)
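get_map is not defined in this snippet; for multi-label data it is conventionally the mean over classes of average precision on the stacked (N, C) scores and binary labels. A sketch using scikit-learn as a stand-in:

import numpy as np
from sklearn.metrics import average_precision_score

# Sketch of a get_map-style metric (the real helper is not shown here):
# mean average precision over classes, skipping classes with no positives.
def mean_average_precision(preds: np.ndarray, labels: np.ndarray) -> float:
    aps = [
        average_precision_score(labels[:, c], preds[:, c])
        for c in range(labels.shape[1])
        if labels[:, c].any()  # AP is undefined without positives
    ]
    return float(np.mean(aps))

preds = np.array([[0.9, 0.2], [0.3, 0.8], [0.7, 0.6]])
labels = np.array([[1, 0], [0, 1], [1, 1]])
print(mean_average_precision(preds, labels))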
Example #30
0
class TrainMeter(object):
    """
    Measure training stats.
    """
    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()

        self.loss_D = ScalarMeter(cfg.LOG_PERIOD)

        self.loss_G = ScalarMeter(cfg.LOG_PERIOD)
        self.appe_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.flow_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_G_three_part = ScalarMeter(cfg.LOG_PERIOD)

        self.loss_D_total = 0.0
        # loss_G,appe_loss,flow_loss,loss_G_total
        self.loss_G_total = 0.0
        self.appe_loss_total = 0.0
        self.flow_loss_total = 0.0
        self.loss_G_three_part_total = 0.0

        self.lr_G = None
        self.lr_D = None
        # Generic (non-GAN) loss bookkeeping used by update_stats().
        self.lr = None
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_total = 0.0
        # Current minibatch errors (smoothed over a window).
        # self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        # self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Number of misclassified examples.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.num_samples_G = 0
        self.num_samples_D = 0

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.loss_total = 0.0
        self.lr = None

        self.loss_D.reset()

        self.loss_G.reset()
        self.appe_loss.reset()
        self.flow_loss.reset()
        self.loss_G_three_part.reset()

        self.loss_D_total = 0.0

        self.loss_G_total = 0.0
        self.appe_loss_total = 0.0
        self.flow_loss_total = 0.0
        self.loss_G_three_part_total = 0.0

        self.lr_G = None
        self.lr_D = None
        # self.mb_top1_err.reset()
        # self.mb_top5_err.reset()
        # self.num_top1_mis = 0
        # self.num_top5_mis = 0
        self.num_samples = 0
        self.num_samples_D = 0
        self.num_samples_G = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_err, top5_err, loss, lr, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss.add_value(loss)
        self.lr = lr
        self.loss_total += loss * mb_size
        self.num_samples += mb_size

        # if not self._cfg.DATA.MULTI_LABEL:
        #     # Current minibatch stats
        #     self.mb_top1_err.add_value(top1_err)
        #     self.mb_top5_err.add_value(top5_err)
        #     # Aggregate stats
        #     self.num_top1_mis += top1_err * mb_size
        #     self.num_top5_mis += top5_err * mb_size

    def update_stats_G(self, loss_G, appe_loss, flow_loss, loss_G_three_part,
                       lr, mb_size):
        """
        Update the current stats.
        Args:

            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss_G.add_value(loss_G)
        self.appe_loss.add_value(appe_loss)
        self.flow_loss.add_value(flow_loss)
        self.loss_G_three_part.add_value(loss_G_three_part)
        self.lr_G = lr
        # self.loss_total_G+= loss * mb_size
        self.loss_G_total += loss_G * mb_size
        self.appe_loss_total += appe_loss * mb_size
        self.flow_loss_total += flow_loss * mb_size
        self.loss_G_three_part_total += loss_G_three_part * mb_size

        self.num_samples_G += mb_size

    def update_stats_D(self, loss_D, lr, mb_size):
        """
        Update the current stats of D .
        Args:

            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss_D.add_value(loss_D)
        self.lr_D = lr
        self.loss_D_total += loss_D * mb_size
        self.num_samples_D += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter, mode):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))

        # stats in D or G
        if mode in ["D", "Discriminator"]:
            stats = {
                "_type": "train_iter",
                "epoch": "{}/{}".format(cur_epoch + 1,
                                        self._cfg.SOLVER.MAX_EPOCH),
                "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
                "time_diff": self.iter_timer.seconds(),
                "eta": eta,
                "loss_D": self.loss_D.get_win_median(),
                "lr_D": self.lr_D,
                "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
            }
        elif mode in ["G", "Generator"]:
            stats = {
                "_type": "train_iter",
                "epoch": "{}/{}".format(cur_epoch + 1,
                                        self._cfg.SOLVER.MAX_EPOCH),
                "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
                "time_diff": self.iter_timer.seconds(),
                "eta": eta,
                "loss_G": self.loss_G.get_win_median(),
                "appe_loss": self.appe_loss.get_win_median(),
                "flow_loss": self.flow_loss.get_win_median(),
                "three_part_loss_G": self.loss_G_three_part.get_win_median(),
                "lr_G": self.lr_G,
                "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
            }

        else:
            raise NotImplementedError("Unknown mode: {}".format(mode))
        logging.log_json_stats(stats)

        # stats = {
        #     "_type": "train_iter",
        #     "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
        #     "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
        #     "time_diff": self.iter_timer.seconds(),
        #     "eta": eta,
        #
        #     "loss": self.loss.get_win_median(),
        #     "lr": self.lr,
        #     "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
        # }
        # if not self._cfg.DATA.MULTI_LABEL:
        #     stats["top1_err"] = self.mb_top1_err.get_win_median()
        #     stats["top5_err"] = self.mb_top5_err.get_win_median()

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        # stats in G or D
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "lr_D": self.lr_D,
            "loss_D": self.loss_D_total / self.num_samples_D,
            "lr_G": self.lr_G,
            "loss_G": self.loss_G_total / self.num_samples_G,
            "appe_loss": self.appe_loss_total / self.num_samples_G,
            "flow_loss": self.flow_loss_total / self.num_samples_G,
            "total_G_loss": self.loss_G_three_part_total / self.num_samples_G,
            "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f} GB".format(*misc.cpu_mem_usage()),
        }

        # avg_loss = self.loss_total_D / self.num_samples_D
        # stats["loss_D"] = avg_loss

        # stats = {
        #     "_type": "train_epoch",
        #     "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
        #     "time_diff": self.iter_timer.seconds(),
        #     "eta": eta,
        #     "lr": self.lr,
        #     "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
        #     "RAM": "{:.2f}/{:.2f} GB".format(*misc.cpu_mem_usage()),
        # }
        # if not self._cfg.DATA.MULTI_LABEL:
        #     top1_err = self.num_top1_mis / self.num_samples
        #     top5_err = self.num_top5_mis / self.num_samples
        #     avg_loss = self.loss_total / self.num_samples
        #     stats["top1_err"] = top1_err
        #     stats["top5_err"] = top5_err
        #     stats["loss"] = avg_loss
        logging.log_json_stats(stats)
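A plausible per-iteration call pattern for the GAN meter above: discriminator and generator keep separate running totals and sample counts, so they can be updated on different schedules. cfg and the loss values are illustrative:

# Hypothetical usage of the GAN TrainMeter above (cfg is assumed from the
# caller; loss values are illustrative).
meter = TrainMeter(epoch_iters=500, cfg=cfg)
meter.iter_tic()
# ... one discriminator step ...
meter.update_stats_D(loss_D=0.69, lr=2e-4, mb_size=16)
# ... one generator step ...
meter.update_stats_G(loss_G=1.10, appe_loss=0.40, flow_loss=0.30,
                     loss_G_three_part=1.80, lr=2e-4, mb_size=16)
meter.iter_toc()
meter.log_iter_stats(cur_epoch=0, cur_iter=9, mode="G")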