Example #1
    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if not all(self.clip_count == self.num_clips):
            logger.warning(
                "clip count {} ~= num clips {}".format(
                    self.clip_count, self.num_clips
                )
            )
            logger.warning(self.clip_count)

        num_topks_correct = metrics.topks_correct(
            self.video_preds, self.video_labels, ks
        )
        topks = [
            (x / self.video_preds.size(0)) * 100.0 for x in num_topks_correct
        ]
        assert len({len(ks), len(topks)}) == 1
        stats = {"split": "test_final"}
        for k, topk in zip(ks, topks):
            stats["top{}_acc".format(k)] = "{:.{prec}f}".format(topk, prec=2)
        logging.log_json_stats(stats)
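All of these finalize_metrics variants lean on metrics.topks_correct to turn the ensembled video predictions into per-k correct counts. As a hedged sketch of what such a helper computes (not necessarily SlowFast's exact implementation), the counts can be derived with torch.topk:

import torch

def topks_correct_sketch(preds, labels, ks):
    """Rough equivalent of metrics.topks_correct: for each k in `ks`, count
    how many samples have the true label among the top-k scores.
    preds: (N, C) scores, labels: (N,) integer class ids."""
    max_k = max(ks)
    # Indices of the top max_k classes per sample, transposed to (max_k, N).
    _, top_idx = preds.topk(max_k, dim=1, largest=True, sorted=True)
    top_idx = top_idx.t()
    # Compare every retained index against the ground-truth label.
    correct = top_idx.eq(labels.view(1, -1).expand_as(top_idx))
    # For each k, count samples whose label appears within the first k rows.
    return [correct[:k].reshape(-1).float().sum() for k in ks]

# Usage mirroring Example #1: convert counts to percentages.
preds = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
topks = [(x / preds.size(0)) * 100.0
         for x in topks_correct_sketch(preds, labels, (1, 5))]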
Example #2
    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if not all(self.clip_count == self.num_clips):
            logger.warning("clip count {} ~= num clips {}".format(
                ", ".join([
                    "{}: {}".format(i, k)
                    for i, k in enumerate(self.clip_count.tolist())
                ]),
                self.num_clips,
            ))

        self.stats = {"split": "test_final"}
        if self.multi_label:
            map = get_map(self.video_preds.cpu().numpy(),
                          self.video_labels.cpu().numpy())
            self.stats["map"] = map
        else:
            num_topks_correct = metrics.topks_correct(self.video_preds,
                                                      self.video_labels, ks)
            topks = [(x / self.video_preds.size(0)) * 100.0
                     for x in num_topks_correct]
            assert len({len(ks), len(topks)}) == 1
            for k, topk in zip(ks, topks):
                self.stats["top{}_acc".format(k)] = "{:.{prec}f}".format(
                    topk, prec=2)
        logging.log_json_stats(self.stats)
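The multi-label branch above calls get_map to score the ensembled predictions. A minimal sketch of such a mean-average-precision helper, assuming scikit-learn is available and that classes without any positive label are skipped, might look like this:

import numpy as np
from sklearn.metrics import average_precision_score

def get_map_sketch(preds, labels):
    """Rough stand-in for get_map: mean average precision over classes.
    preds: (N, C) score matrix, labels: (N, C) binary ground-truth matrix."""
    aps = []
    for c in range(labels.shape[1]):
        if labels[:, c].sum() == 0:
            continue  # AP is undefined for a class with no positives.
        aps.append(average_precision_score(labels[:, c], preds[:, c]))
    return float(np.mean(aps)) if aps else 0.0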
Example #3
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ClevrerValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, sampled_batch in enumerate(val_loader):
        video_ft = sampled_batch['res_ft']
        des_q = sampled_batch['question_dict']['question']
        attn_masks = sampled_batch['question_dict']['attention_mask']
        des_ans = sampled_batch['question_dict']['ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            video_ft = video_ft.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            attn_masks = attn_masks.cuda(non_blocking=True)
            des_ans = des_ans.cuda()

        val_meter.data_toc()

        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        pred_des_ans = model(video_ft, des_q, attn_masks)
        loss_des_val = des_loss_fun(pred_des_ans, des_ans)

        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        # Combine the errors across the GPUs.
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        loss_mc_val = None
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                            top1_err.item(), top5_err.item())

        val_meter.iter_toc()
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, mb_size_des, mb_size_mc
        # Update and log stats.
        val_meter.update_stats(top1_err, top5_err, mc_opt_err,
                               mc_q_err, loss_des_val, loss_mc_val,
                               des_ans.size(0), mb_size_mc)
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
Example #4
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, _) in enumerate(val_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)

        labels = labels.cuda()

        # Compute the predictions.
        preds, _ = model(inputs)
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))

        # Combine the errors across the GPUs.
        top1_err, top5_err = [
            (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
        ]
        if cfg.NUM_GPUS > 1:
            top1_err, top5_err = du.all_reduce([top1_err, top5_err])

        # Copy the errors from GPU to CPU (sync point).
        top1_err, top5_err = top1_err.item(), top5_err.item()
        val_meter.iter_toc()
        # Update and log stats.
        val_meter.update_stats(
            top1_err, top5_err, inputs[0].size(0) * cfg.NUM_GPUS
        )
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
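Both eval_epoch variants average the per-GPU error tensors with du.all_reduce before converting them to Python floats. A minimal sketch of that kind of helper, assuming torch.distributed has already been initialized with one process per GPU:

import torch
import torch.distributed as dist

def all_reduce_sketch(tensors, average=True):
    """Rough stand-in for du.all_reduce: sum each tensor across all
    processes in the default group and optionally average the result."""
    world_size = dist.get_world_size()
    for tensor in tensors:
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        if average:
            tensor.div_(world_size)
    return tensors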
Example #5
    def finalize_metrics(self, ks=(1, 2)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if self.isDemo:
            preds_numpy = self.video_preds.clone()
            normalize = np.array(softmax(preds_numpy.cpu().numpy()))
            jogging_label = 21
            sort_p = []
            for p in normalize:
                sort_p.append(sorted(p, reverse=True))

            propability = np.transpose(
                np.array(softmax(preds_numpy.cpu().numpy())))

            for i, v in enumerate(propability[jogging_label]):
                top1_v = sort_p[i][0]
                top2_v = sort_p[i][1]
                if v == top1_v or v == top2_v:
                    propability[jogging_label][
                        i] = propability[jogging_label][i] / (top1_v + top2_v)

            cwd = os.getcwd()
            tmp_dir = os.path.join(cwd, "tmp")
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            out_dir = os.path.join(tmp_dir, "probability.npy")

            np.save(out_dir, propability[jogging_label])
        if not all(self.clip_count == self.num_clips):
            logger.warning("clip count {} ~= num clips {}".format(
                self.clip_count, self.num_clips))
            logger.warning(self.clip_count)

        num_topks_correct = metrics.topks_correct(self.video_preds,
                                                  self.video_labels, ks)
        topks = [(x / self.video_preds.size(0)) * 100.0
                 for x in num_topks_correct]
        #binary = [
        #    (x / self.video_preds.size(0)) * 100.0 for x in binary_correct
        #]
        assert len({len(ks), len(topks)}) == 1
        stats = {"split": "test_final"}

        for k, topk in zip(ks, topks):
            stats["top{}_acc".format(k)] = "{:.{prec}f}".format(topk, prec=2)
Example #6
    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        clip_check = self.clip_count == self.num_clips
        if not all(clip_check):
            logger.warning(
                "clip count Ids={} = {} (should be {})".format(
                    np.argwhere(~clip_check),
                    self.clip_count[~clip_check],
                    self.num_clips,
                )
            )

        self.stats = {"split": "test_final"}
        if self.multi_label:
            mean_ap = get_map(
                self.video_preds.cpu().numpy(), self.video_labels.cpu().numpy()
            )
            map_str = "{:.{prec}f}".format(mean_ap * 100.0, prec=2)
            self.stats["map"] = map_str
            self.stats["top1_acc"] = map_str
            self.stats["top5_acc"] = map_str
        else:
            num_topks_correct = metrics.topks_correct(
                self.video_preds, self.video_labels, ks
            )
            topks = [
                (x / self.video_preds.size(0)) * 100.0
                for x in num_topks_correct
            ]
            assert len({len(ks), len(topks)}) == 1
            for k, topk in zip(ks, topks):
                # self.stats["top{}_acc".format(k)] = topk.cpu().numpy()
                self.stats["top{}_acc".format(k)] = "{:.{prec}f}".format(
                    topk, prec=2
                )
        logging.log_json_stats(self.stats)
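Each variant finishes by handing the stats dict to logging.log_json_stats. Conceptually that call just serializes the dict and emits it through the logger, along the lines of this sketch (assuming the values are JSON-serializable; SlowFast's own implementation may differ):

import json
import logging as py_logging  # stdlib logging, to avoid clashing with slowfast's wrapper

logger = py_logging.getLogger(__name__)

def log_json_stats_sketch(stats):
    """Emit a stats dict such as {"split": "test_final", "top1_acc": "75.00"}
    as a single JSON-formatted log line."""
    logger.info("json_stats: %s", json.dumps(stats, sort_keys=True))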
Example #7
def eval_epoch(val_dloader, model, cur_epoch, cfg):
    model.eval()
    results = defaultdict(list)
    for cur_iter, (inputs, labels, _,
                   extra_data) in enumerate(tqdm(val_dloader, ncols=80)):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in extra_data.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    extra_data[key] = val.cuda(non_blocking=True)
        with torch.no_grad():
            if cfg.DETECTION.ENABLE:
                preds = model(inputs, extra_data["boxes"])
            else:
                preds = model(inputs)
        top1_tensor, top5_tensor = metrics.topks_correct(preds, labels, (1, 5))
        if cfg.NUM_GPUS:
            top1_tensor, top5_tensor = top1_tensor.cpu(), top5_tensor.cpu()
        results['top1'] += top1_tensor.tolist()
        results['top5'] += top5_tensor.tolist()
    print_str = "epoch: {} ".format(cur_epoch)
    for key in results:
        results[key] = sum(results[key]) / len(results[key])
        print_str += "{}: {} ".format(key, results[key])
    print(print_str)
    with open(os.path.join(cfg.OUTPUT_DIR, 'res_out.txt'), 'a') as f:
        f.write("epoch: " + str(cur_epoch) + " " + print_str + "\n")
    return results
Example #8
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        if isinstance(labels, (dict, )):
            labels = {k: v.cuda() for k, v in labels.items()}
        else:
            labels = labels.cuda()

        if cfg.DETECTION.ENABLE:
            logger.info("Detection Metadata: {}".format(meta))
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

            preds = preds.cpu()
            ori_boxes = meta["ori_boxes"].cpu()
            metadata = meta["metadata"].cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                   metadata.cpu())
        else:
            preds = model(inputs)
            if isinstance(labels, (dict, )):
                # Compute the verb accuracies.
                verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                    preds[0], labels['verb'], (1, 5))

                # Combine the errors across the GPUs.
                if cfg.NUM_GPUS > 1:
                    verb_top1_acc, verb_top5_acc = du.all_reduce(
                        [verb_top1_acc, verb_top5_acc])

                # Copy the errors from GPU to CPU (sync point).
                verb_top1_acc, verb_top5_acc = (verb_top1_acc.item(),
                                                verb_top5_acc.item())

                # Compute the noun accuracies.
                noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                    preds[1], labels['noun'], (1, 5))

                # Combine the errors across the GPUs.
                if cfg.NUM_GPUS > 1:
                    noun_top1_acc, noun_top5_acc = du.all_reduce(
                        [noun_top1_acc, noun_top5_acc])

                # Copy the errors from GPU to CPU (sync point).
                noun_top1_acc, noun_top5_acc = (noun_top1_acc.item(),
                                                noun_top5_acc.item())

                # Compute the action accuracies.
                action_top1_acc, action_top5_acc = metrics.multitask_topk_accuracies(
                    (preds[0], preds[1]), (labels['verb'], labels['noun']),
                    (1, 5))
                # Combine the errors across the GPUs.
                if cfg.NUM_GPUS > 1:
                    action_top1_acc, action_top5_acc = du.all_reduce(
                        [action_top1_acc, action_top5_acc])

                # Copy the errors from GPU to CPU (sync point).
                action_top1_acc, action_top5_acc = (action_top1_acc.item(),
                                                    action_top5_acc.item())

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    (verb_top1_acc, noun_top1_acc, action_top1_acc),
                    (verb_top5_acc, noun_top5_acc, action_top5_acc),
                    inputs[0].size(0) * cfg.NUM_GPUS)
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(top1_err, top5_err,
                                       inputs[0].size(0) * cfg.NUM_GPUS)
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()
    # Log epoch stats.
    is_best_epoch = val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
    return is_best_epoch
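For dict-style labels the loop above scores verbs and nouns separately and then calls metrics.multitask_topk_accuracies for the joint action accuracy. One common definition, sketched here rather than copied from the library, counts a sample as correct at k only if every task's true label falls inside that task's top-k:

import torch

def multitask_topk_accuracies_sketch(preds, labels, ks):
    """Sketch of a multitask top-k accuracy.
    preds: tuple of (N, C_t) score tensors, labels: tuple of (N,) id tensors."""
    num_samples = preds[0].size(0)
    accuracies = []
    for k in ks:
        joint_correct = torch.ones(
            num_samples, dtype=torch.bool, device=preds[0].device)
        for task_preds, task_labels in zip(preds, labels):
            _, top_idx = task_preds.topk(k, dim=1)
            task_correct = (top_idx == task_labels.view(-1, 1)).any(dim=1)
            joint_correct &= task_correct
        accuracies.append(joint_correct.float().mean() * 100.0)
    return accuracies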
Example #9
def train_epoch(train_loader,
                model,
                optimizer,
                scaler,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    if cfg.MIXUP.ENABLE:
        mixup_fn = MixUp(
            mixup_alpha=cfg.MIXUP.ALPHA,
            cutmix_alpha=cfg.MIXUP.CUTMIX_ALPHA,
            mix_prob=cfg.MIXUP.PROB,
            switch_prob=cfg.MIXUP.SWITCH_PROB,
            label_smoothing=cfg.MIXUP.LABEL_SMOOTH_VALUE,
            num_classes=cfg.MODEL.NUM_CLASSES,
        )

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()
        if cfg.MIXUP.ENABLE:
            samples, labels = mixup_fn(inputs[0], labels)
            inputs[0] = samples

        with torch.cuda.amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):
            if cfg.DETECTION.ENABLE:
                preds = model(inputs, meta["boxes"])
            else:
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Clip gradients if necessary
        if cfg.SOLVER.CLIP_GRAD_VAL:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            cfg.SOLVER.CLIP_GRAD_VAL)
        elif cfg.SOLVER.CLIP_GRAD_L2NORM:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.SOLVER.CLIP_GRAD_L2NORM)
        # Update the parameters.
        scaler.step(optimizer)
        scaler.update()

        if cfg.MIXUP.ENABLE:
            _top_max_k_vals, top_max_k_inds = torch.topk(labels,
                                                         2,
                                                         dim=1,
                                                         largest=True,
                                                         sorted=True)
            idx_top1 = torch.arange(labels.shape[0]), top_max_k_inds[:, 0]
            idx_top2 = torch.arange(labels.shape[0]), top_max_k_inds[:, 1]
            preds = preds.detach()
            preds[idx_top1] += preds[idx_top2]
            preds[idx_top2] = 0.0
            labels = top_max_k_inds[:, 0]

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
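Example #9 runs the forward pass under torch.cuda.amp.autocast and routes the backward pass through a GradScaler, unscaling before gradient clipping. Stripped of the meters and config plumbing, the core mixed-precision update looks roughly like this (a sketch assuming scaler is a torch.cuda.amp.GradScaler):

import torch

def amp_train_step_sketch(model, inputs, labels, loss_fun, optimizer, scaler,
                          clip_grad_norm=None):
    """One mixed-precision step with optional gradient clipping, mirroring
    the structure of the train_epoch above."""
    with torch.cuda.amp.autocast():
        preds = model(inputs)
        loss = loss_fun(preds, labels)
    optimizer.zero_grad()
    scaler.scale(loss).backward()
    # Unscale first so the clipping threshold applies to the true gradients.
    scaler.unscale_(optimizer)
    if clip_grad_norm is not None:
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
    scaler.step(optimizer)
    scaler.update()
    return loss.detach()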
Example #10
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, writer,
                nep, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    global_iters = data_size * cur_epoch
    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)

        else:
            inputs = inputs.cuda(non_blocking=True)

        if len(inputs[i].shape) > 5:

            labels = torch.repeat_interleave(labels, inputs[i].size(1), 0)

        for i in range(len(inputs)):
            if len(inputs[i].shape) > 5:

                inputs[i] = inputs[i].view((-1, ) + inputs[i].shape[2:])

        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                global_iters, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            if 'masks' in meta:
                preds = model((inputs, meta['masks']))
            else:
                preds = model(inputs)

        ####################################################################################################################################
        # check activations
        ####################################################################################################################################
        # if writer is not None and global_iters%cfg.SUMMARY_PERIOD==0:

        #     bu_errors = preds['bu_errors']#.cpu()#.data.numpy().squeeze()

        #     for layer in range(len(bu_errors)):
        #         images = bu_errors[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])

        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/bu_error_l%d'%layer, grid, global_iters)

        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_bu_errors_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     mix_out = preds['mix_layer']#.cpu().data.numpy().squeeze()
        #     for layer in range(len(mix_out)):

        #         images = mix_out[layer].transpose(1,2).transpose(0,1)
        #         images = images.reshape((-1,) + images.shape[2:])
        #         images = (images-images.min())
        #         images = images/images.max()
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/mix_layer_l%d'%layer, grid, global_iters)
        #         # tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'example_%d_mix_layer_l%d.jpg'%(i,layer)), nrow=18, normalize=True)

        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_mix_layer_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     inhibition = preds['H_inh']#.cpu()#.data.numpy().squeeze()
        #     for layer in range(len(inhibition)):
        #         images = inhibition[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/H_inh_l%d'%layer, grid, global_iters)
        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_H_inh_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     hidden = preds['hidden']#.cpu()#.data.numpy().squeeze()
        #     for layer in range(len(hidden)):
        #         images = hidden[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/hidden_l%d'%layer, grid, global_iters)
        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_hidden_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        out_keys = preds.keys()
        total_loss = 0

        if cfg.PREDICTIVE.ENABLE:

            errors = preds['pred_errors']
            if 'frame_errors' in preds:
                frame_errors = preds['frame_errors']

            if 'IoU' in preds:
                iou = preds['IoU']
            if 'Acc' in preds:
                acc = preds['Acc']

            pred_loss = errors.mean()
            total_loss += pred_loss

            # if 'frame_errors' in out_keys:
            #     total_loss += frame_errors
            # copy_baseline = F.smooth_l1_loss(inputs[i][:,:,1:] - inputs[i][:,:,:-1], torch.zeros_like(inputs[i][:,:,1:]))
            # copy_baseline = F.l1_loss(inputs[i][:,:,1:] - inputs[i][:,:,:-1], torch.zeros_like(inputs[i][:,:,1:]))

        if cfg.PREDICTIVE.CPC:
            cpc_loss = preds['cpc_loss']
            total_loss += cpc_loss

        if 'cbp_penalty' in preds:
            penalty = preds['cbp_penalty']
            total_loss += penalty

        if cfg.SUPERVISED:
            preds = preds['logits']

            if cfg.MODEL.LOSS_FUNC != '':
                # Explicitly declare reduction to mean.
                loss_fun = losses.get_loss_func(
                    cfg.MODEL.LOSS_FUNC)(reduction="mean")

                # Compute the loss.
                loss = loss_fun(preds, labels)

                total_loss += loss

        # check Nan Loss.
        misc.check_nan_losses(total_loss)

        # Perform the backward pass.
        optimizer.zero_grad()

        total_loss.backward()

        ####################################################################################################################################
        # check gradients
        if writer is not None and global_iters % cfg.SUMMARY_PERIOD == 0:
            n_p = model.module.named_parameters() if hasattr(
                model, 'module') else model.named_parameters()
            fig = viz_helpers.plot_grad_flow_v2(n_p)
            writer.add_figure('grad_flow/grad_flow', fig, global_iters)
        ####################################################################################################################################

        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS,
                                     loss=loss)
        else:
            if cfg.SUPERVISED:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:

                if cfg.PREDICTIVE.ENABLE:
                    pred_loss = du.all_reduce([pred_loss])
                    pred_loss = pred_loss[0]
                    if 'frame_errors' in out_keys:
                        frame_errors = du.all_reduce([frame_errors])[0]
                    if 'IoU' in preds:
                        iou = du.all_reduce([iou])[0]
                    if 'Acc' in preds:
                        acc = du.all_reduce([acc])[0]
                    # copy_baseline = du.all_reduce([copy_baseline])
                    # copy_baseline = copy_baseline[0]

                if cfg.PREDICTIVE.CPC:
                    cpc_loss = du.all_reduce([cpc_loss])
                    cpc_loss = cpc_loss[0]
                if cfg.SUPERVISED:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                if 'cbp_penalty' in out_keys:
                    penalty = du.all_reduce([penalty])[0]

            loss_logs = {}
            if cfg.PREDICTIVE.ENABLE:
                pred_loss = pred_loss.item()
                loss_logs['loss_pred'] = pred_loss
                if 'frame_errors' in out_keys:
                    frame_errors = frame_errors.item()
                    loss_logs['frame_errors'] = frame_errors

                if 'IoU' in preds:
                    loss_logs['IoU'] = iou.item()
                if 'Acc' in preds:
                    loss_logs['Acc'] = acc.item()
                # copy_baseline = copy_baseline.item()
                # loss_logs['copy_comp'] = copy_baseline
            if cfg.PREDICTIVE.CPC:
                cpc_loss = cpc_loss.item()
                loss_logs['loss_cpc'] = cpc_loss

            if cfg.SUPERVISED:
                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                loss_logs['loss_class'] = loss
                loss_logs['top5_err'] = top5_err
                loss_logs['top1_err'] = top1_err

            if 'cbp_penalty' in out_keys:
                loss_logs['cbp_penalty'] = penalty.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(lr, inputs[0].size(0) * cfg.NUM_GPUS,
                                     **loss_logs)

            if writer is not None and global_iters % cfg.LOG_PERIOD == 0:
                for k, v in loss_logs.items():
                    writer.add_scalar('loss/' + k.strip('loss_'),
                                      train_meter.stats[k].get_win_median(),
                                      global_iters)
            if nep is not None and global_iters % cfg.LOG_PERIOD == 0:
                for k, v in loss_logs.items():
                    nep.log_metric(k.strip('loss_'),
                                   train_meter.stats[k].get_win_median())

                nep.log_metric('global_iters', global_iters)

                # writer.add_scalar('loss/top1_err', train_meter.mb_top1_err.get_win_median(), global_iters)
                # writer.add_scalar('loss/top5_err', train_meter.mb_top5_err.get_win_median(), global_iters)
                # writer.add_scalar('loss/loss', train_meter.loss.get_win_median(), global_iters)
            if (global_iters % cfg.SUMMARY_PERIOD == 0 and du.get_rank() == 0
                    and du.is_master_proc(num_gpus=cfg.NUM_GPUS)):

                with torch.no_grad():
                    # logger.info(inputs[i].shape)
                    # sys.stdout.flush()
                    inputs[0] = inputs[0][:min(3, len(inputs[0]))]
                    if 'masks' in meta:
                        frames = model(
                            (inputs, meta['masks'][:min(3, len(inputs[0]))]),
                            extra=['frames'])['frames']
                    else:
                        frames = model(inputs, extra=['frames'])['frames']

                    n_rows = inputs[0].size(2) - 1

                    inputs = inputs[0].transpose(1, 2)[:, -n_rows:]
                    frames = frames.transpose(1, 2)[:, -n_rows:]

                    inputs = inputs * inputs.new(
                        cfg.DATA.STD)[None, None, :, None, None] + inputs.new(
                            cfg.DATA.MEAN)[None, None, :, None, None]
                    frames = frames * frames.new(
                        cfg.DATA.STD)[None, None, :, None, None] + frames.new(
                            cfg.DATA.MEAN)[None, None, :, None, None]
                    images = torch.cat([inputs, frames],
                                       1).reshape((-1, ) + inputs.shape[2:])

                # grid = tv.utils.make_grid(images, nrow=8, normalize=True)
                # writer.add_image('predictions', images, global_iters)

                tv.utils.save_image(
                    images,
                    os.path.join(cfg.OUTPUT_DIR,
                                 'preds_%d.jpg' % global_iters),
                    nrow=n_rows,
                    normalize=True)

                # del images
                # del frames
                # del inputs

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

        global_iters += 1

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
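The gradient-flow check in Example #10 passes model.named_parameters() to viz_helpers.plot_grad_flow_v2 after the backward pass. Without reproducing that plotting helper (it belongs to that project's viz_helpers module), the kind of data it visualizes can be gathered like this sketch:

def grad_flow_summary_sketch(named_parameters):
    """Collect the mean absolute gradient per trainable parameter after
    loss.backward(), the raw material for a gradient-flow plot."""
    stats = {}
    for name, param in named_parameters:
        if param.requires_grad and param.grad is not None:
            stats[name] = param.grad.detach().abs().mean().item()
    return stats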
Example #11
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

            preds = preds.cpu()
            ori_boxes = meta["ori_boxes"].cpu()
            metadata = meta["metadata"].cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                   metadata.cpu())
        else:
            preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
                val_meter.update_predictions(preds, labels)
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(top1_err, top5_err,
                                       inputs[0].size(0) * cfg.NUM_GPUS)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
Example #12
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None,
                wandb_log=False):
    """
    Perform the audio training for one epoch.
    Args:
        train_loader (loader): audio training loader.
        model (model): the audio model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard log.
    """
    # Enable train mode.
    model.train()
    if cfg.BN.FREEZE:
        if cfg.NUM_GPUS > 1:
            model.module.freeze_fn('bn_statistics')
        else:
            model.freeze_fn('bn_statistics')

    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if isinstance(labels, (dict, )):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # Run the forward pass once; the model returns the final logits and
        # the output of an intermediate linear layer.
        outputs = model(inputs)
        preds = outputs[0]  # the original output, the output of the last layer
        linear_layer_output = outputs[1]

        if isinstance(labels, (dict, )):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)

            # check Nan Loss.
            misc.check_nan_losses(loss)
        else:
            #I believe this is the VGG loss part, as the labels are not split into nouns and verbs

            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Embedding loss function.
            emb_loss_fun = losses.get_loss_func(
                cfg.MODEL.EMB_LOSS_FUNC)(reduction="mean")

            # Compute the loss for the main model.
            loss = loss_fun(preds, labels)

            # Compute the loss for the embeddings.
            emb_loss = emb_loss_fun(linear_layer_output, word_embedding)

            # Add the losses together- use embeddings to fine tune the model's objective
            loss = loss + emb_loss

            # check Nan Loss.
            misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if isinstance(labels, (dict, )):
            # Compute the verb accuracies.
            verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                preds[0], labels['verb'], (1, 5))

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                    [loss_verb, verb_top1_acc, verb_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss_verb, verb_top1_acc, verb_top5_acc = (
                loss_verb.item(),
                verb_top1_acc.item(),
                verb_top5_acc.item(),
            )

            # Compute the noun accuracies.
            noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                preds[1], labels['noun'], (1, 5))

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                    [loss_noun, noun_top1_acc, noun_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss_noun, noun_top1_acc, noun_top5_acc = (
                loss_noun.item(),
                noun_top1_acc.item(),
                noun_top5_acc.item(),
            )

            # Compute the action accuracies.
            action_top1_acc, action_top5_acc = metrics.multitask_topk_accuracies(
                (preds[0], preds[1]), (labels['verb'], labels['noun']), (1, 5))
            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss, action_top1_acc, action_top5_acc = du.all_reduce(
                    [loss, action_top1_acc, action_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss, action_top1_acc, action_top5_acc = (
                loss.item(),
                action_top1_acc.item(),
                action_top5_acc.item(),
            )

            # Update and log stats.
            train_meter.update_stats(
                (verb_top1_acc, noun_top1_acc, action_top1_acc),
                (verb_top5_acc, noun_top5_acc, action_top5_acc),
                (loss_verb, loss_noun, loss),
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_acc": action_top1_acc,
                        "Train/Top5_acc": action_top5_acc,
                        "Train/verb/loss": loss_verb,
                        "Train/noun/loss": loss_noun,
                        "Train/verb/Top1_acc": verb_top1_acc,
                        "Train/verb/Top5_acc": verb_top5_acc,
                        "Train/noun/Top1_acc": noun_top1_acc,
                        "Train/noun/Top5_acc": noun_top5_acc,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_acc": action_top1_acc,
                        "Train/Top5_acc": action_top5_acc,
                        "Train/verb/loss": loss_verb,
                        "Train/noun/loss": loss_noun,
                        "Train/verb/Top1_acc": verb_top1_acc,
                        "Train/verb/Top5_acc": verb_top5_acc,
                        "Train/noun/Top1_acc": noun_top1_acc,
                        "Train/noun/Top5_acc": noun_top5_acc,
                        "train_step": data_size * cur_epoch + cur_iter,
                    }, )
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                        "train_step": data_size * cur_epoch + cur_iter,
                    }, )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
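In the non-dict branch of this audio train_epoch, the classification loss is combined with an embedding loss computed against a word_embedding tensor defined elsewhere in that project. Schematically the combined objective looks like the sketch below, where cross-entropy plus mean-squared-error is an assumption for illustration (the snippet actually reads both loss functions from cfg.MODEL):

import torch.nn as nn

ce_loss_fun = nn.CrossEntropyLoss(reduction="mean")   # stands in for LOSS_FUNC
emb_loss_fun = nn.MSELoss(reduction="mean")           # stands in for EMB_LOSS_FUNC

def combined_loss_sketch(preds, labels, linear_layer_output, word_embedding):
    """Classification loss plus an auxiliary embedding-regression loss."""
    loss = ce_loss_fun(preds, labels)
    emb_loss = emb_loss_fun(linear_layer_output, word_embedding)
    return loss + emb_loss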
Example #13
def train_epoch(
    train_loader, model, optimizer, train_meter, cur_epoch, cfg, test_imp=False
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    test_counter = 0
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    
    for cur_iter, sampled_batch in enumerate(train_loader): 
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # des_len = sampled_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(frames, (list,)):
                for i in range(len(frames)):
                    frames[i] = frames[i].cuda(non_blocking=True)
            else:
                frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            # des_len = des_len.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        #Separated batches
        #Des
        pred_des_ans = model(frames, des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #Save for stats
        loss_des_val = loss

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans, (1, 5))
        top1_err, top5_err = [
            (1.0 - x / pred_des_ans.size(0)) * 100.0 for x in num_topks_correct
        ]
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (
            loss_des_val.item(),
            top1_err.item(),
            top5_err.item()
        )
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss_des_val,
            None,
            lr,
            des_q.size()[0],
            mb_size_mc
        )
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()


        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            test_counter += 1
            if test_counter == 1: 
                break

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #14
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer=None):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ClevrerValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard log.
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, sampled_batch in enumerate(val_loader):
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['des_q']
        des_ans = sampled_batch['question_dict']['des_ans']
        mc_q = sampled_batch['question_dict']['mc_q']
        mc_ans = sampled_batch['question_dict']['mc_ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            mc_q = mc_q.cuda(non_blocking=True)
            mc_ans = mc_ans.cuda()

        val_meter.data_toc()

        pred_des_ans = model(frames, des_q, True)
        pred_mc_ans = model(frames, mc_q, False)

        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        # Combine the errors across the GPUs.
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        diff_mc_ans = torch.abs(
            mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
        mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                             (4 * des_q.size()[0]))
        mc_q_err = 100 * torch.true_divide(
            (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
            des_q.size()[0])

        if cfg.NUM_GPUS > 1:
            top1_err, top5_err, mc_opt_err, mc_q_err = du.all_reduce(
                [top1_err, top5_err, mc_opt_err, mc_q_err])
        # Copy the errors from GPU to CPU (sync point).
        top1_err, top5_err, mc_opt_err, mc_q_err = (top1_err.item(),
                                                    top5_err.item(),
                                                    mc_opt_err.item(),
                                                    mc_q_err.item())

        val_meter.iter_toc()

        # Update and log stats.
        val_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            frames.size()[0] * max(
                cfg.NUM_GPUS, 1
            ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
        )
        # write to tensorboard format if available.
        if writer is not None:
            writer.add_scalars(
                {
                    "Val/Top1_err": top1_err,
                    "Val/Top5_err": top5_err
                },
                global_step=len(val_loader) * cur_epoch + cur_iter,
            )

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
        all_labels = [label.clone().detach() for label in val_meter.all_labels]
        if cfg.NUM_GPUS:
            all_preds = [pred.cpu() for pred in all_preds]
            all_labels = [label.cpu() for label in all_labels]
        writer.plot_eval(preds=all_preds,
                         labels=all_labels,
                         global_step=cur_epoch)

    val_meter.reset()
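
# Illustrative only: how the per-option (mc_opt_err) and per-question (mc_q_err) multiple-choice
# errors computed above behave on a tiny made-up batch of 2 questions with 4 options each.
import torch

mc_ans = torch.tensor([[1., 0., 1., 0.],
                       [0., 1., 0., 0.]])
pred_mc_ans = torch.tensor([[3., -2., -1., -3.],   # misses the third option of question 1
                            [-2., 2., -1., -2.]])  # gets every option of question 2 right
diff = torch.abs(mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())
mc_opt_err = 100 * torch.true_divide(diff.sum(), 4 * mc_ans.size(0))     # 12.5: 1 of 8 options wrong
mc_q_err = 100 * torch.true_divide(
    (diff.sum(dim=1, keepdim=True) != 0).float().sum(), mc_ans.size(0))  # 50.0: 1 of 2 questions has an error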
Example #15
0
    def train_epoch(self,
                    train_loader,
                    model,
                    optimizer,
                    train_meter,
                    cur_epoch,
                    cfg,
                    writer=None):
        """
        Perform the video training for one epoch.
        Args:
            train_loader (loader): video training loader.
            model (model): the video model to train.
            optimizer (optim): the optimizer to perform optimization on the model's
                parameters.
            train_meter (TrainMeter): training meters to log the training performance.
            cur_epoch (int): current epoch of training.
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
            writer (TensorboardWriter, optional): TensorboardWriter object
                to write Tensorboard logs.
        """
        # Enable train mode.
        model.train()
        train_meter.iter_tic()
        data_size = len(train_loader)
        start = time.time()
        btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        rankE = os.environ.get("RANK", None)
        worldE = os.environ.get("WORLD_SIZE", None)
        dSize = data_size * btch
        self.logger.info(
            "Train Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
            .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                    du.get_rank(), rankE, du.get_world_size(), worldE))
        tot = 0
        first = True
        predsAll = []
        labelsAll = []

        for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
            # Transfer the data to the current GPU device.
            tot += len(labels)
            if isinstance(inputs, (list, )):
                if first:
                    self.logger.info(
                        "rank {} LEN {}  {} shape Slow {} Fast {} {} tot {}".
                        format(du.get_rank(), len(labels), len(inputs),
                               inputs[0].shape, inputs[1].shape,
                               labels[0].shape, tot))
                    first = False
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                if first:
                    self.logger.info(
                        "rank {} LEN {} shape {} {} tot {}".format(
                            du.get_rank(), len(labels), inputs.shape,
                            labels[0].shape, tot))
                    first = False
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()

            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            # Update the learning rate.
            lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                    cfg)
            optim.set_lr(optimizer, lr)
            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

            else:
                # Perform the forward pass.
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

            # check Nan Loss.
            misc.check_nan_losses(loss)

            # Perform the backward pass.
            optimizer.zero_grad()
            loss.backward()
            # Update the parameters.
            optimizer.step()

            if cfg.DETECTION.ENABLE:
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce([loss])[0]
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(None, None, None, loss, lr)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )
                ite = data_size * cur_epoch + cur_iter
                if du.is_master_proc():
                    self.logger.log_row(name='TrainLoss',
                                        iter=ite,
                                        loss=loss,
                                        description="train loss")
                    self.logger.log_row(name='TrainLr',
                                        iter=ite,
                                        lr=lr,
                                        description="train learn rate")

            else:
                top1_err, top5_err = None, None
                if cfg.DATA.MULTI_LABEL:
                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        [loss] = du.all_reduce([loss])
                    loss = loss.item()
                else:
                    # Binary classifier - save preds / labels for metrics
                    if cfg.MODEL.NUM_CLASSES == 2:
                        predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                        labelsAll.extend(labels.detach().cpu().numpy())
                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))
                    top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                          for x in num_topks_correct]

                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        loss, top1_err, top5_err = du.all_reduce(
                            [loss, top1_err, top5_err])

                    # Copy the stats from GPU to CPU (sync point).
                    loss, top1_err, top5_err = (
                        loss.item(),
                        top1_err.item(),
                        top5_err.item(),
                    )

                train_meter.iter_toc()
                # Update and log stats.
                # self.logger.info("UPDATING stat {} {} {}".format(inputs[0].size(0), cfg.NUM_GPUS, inputs[0].size(0) * cfg.NUM_GPUS))
                train_meter.update_stats(top1_err, top5_err, loss, lr,
                                         inputs[0].size(0) * cfg.NUM_GPUS)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr,
                            "Train/Top1_err": top1_err,
                            "Train/Top5_err": top5_err,
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )

            stats = train_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                               labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'TrainIter')
            train_meter.iter_tic()

        if du.is_master_proc() and cfg.LOG_MODEL_INFO:
            misc.log_model_info(model, cfg, use_train_input=True)
        # Log epoch stats.
        gathered = du.all_gather([
            torch.tensor(predsAll).to(torch.device("cuda")),
            torch.tensor(labelsAll).to(torch.device("cuda"))
        ])
        stats = train_meter.log_epoch_stats(cur_epoch,
                                            gathered[0].detach().cpu().numpy(),
                                            gathered[1].detach().cpu().numpy())
        ite = (cur_epoch + 1) * dSize
        self.plotStats(stats, ite, 'TrainEpoch')
        train_meter.reset()
        end = time.time()
        el = end - start
        totAll = du.all_reduce([torch.tensor(tot).cuda()], average=False)
        tSum = totAll[0].item()
        elT = torch.tensor(el).cuda()
        elMax = du.all_reduce([elT], op=dist.ReduceOp.MAX,
                              average=False)[0].item()
        jobRate = tSum / elMax
        self.logger.info(
            "totSampCnt {} workerSampCnt {}  eTimeMax {} eTimeWorker {}  SampPerSecJob {:.1f} SampPerSecWorker {:.1f}"
            .format(tSum, tot, elMax, el, jobRate, tot / el))
        return jobRate
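
# Illustrative only: the throughput bookkeeping above sums per-worker sample counts but reduces the
# elapsed times with MAX, because the epoch only finishes once the slowest worker does. Numbers
# below are made up for the example.
tot_per_worker = [1000, 1000]        # samples processed by two workers
elapsed_per_worker = [90.0, 100.0]   # elapsed seconds on the same two workers
job_samples_per_sec = sum(tot_per_worker) / max(elapsed_per_worker)         # 20.0 samples/s for the job
worker_rates = [n / t for n, t in zip(tot_per_worker, elapsed_per_worker)]  # ~11.1 and 10.0 per worker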
Example #16
0
    def eval_epoch(self,
                   val_loader,
                   model,
                   val_meter,
                   cur_epoch,
                   cfg,
                   writer=None):
        """
        Evaluate the model on the val set.
        Args:
            val_loader (loader): data loader to provide validation data.
            model (model): model to evaluate the performance.
            val_meter (ValMeter): meter instance to record and calculate the metrics.
            cur_epoch (int): number of the current epoch of training.
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
            writer (TensorboardWriter, optional): TensorboardWriter object
                to write Tensorboard logs.
        """

        # Evaluation mode enabled. The running stats would not be updated.
        model.eval()
        data_size = len(val_loader)
        btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        rankE = os.environ.get("RANK", None)
        worldE = os.environ.get("WORLD_SIZE", None)
        dSize = data_size * btch
        self.logger.info(
            "Val Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
            .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                    du.get_rank(), rankE, du.get_world_size(), worldE))

        val_meter.iter_tic()
        predsAll = []
        labelsAll = []
        data_size = len(val_loader)

        for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                          dim=0)
                    metadata = torch.cat(du.all_gather_unaligned(metadata),
                                         dim=0)

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                       metadata.cpu())

            else:
                preds = model(inputs)

                if cfg.DATA.MULTI_LABEL:
                    if cfg.NUM_GPUS > 1:
                        preds, labels = du.all_gather([preds, labels])
                else:
                    if cfg.MODEL.NUM_CLASSES == 2:
                        predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                        labelsAll.extend(labels.detach().cpu().numpy())

                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))

                    # Combine the errors across the GPUs.
                    top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                          for x in num_topks_correct]
                    if cfg.NUM_GPUS > 1:
                        top1_err, top5_err = du.all_reduce(
                            [top1_err, top5_err])

                    # Copy the errors from GPU to CPU (sync point).
                    top1_err, top5_err = top1_err.item(), top5_err.item()

                    val_meter.iter_toc()
                    # Update and log stats.
                    val_meter.update_stats(top1_err, top5_err,
                                           inputs[0].size(0) * cfg.NUM_GPUS)
                    # write to tensorboard format if available.
                    if writer is not None:
                        writer.add_scalars(
                            {
                                "Val/Top1_err": top1_err,
                                "Val/Top5_err": top5_err
                            },
                            global_step=len(val_loader) * cur_epoch + cur_iter,
                        )

                    if du.is_master_proc():
                        ite = len(val_loader) * cur_epoch + cur_iter
                        self.logger.log_row(name='ValTop1',
                                            iter=ite,
                                            lr=top1_err,
                                            description="Top 1 Err")
                        self.logger.log_row(name='ValTop5',
                                            iter=ite,
                                            lr=top5_err,
                                            description="Top 5 Err")

                val_meter.update_predictions(preds, labels)

            stats = val_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                             labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'ValIter')

            val_meter.iter_tic()

        # Log epoch stats.
        gathered = du.all_gather([
            torch.tensor(predsAll).to(torch.device("cuda")),
            torch.tensor(labelsAll).to(torch.device("cuda"))
        ])
        stats = val_meter.log_epoch_stats(cur_epoch,
                                          gathered[0].detach().cpu().numpy(),
                                          gathered[1].detach().cpu().numpy())
        ite = (cur_epoch + 1) * dSize
        self.plotStats(stats, ite, 'ValEpoch')

        # write to tensorboard format if available.
        if writer is not None:
            if cfg.DETECTION.ENABLE:
                writer.add_scalars({"Val/mAP": val_meter.full_map},
                                   global_step=cur_epoch)
            all_preds_cpu = [
                pred.clone().detach().cpu() for pred in val_meter.all_preds
            ]
            all_labels_cpu = [
                label.clone().detach().cpu() for label in val_meter.all_labels
            ]
            # plotScatter(all_preds_cpu, all_labels_cpu, "Epoch_{}".format(cur_epoch))
            # writer.plot_eval(
            #     preds=all_preds_cpu, labels=all_labels_cpu, global_step=cur_epoch
            # )
        val_meter.reset()
Example #17
0
    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) corresponds to top-1 and top-5 accuracy.
        """
        if not all(self.clip_count == self.num_clips):
            logger.warning("clip count {} ~= num clips {}".format(
                ", ".join([
                    "{}: {}".format(i, k)
                    for i, k in enumerate(self.clip_count.tolist())
                ]),
                self.num_clips,
            ))

        self.stats = {"split": "test_final"}
        timestamp = datetime.datetime.now().isoformat()
        computed_representations_path = "comp_repr_{}.csv".format(timestamp)
        actual_labels_path = "act_labels_{}.csv".format(timestamp)
        csv_repr_file = open(computed_representations_path, "w")
        csv_label_file = open(actual_labels_path, "w")
        csv_repr_writer = csv.writer(csv_repr_file)
        csv_label_writer = csv.writer(csv_label_file)
        csv_repr_writer.writerows(self.video_preds.tolist())
        csv_label_writer.writerows([[self.video_labels.tolist()]])
        csv_repr_file.close()
        csv_label_file.close()
        logger.info("Saving computed representations to {}".format(
            computed_representations_path))
        logger.info("Running linear model on the computed representations")
        logger.info("Running for {} iterations".format(self.lin_epochs))
        iter = 0
        logit_model = LogisticRegression(self.video_preds.shape[-1],
                                         self.num_test_classes)
        optimizer = torch.optim.SGD(logit_model.parameters(), lr=0.01)
        for epoch in range(int(self.lin_epochs)):
            optimizer.zero_grad()
            self.final_preds = logit_model(self.video_preds.cpu())
            loss = torch.nn.CrossEntropyLoss()(self.final_preds,
                                               self.video_labels)
            loss.backward()
            optimizer.step()
            iter += 1
            if iter % 500 == 0:
                # calculate Accuracy
                # Evaluate the linear model's predictions, not the raw input features.
                _, predicted = torch.max(self.final_preds.data, 1)
                total = self.video_preds.size(0)
                correct = (predicted == self.video_labels).sum()
                accuracy = 100 * correct / total
                print("Iteration: {}. Loss: {}. Accuracy: {}.".format(
                    iter, loss.item(), accuracy))

        logger.info("Approx Acc of the linear model {}", accuracy)
        self.video_preds = self.video_preds_res
        if self.multi_label:
            map = get_map(self.video_preds.cpu().numpy(),
                          self.video_labels.cpu().numpy())
            self.stats["map"] = map
        else:
            num_topks_correct = metrics.topks_correct(self.video_preds,
                                                      self.video_labels, ks)
            topks = [(x / self.video_preds.size(0)) * 100.0
                     for x in num_topks_correct]
            assert len({len(ks), len(topks)}) == 1
            for k, topk in zip(ks, topks):
                self.stats["top{}_acc".format(k)] = "{:.{prec}f}".format(
                    topk, prec=2)
        logging.log_json_stats(self.stats)
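
# The LogisticRegression class used above is not shown in this example; a minimal sketch consistent
# with how it is called (constructed from a feature dimension and a class count, applied to a
# feature tensor, trained with SGD and CrossEntropyLoss) could look like this.
import torch.nn as nn

class LogisticRegressionSketch(nn.Module):
    def __init__(self, feat_dim, num_classes):
        super().__init__()
        # A single linear layer; CrossEntropyLoss applies the softmax internally.
        self.linear = nn.Linear(feat_dim, num_classes)

    def forward(self, x):
        return self.linear(x)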
Example #18
0
def eval_epoch(val_loader, model, val_meter, cur_epoch, nep, cfg):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        nep (optional): Neptune logging handle; if not None, validation
            metrics are logged to it at the end of the epoch.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

            preds = preds.cpu()
            ori_boxes = meta["ori_boxes"].cpu()
            metadata = meta["metadata"].cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                   metadata.cpu())

        else:

            preds = model(inputs)
            aux_loss_keys = []
            if cfg.PREDICTIVE.ENABLE:
                aux_loss_keys.append('pred_errors')
                errors = preds['pred_errors']
                pred_loss = errors.mean()

            if 'frame_errors' in preds:
                aux_loss_keys.append('frame_errors')
                frame_errors = preds['frame_errors']

            if cfg.PREDICTIVE.CPC:
                aux_loss_keys.append('cpc_loss')
                cpc_loss = preds['cpc_loss']

            if cfg.SUPERVISED:
                preds = preds['logits']

            # Explicitly declare reduction to mean.
            if cfg.MODEL.LOSS_FUNC != '' and cfg.SUPERVISED:
                loss_fun = losses.get_loss_func(
                    cfg.MODEL.LOSS_FUNC)(reduction="mean")

                # Compute the loss.
                loss = loss_fun(preds, labels)
                # total_loss = total_loss + loss

                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                if cfg.PREDICTIVE.ENABLE:
                    pred_loss = du.all_reduce([pred_loss])[0]

                if cfg.PREDICTIVE.CPC:
                    cpc_loss = du.all_reduce([cpc_loss])[0]

                if cfg.SUPERVISED:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

            # # Copy the stats from GPU to CPU (sync point).
            # loss, top1_err, top5_err = (
            #     loss.item(),
            #     top1_err.item(),
            #     top5_err.item(),
            # )

            # if cfg.NUM_GPUS > 1:
            #     top1_err, top5_err = du.all_reduce([top1_err, top5_err])

            # Copy the errors from GPU to CPU (sync point).
            loss_logs = {}
            if 'pred_errors' in aux_loss_keys:
                loss_logs['loss_pred'] = pred_loss.item()

            if 'frame_errors' in aux_loss_keys:
                loss_logs['frame_errors'] = frame_errors.mean().item()

            if 'cpc_loss' in aux_loss_keys:
                loss_logs['loss_cpc'] = cpc_loss.item()

            if cfg.SUPERVISED:
                loss_logs['loss_class'] = loss.item()
                loss_logs['top1_err'] = top1_err.item()
                loss_logs['top5_err'] = top5_err.item()

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(inputs[0].size(0) * cfg.NUM_GPUS,
                                   **loss_logs)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # neptune update
    if nep is not None:
        for k, v in loss_logs.items():
            # str.strip removes a character set, not a prefix; slice the 'loss_' prefix off instead.
            metric_name = k[len('loss_'):] if k.startswith('loss_') else k
            nep.log_metric('val_' + metric_name,
                           val_meter.stats[k].get_global_avg())

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
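
# Illustrative only: str.strip removes a *set* of characters from both ends rather than a literal
# prefix, which is why the Neptune metric names above slice off the 'loss_' prefix explicitly.
assert 'loss_class'.strip('loss_') == 'cla'      # l/o/s/_ stripped from both ends mangles the name
assert 'loss_class'[len('loss_'):] == 'class'    # explicit prefix removal keeps the name intact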
Example #19
0
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, cnt):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        cnt (int): running iteration counter carried across epochs; used for
            logging and returned at the end of the epoch.
    """
    # Enable train mode.
    model.train()
    if cfg.BN.FREEZE:
        model.freeze_fn('bn_statistics')

    train_meter.iter_tic()
    data_size = len(train_loader)

    #for cur_iter, (inputs, bboxs, masks, labels, _, meta) in enumerate(train_loader):
    for cur_iter, output_dict in enumerate(train_loader):
        if cfg.EPICKITCHENS.USE_BBOX:
            inputs = output_dict['inputs']
            bboxs = output_dict['bboxs']
            masks = output_dict['masks']
            labels = output_dict['label'] 
            # output_dict['index'] 
            meta = output_dict['metadata'] 
        else:
            inputs = output_dict['inputs']
            labels = output_dict['label'] 
            meta = output_dict['metadata'] 
        

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        if isinstance(labels, (dict,)):
            labels = {k: v.cuda() for k, v in labels.items()}
        else:
            labels = labels.cuda()
        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            if cfg.EPICKITCHENS.USE_BBOX:
                if isinstance(bboxs, (list,)):
                    for i in range(len(bboxs)):
                        bboxs[i] = bboxs[i].cuda(non_blocking=True)
                        masks[i] = masks[i].cuda(non_blocking=True)
                else:
                    bboxs = bboxs.cuda(non_blocking=True)
                    masks = masks.cuda(non_blocking=True)
                
                preds = model(inputs, bboxes=bboxs, masks=masks)
            else:
                preds = model(inputs)

        if isinstance(labels, (dict,)):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)
            # check Nan Loss.
            misc.check_nan_losses(loss)
        else:
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
            # Compute the loss.
            loss = loss_fun(preds, labels)
            # check Nan Loss.
            misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
        else:
            if isinstance(labels, (dict,)):
                # Compute the verb accuracies.
                verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(preds[0], labels['verb'], (1, 5))
                
                # predicted_answer_softmax = torch.nn.Softmax(dim=1)(preds[0])
                # predicted_answer_max = torch.max(predicted_answer_softmax.data, 1).indices
                # print(cnt, predicted_answer_max, labels['verb'])

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                        [loss_verb, verb_top1_acc, verb_top5_acc]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss_verb, verb_top1_acc, verb_top5_acc = (
                    loss_verb.item(),
                    verb_top1_acc.item(),
                    verb_top5_acc.item(),
                )

                # Compute the noun accuracies.
                noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(preds[1], labels['noun'], (1, 5))

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                        [loss_noun, noun_top1_acc, noun_top5_acc]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss_noun, noun_top1_acc, noun_top5_acc = (
                    loss_noun.item(),
                    noun_top1_acc.item(),
                    noun_top5_acc.item(),
                )

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce(
                        [loss]
                    )
                    if isinstance(loss, (list,)):
                        loss = loss[0]

                # Copy the stats from GPU to CPU (sync point).
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(
                    (verb_top1_acc, noun_top1_acc),
                    (verb_top5_acc, noun_top5_acc),
                    (loss_verb, loss_noun, loss),
                    lr, inputs[0].size(0) * cfg.NUM_GPUS
                )
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(
                    top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
                )
        train_meter.log_iter_stats(cur_epoch, cur_iter, cnt)
        train_meter.iter_tic()
        cnt += 1
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    return cnt
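
# Illustrative only: the verb/noun branch above logs top-k *accuracies* while the single-head branch
# logs top-k *errors*; the two are complementary percentages of the same correct counts.
num_correct = 24   # made-up top-1 correct count in a batch
batch_size = 32
top1_acc = num_correct / batch_size * 100.0          # 75.0
top1_err = (1.0 - num_correct / batch_size) * 100.0  # 25.0 == 100 - top1_acc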
Example #20
0
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['des_q']
        des_ans = sampled_batch['question_dict']['des_ans']
        mc_q = sampled_batch['question_dict']['mc_q']
        mc_ans = sampled_batch['question_dict']['mc_ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            mc_q = mc_q.cuda(non_blocking=True)
            mc_ans = mc_ans.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        pred_des_ans = model(frames, des_q, True)
        pred_mc_ans = model(frames, mc_q, False)
        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
        # Compute the loss.
        loss = des_loss_fun(pred_des_ans, des_ans)
        loss += mc_loss_fun(pred_mc_ans, mc_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Check if plateau

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        diff_mc_ans = torch.abs(
            mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
        mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                             (4 * des_q.size()[0]))
        mc_q_err = 100 * torch.true_divide(
            (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
            des_q.size()[0])
        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err, mc_opt_err, mc_q_err = du.all_reduce(
                [loss, top1_err, top5_err, mc_opt_err, mc_q_err])
        # Copy the stats from GPU to CPU (sync point).
        loss, top1_err, top5_err, mc_opt_err, mc_q_err = (loss.item(),
                                                          top1_err.item(),
                                                          top5_err.item(),
                                                          mc_opt_err.item(),
                                                          mc_q_err.item())

        # Update and log stats.
        train_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss,
            lr,
            frames.size()[0] * max(
                cfg.NUM_GPUS, 1
            ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
        )
        # write to tensorboard format if available.
        if writer is not None:
            writer.add_scalars(
                {
                    "Train/loss": loss,
                    "Train/lr": lr,
                    "Train/Top1_err": top1_err,
                    "Train/Top5_err": top5_err,
                    "Train/mc_opt_err": mc_opt_err,
                    "Train/mc_q_err": mc_q_err,
                },
                global_step=data_size * cur_epoch + cur_iter,
            )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
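
# Illustrative only: the combined objective above adds a cross-entropy loss over the descriptive
# answer classes to a BCE-with-logits loss over the 4 multiple-choice options. Shapes and class
# counts below are assumptions for the sketch.
import torch
import torch.nn as nn

des_logits = torch.randn(8, 21)                    # (batch, num_descriptive_answer_classes)
des_targets = torch.randint(0, 21, (8,))
mc_logits = torch.randn(8, 4)                      # (batch, num_options)
mc_targets = torch.randint(0, 2, (8, 4)).float()
loss = nn.CrossEntropyLoss()(des_logits, des_targets) + nn.BCEWithLogitsLoss()(mc_logits, mc_targets)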
Example #21
0
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ClevrerValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        test_imp (bool): if True, print a small sample of predictions and
            answers for a quick implementation check and stop after the
            first batch.
    """
    test_counter = 0
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, sampled_batch in enumerate(val_loader):
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # des_len = sampled_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(frames, (list,)):
                for i in range(len(frames)):
                    frames[i] = frames[i].cuda(non_blocking=True)
            else:
                frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            # des_len = des_len.cuda(non_blocking=True)

        val_meter.data_toc()

        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        pred_des_ans = model(frames, des_q, True)
        loss_des_val = des_loss_fun(pred_des_ans, des_ans)

        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans, (1, 5))
        # Combine the errors across the GPUs.
        top1_err, top5_err = [
            (1.0 - x / pred_des_ans.size(0)) * 100.0 for x in num_topks_correct
        ]
        loss_mc_val = None
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (
            loss_des_val.item(),
            top1_err.item(),
            top5_err.item()
        )

        val_meter.iter_toc()
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, mb_size_des, mb_size_mc
        # Update and log stats.
        val_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss_des_val,
            loss_mc_val,
            des_q.size()[0],
            mb_size_mc
        )
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            test_counter += 1
            if test_counter == 1: 
                break

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
Example #22
0
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer=None):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta, boxes,
                   b_indices) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        val_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"]
            metadata = meta["metadata"]

            if cfg.NUM_GPUS:
                preds = preds.cpu()
                ori_boxes = ori_boxes.cpu()
                metadata = metadata.cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds, ori_boxes, metadata)

        else:
            preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    inputs[0].size(0) * max(
                        cfg.NUM_GPUS, 1
                    ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
                )
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err
                        },
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars({"Val/mAP": val_meter.full_map},
                               global_step=cur_epoch)
        else:
            all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
            all_labels = [
                label.clone().detach() for label in val_meter.all_labels
            ]
            if cfg.NUM_GPUS:
                all_preds = [pred.cpu() for pred in all_preds]
                all_labels = [label.cpu() for label in all_labels]
            writer.plot_eval(preds=all_preds,
                             labels=all_labels,
                             global_step=cur_epoch)

    val_meter.reset()
Example #23
0
def perform_test(test_dloader, model, cfg):
    model.eval()
    ens_number = cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS
    # Collect predictions
    collect_pred = []
    collect_vids = []
    for cur_iter, (inputs, labels, vids,
                   extra_data) in enumerate(tqdm(test_dloader, ncols=80)):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(
                        non_blocking=cfg.DATA_LOADER.PIN_MEMORY)
            else:
                inputs = inputs.cuda(non_blocking=cfg.DATA_LOADER.PIN_MEMORY)
            for key, val in extra_data.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    extra_data[key] = val.cuda(non_blocking=True)
        with torch.no_grad():
            if cfg.DETECTION.ENABLE:
                preds = model(inputs, extra_data["boxes"])
            else:
                preds = model(inputs)
            preds = preds.cpu()
        collect_pred.append(preds)
        collect_vids.append(vids)
    # Make vid2label
    idx2label, label2idx = test_dloader.dataset.idx2label, test_dloader.dataset.label2idx
    test_datas = test_dloader.dataset.data
    vid2label = {}
    for label, vid, _ in test_datas:
        if vid not in vid2label:
            vid2label[vid] = torch.LongTensor([label2idx[label]])
    # Gather vid2logits
    vid2logits = {}
    for preds, vids in tqdm(zip(collect_pred, collect_vids)):
        B = preds.size(0)
        for b in range(B):
            if vids[b] not in vid2logits:
                vid2logits[vids[b]] = preds[b]
            else:
                if cfg.TEST.ENSEMBLE_METHOD == 'sum':
                    vid2logits[vids[b]] += preds[b]
                elif cfg.TEST.ENSEMBLE_METHOD == 'max':
                    vid2logits[vids[b]], _ = torch.stack(
                        (vid2logits[vids[b]], preds[b])).max(0)
    # Calculate accuracy
    results = defaultdict(list)
    for vid in vid2label:
        #vid2logits[vid] = torch.tensor(vid2logits[vid])
        if cfg.TEST.ENSEMBLE_METHOD == 'sum':
            vid2logits[vid] = vid2logits[vid] / ens_number
        preds = vid2logits[vid].unsqueeze(0)
        labels = vid2label[vid].unsqueeze(0)
        top1_tensor, top2_tensor = metrics.topks_correct(preds, labels, (1, 2))
        results['top1'] += top1_tensor.tolist()
        results['top2'] += top2_tensor.tolist()
        #results['top1_'+idx2label[labels[0]]] += top1_tensor.tolist()
        #results['top2_'+idx2label[labels[0]]] += top2_tensor.tolist()
    print_str = "test: "
    for key in results:
        results[key] = sum(results[key]) / len(results[key])
        print_str += "{}: {} \n".format(key, results[key])
    print(print_str)
    with open(os.path.join(cfg.TEST_OUTPUT_DIR, 'res_out.txt'), 'a') as f:
        f.write(print_str)
    with open(os.path.join(cfg.TEST_OUTPUT_DIR, 'pred_logits.pkl'), 'wb') as f:
        pickle.dump(vid2logits, f)
    return results
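
# Illustrative only: the two ensembling modes above either average the per-view logits ('sum',
# divided by the number of views) or take an element-wise maximum over views ('max').
import torch

view_logits = [torch.tensor([0.2, 1.5, -0.3]), torch.tensor([1.1, 0.4, 0.0])]
sum_ensemble = torch.stack(view_logits).sum(0) / len(view_logits)  # mean over views
max_ensemble, _ = torch.stack(view_logits).max(0)                  # element-wise max over views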
Example #24
0
def train_epoch(train_loader,
                model,
                optimizer,
                scheduler,
                train_meter,
                cur_epoch,
                cfg,
                test_imp=False):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        frames = sampled_batch['res_ft']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()

        train_meter.data_toc()
        #Pass through
        model.zero_grad()
        pred_des_ans = model(frames, des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
        #Save for stats
        loss_des_val = loss

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                            top1_err.item(), top5_err.item())
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(top1_err, top5_err, mc_opt_err, mc_q_err,
                                 loss_des_val, None,
                                 scheduler.get_last_lr()[0],  # one lr per param group; take the first
                                 des_ans.size(0), mb_size_mc)
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
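
# Illustrative only: per-iteration scheduler stepping as in the loop above. get_last_lr() returns a
# list with one entry per parameter group, hence the [0] when a single scalar lr is reported.
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
optimizer.step()
scheduler.step()
current_lr = scheduler.get_last_lr()[0]  # lr of the single parameter group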
Example #25
0
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
        else:
            # Compute the errors.
            num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
            top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                  for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss, top1_err, top5_err = du.all_reduce(
                    [loss, top1_err, top5_err])

            # Copy the stats from GPU to CPU (sync point).
            loss, top1_err, top5_err = (
                loss.item(),
                top1_err.item(),
                top5_err.item(),
            )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(top1_err, top5_err, loss, lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS)

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
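
The top-1/top-5 bookkeeping above divides the number of correct top-k hits by the batch size and converts the result into an error percentage. A small stand-in that reproduces that computation end to end (a sketch for illustration; the real helper lives in slowfast.utils.metrics and may differ in detail):

import torch

def topks_correct_sketch(preds, labels, ks):
    # Indices of the max(ks) highest-scoring classes per sample: (max(ks), N).
    _, top_idx = preds.topk(max(ks), dim=1, largest=True, sorted=True)
    top_idx = top_idx.t()
    # Compare each top-k row against the ground-truth labels.
    correct = top_idx.eq(labels.view(1, -1).expand_as(top_idx))
    # For every k, count the samples whose label appears in the top k.
    return [correct[:k].reshape(-1).float().sum() for k in ks]

preds = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
num_topks_correct = topks_correct_sketch(preds, labels, (1, 5))
top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct]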
Example #26
0
def train_epoch(
        train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer=None
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    # Check if the correct params are set to requires_grad = True
    assert_requires_grad_correctness(model, du.is_master_proc(), cfg)
    train_meter.iter_tic()
    data_size = len(train_loader)
    np.set_printoptions(suppress=True)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.MODEL.HEAD_ACT == "softmax" and cfg.TRAIN.DATASET == "custom":
            # Cross-entropy with a softmax head expects integer class indices,
            # so cast the labels to a LongTensor before moving them to GPU.
            labels = labels.type(torch.LongTensor)
            labels = labels.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr, cfg)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"], is_train=True)

        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        """
        if cur_iter % 70 == 0:
            softmax = torch.nn.Softmax(dim=1)
            probabilities = softmax(preds)
            loss_prob = loss_fun(probabilities, labels)
            preds_numpy = probabilities.cpu().detach().numpy()
            preds_numpy = np.round(preds_numpy, 4)
            labels_numpy = labels.cpu().detach().numpy()
            print("--------------------------")
            for label, pred in zip (labels_numpy, preds_numpy):
                print(str(label) + "---->", end= "")
                print(pred[label])
        """


        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        # TODO: adjust accordingly.
        if cfg.DETECTION.ENABLE:  # and not (cfg.MODEL.HEAD_ACT == "softmax"):
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr},
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(
                top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
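
optim.get_epoch_lr above is evaluated at the fractional epoch cur_epoch + cur_iter / data_size, so the learning rate changes smoothly within an epoch rather than only at epoch boundaries. A hedged sketch of such a schedule, with linear warmup followed by half-period cosine decay (the parameter names and defaults are assumptions, not the values in slowfast/config/defaults.py):

import math

def get_epoch_lr_sketch(cur_epoch, base_lr=0.1, max_epoch=100,
                        warmup_epochs=5, warmup_start_lr=0.01):
    # Linear warmup over the first warmup_epochs (fractional) epochs.
    if cur_epoch < warmup_epochs:
        alpha = cur_epoch / warmup_epochs
        return warmup_start_lr + (base_lr - warmup_start_lr) * alpha
    # Half-period cosine decay for the remaining epochs.
    t = (cur_epoch - warmup_epochs) / (max_epoch - warmup_epochs)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * t))

# Iteration 50 of 200 in epoch 3 maps to fractional epoch 3.25.
print(get_epoch_lr_sketch(3 + 50 / 200))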
def eval_epoch(val_loader,
               model,
               val_meter,
               cur_epoch,
               cfg,
               writer=None,
               wandb_log=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if isinstance(labels, (dict, )):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        val_meter.data_toc()

        preds = model(inputs)

        if isinstance(labels, (dict, )):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)

            # Compute the verb accuracies.
            verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                preds[0], labels['verb'], (1, 5))

            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                    [loss_verb, verb_top1_acc, verb_top5_acc])

            # Copy the errors from GPU to CPU (sync point).
            loss_verb, verb_top1_acc, verb_top5_acc = (
                loss_verb.item(),
                verb_top1_acc.item(),
                verb_top5_acc.item(),
            )

            # Compute the noun accuracies.
            noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                preds[1], labels['noun'], (1, 5))

            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                    [loss_noun, noun_top1_acc, noun_top5_acc])

            # Copy the errors from GPU to CPU (sync point).
            loss_noun, noun_top1_acc, noun_top5_acc = (
                loss_noun.item(),
                noun_top1_acc.item(),
                noun_top5_acc.item(),
            )

            # Compute the action accuracies.
            action_top1_acc, action_top5_acc = metrics.multitask_topk_accuracies(
                (preds[0], preds[1]), (labels['verb'], labels['noun']), (1, 5))
            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss, action_top1_acc, action_top5_acc = du.all_reduce(
                    [loss, action_top1_acc, action_top5_acc])

            # Copy the errors from GPU to CPU (sync point).
            loss, action_top1_acc, action_top5_acc = (
                loss.item(),
                action_top1_acc.item(),
                action_top5_acc.item(),
            )

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(
                (verb_top1_acc, noun_top1_acc, action_top1_acc),
                (verb_top5_acc, noun_top5_acc, action_top5_acc),
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Val/loss": loss,
                        "Val/Top1_acc": action_top1_acc,
                        "Val/Top5_acc": action_top5_acc,
                        "Val/verb/loss": loss_verb,
                        "Val/verb/Top1_acc": verb_top1_acc,
                        "Val/verb/Top5_acc": verb_top5_acc,
                        "Val/noun/loss": loss_noun,
                        "Val/noun/Top1_acc": noun_top1_acc,
                        "Val/noun/Top5_acc": noun_top5_acc,
                    },
                    global_step=len(val_loader) * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Val/loss": loss,
                        "Val/Top1_acc": action_top1_acc,
                        "Val/Top5_acc": action_top5_acc,
                        "Val/verb/loss": loss_verb,
                        "Val/verb/Top1_acc": verb_top1_acc,
                        "Val/verb/Top5_acc": verb_top5_acc,
                        "Val/noun/loss": loss_noun,
                        "Val/noun/Top1_acc": noun_top1_acc,
                        "Val/noun/Top5_acc": noun_top5_acc,
                        "val_step": len(val_loader) * cur_epoch + cur_iter,
                    }, )

            val_meter.update_predictions((preds[0], preds[1]),
                                         (labels['verb'], labels['noun']))

        else:
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])

            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    inputs[0].size(0) * max(
                        cfg.NUM_GPUS, 1
                    ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
                )
                # write to tensorboard format if available.
                if writer is not None and not wandb_log:
                    writer.add_scalars(
                        {
                            "Val/loss": loss,
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err,
                        },
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

                if wandb_log:
                    wandb.log(
                        {
                            "Val/loss": loss,
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err,
                            "val_step": len(val_loader) * cur_epoch + cur_iter,
                        }, )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    is_best_epoch, top1_dict = val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
        all_labels = [label.clone().detach() for label in val_meter.all_labels]
        if cfg.NUM_GPUS:
            all_preds = [pred.cpu() for pred in all_preds]
            all_labels = [label.cpu() for label in all_labels]
        writer.plot_eval(preds=all_preds,
                         labels=all_labels,
                         global_step=cur_epoch)

    if writer is not None and not wandb_log:
        if "top1_acc" in top1_dict.keys():
            writer.add_scalars(
                {
                    "Val/epoch/Top1_acc": top1_dict["top1_acc"],
                    "Val/epoch/verb/Top1_acc": top1_dict["verb_top1_acc"],
                    "Val/epoch/noun/Top1_acc": top1_dict["noun_top1_acc"],
                },
                global_step=cur_epoch,
            )

        else:
            writer.add_scalars(
                {"Val/epoch/Top1_err": top1_dict["top1_err"]},
                global_step=cur_epoch,
            )

    if wandb_log:
        if "top1_acc" in top1_dict.keys():
            wandb.log(
                {
                    "Val/epoch/Top1_acc": top1_dict["top1_acc"],
                    "Val/epoch/verb/Top1_acc": top1_dict["verb_top1_acc"],
                    "Val/epoch/noun/Top1_acc": top1_dict["noun_top1_acc"],
                    "epoch": cur_epoch,
                }, )

        else:
            wandb.log({
                "Val/epoch/Top1_err": top1_dict["top1_err"],
                "epoch": cur_epoch
            })

    top1 = (top1_dict["top1_acc"]
            if "top1_acc" in top1_dict else top1_dict["top1_err"])
    val_meter.reset()
    return is_best_epoch, top1
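
metrics.multitask_topk_accuracies above scores a clip as correct only when both the verb and the noun ground truth fall inside their respective top-k predictions. A compact stand-in with the same semantics (a sketch, not the library implementation; the class counts in the usage lines are arbitrary):

import torch

def multitask_topk_accuracies_sketch(preds, labels, ks):
    verb_preds, noun_preds = preds
    verb_labels, noun_labels = labels
    accuracies = []
    for k in ks:
        # Per-sample hit masks for each task.
        _, verb_topk = verb_preds.topk(k, dim=1)
        _, noun_topk = noun_preds.topk(k, dim=1)
        verb_hit = (verb_topk == verb_labels.view(-1, 1)).any(dim=1)
        noun_hit = (noun_topk == noun_labels.view(-1, 1)).any(dim=1)
        # A joint (action) hit requires both components to be correct.
        accuracies.append((verb_hit & noun_hit).float().mean() * 100.0)
    return accuracies

verb_preds, noun_preds = torch.randn(4, 97), torch.randn(4, 300)
verb_labels = torch.randint(0, 97, (4,))
noun_labels = torch.randint(0, 300, (4,))
top1, top5 = multitask_topk_accuracies_sketch(
    (verb_preds, noun_preds), (verb_labels, noun_labels), (1, 5))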
Example #28
0
label_class, vids, labels = [], [], []
for lidx, elem in enumerate(test_anno_json):
    label_class.append(elem['label'])
    labels += [lidx] * len(elem['vids'])
    vids += elem['vids']

pred_logits = torch.zeros((len(vids), len(label_class))) # [V x C]
for test_dir in test_dirs:
    with open(os.path.join('logdir', test_dir, 'pred_logits.pkl'), 'rb') as f:
        pred_dict = pickle.load(f)
    for vidx, vid in enumerate(vids):
        pred_logits[vidx] += pred_dict[vid].squeeze()

labels_torch = torch.LongTensor(labels)
top1_tensor, top2_tensor = metrics.topks_correct(pred_logits, labels_torch, (1, 2))
print_str = "top1: %.5f top2: %.5f\n" % (top1_tensor.mean(), top2_tensor.mean())
anno5_right_1, anno5_right_2, anno5_num = 0, 0, 0
for lidx, name in enumerate(label_class):
    mask = (labels_torch == lidx).float()
    top1 = (top1_tensor * mask).sum() / mask.sum()
    top2 = (top2_tensor * mask).sum() / mask.sum()
    print_str += name + " top1: %.5f top2: %.5f num: %d\n" % (top1, top2, mask.sum())
    # The five "anno5" classes: standing, walking, running, sitting, lying down.
    if label_class[lidx] in ['서있다', '걷다', '뛰다', '앉아있다', '누워있다']:
        anno5_right_1 += (top1_tensor * mask).sum()
        anno5_right_2 += (top2_tensor * mask).sum()
        anno5_num += mask.sum()
print("anno5 top1: %.5f top2: %.5f" % (anno5_right_1 / anno5_num, anno5_right_2 / anno5_num))
print(print_str)

# Make test csv
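
Unlike the training loops above, this script treats metrics.topks_correct as returning per-sample 0/1 correctness tensors, which it then averages globally and masks per class. A hypothetical helper with that per-sample behaviour, assuming pred_logits is an (N, C) tensor and labels_torch an (N,) LongTensor:

import torch

def topks_correct_per_sample(logits, labels, ks):
    # For each k, return a float tensor of shape (N,) holding 1.0 where the
    # ground-truth label is among the top-k logits and 0.0 otherwise.
    _, topk_idx = logits.topk(max(ks), dim=1)
    hits = topk_idx == labels.view(-1, 1)
    return [hits[:, :k].any(dim=1).float() for k in ks]

logits = torch.randn(6, 4)
labels = torch.randint(0, 4, (6,))
top1_tensor, top2_tensor = topks_correct_per_sample(logits, labels, (1, 2))
print(top1_tensor.mean().item(), top2_tensor.mean().item())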
Example #29
0
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta, boxes,
                   b_indices) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            preds = model(inputs, meta["boxes"])
        else:
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
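
Several of these loops call du.all_reduce([...]) before .item() so that the logged loss and errors reflect all ranks rather than only the local GPU. A rough stand-in built directly on torch.distributed (it assumes the default process group is already initialized; this is a sketch, not the slowfast.utils.distributed implementation):

import torch
import torch.distributed as dist

def all_reduce_avg(tensors):
    # Sum each tensor across ranks, then divide by the world size so every
    # rank ends up holding the global average.
    world_size = dist.get_world_size()
    reduced = []
    for t in tensors:
        t = t.clone()
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        reduced.append(t / world_size)
    return reduced

# Usage inside a training step, once loss/top1_err/top5_err are tensors:
# loss, top1_err, top5_err = all_reduce_avg([loss, top1_err, top5_err])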
Example #30
0
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # _________________________ save model test __________________________________________
        # Checkpoint-saving test: save every 100 iterations, passing cur_iter
        # where cur_epoch is normally expected.
        if cur_iter % 100 == 1:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_iter,
                               cfg)  # cur_epoch
            print("----------------------- save done ")
            # exit(0)
        # _____________________________________________________________________________________

        if cfg.DETECTION.ENABLE:
            # inputs: [4, 3, 8, 224, 224], preds: [32, 2048, 7, 7]
            # Reshape each pathway from [1, 3, 8, 224, 224] to [8, 3, 224, 224].
            ##################################################################################
            inputs0 = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs1 = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            meta["boxes"] = meta["boxes"].unsqueeze(0).unsqueeze(0)
            inputs = [inputs0, inputs1]
            preds = model(inputs, meta["boxes"])
            # #################################################################################################################################
            # import os
            # weights = 'checkpoints/checkpoint_epoch_00007.pyth'
            # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
            # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # chkpt = torch.load(weights, map_location=device)

            # try:
            #     model_dict = model.module.state_dict()
            # except AttributeError:
            #     model_dict = model.state_dict()  # Read the current state dict; multi-GPU training saves keys with a "module." prefix.
            #     # Drop checkpoint keys that do not exist in model_dict.
            #     chkpt = {k: v for k, v in chkpt.items() if k in model_dict}
            # print("load pretrained model")
            # model_dict.update(chkpt)
            # model.load_state_dict(model_dict)

            # model.to(device)
            # # inputs = [inputs.to(device)]
            # model.eval()
            # input_tensor = (inputs, meta["boxes"].to(device))
            # traced_script_module = torch.jit.trace(model, input_tensor)
            # traced_script_module.save("weights/sf_pytorch.pt")
            # print("************************* out put save **********************************")
            # exit(0)


        ##############################################################################################
        else:
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
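
The commented-out export path in this last example loads a checkpoint and drops the keys that are missing from the current model, which is the usual workaround when weights were saved from a DataParallel/DistributedDataParallel wrapper. A hedged sketch of that filtering step, independent of the slowfast checkpoint utilities (treating the file as a raw state_dict and stripping a "module." prefix are illustrative assumptions):

import torch

def load_filtered_state_dict(model, checkpoint_path, device="cpu"):
    # Load the saved weights and strip the optional "module." prefix that
    # DataParallel / DistributedDataParallel adds to every key.
    state = torch.load(checkpoint_path, map_location=device)
    state = {k.replace("module.", "", 1): v for k, v in state.items()}
    # Keep only the entries whose names and shapes match the current model.
    model_state = model.state_dict()
    filtered = {k: v for k, v in state.items()
                if k in model_state and v.shape == model_state[k].shape}
    model_state.update(filtered)
    model.load_state_dict(model_state)
    return model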