    def forward(ctx, input, num_sync_devices, num_groups):
        """
        Perform forwarding, gathering the stats across different process/ GPU
        group.
        """
        ctx.num_sync_devices = num_sync_devices
        ctx.num_groups = num_groups

        input_list = [
            torch.zeros_like(input) for k in range(du.get_local_size())
        ]
        dist.all_gather(
            input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP
        )

        inputs = torch.stack(input_list, dim=0)
        if num_groups > 1:
            rank = du.get_local_rank()
            group_idx = rank // num_sync_devices
            inputs = inputs[
                group_idx
                * num_sync_devices : (group_idx + 1)
                * num_sync_devices
            ]
        inputs = torch.sum(inputs, dim=0)
        return inputs

    def backward(ctx, grad_output):
        """
        Perform the backward pass, gathering the gradients across the
        processes/GPUs in the group.
        """
        grad_output_list = [
            torch.zeros_like(grad_output) for k in range(du.get_local_size())
        ]
        dist.all_gather(
            grad_output_list,
            grad_output,
            async_op=False,
            group=du._LOCAL_PROCESS_GROUP,
        )

        grads = torch.stack(grad_output_list, dim=0)
        if ctx.num_groups > 1:
            rank = du.get_local_rank()
            group_idx = rank // ctx.num_sync_devices
            grads = grads[
                group_idx
                * ctx.num_sync_devices : (group_idx + 1)
                * ctx.num_sync_devices
            ]
        grads = torch.sum(grads, dim=0)
        return grads, None, None
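
The forward/backward pair above is meant to live inside a torch.autograd.Function subclass and be invoked through .apply(). Below is a minimal, hypothetical sketch of such a wrapper, assuming the function class is named GroupGather and that the summed statistics are averaged over the devices in the sync group; the names are illustrative, not the repository's exact API.

import torch
import torch.nn as nn


class SyncStats(nn.Module):
    # Hypothetical module: sums per-device statistics with the autograd
    # function sketched above (assumed to be called GroupGather), then
    # averages them over the devices in the sync group.
    def __init__(self, num_sync_devices, num_groups):
        super().__init__()
        self.num_sync_devices = num_sync_devices
        self.num_groups = num_groups

    def forward(self, stats):
        summed = GroupGather.apply(stats, self.num_sync_devices, self.num_groups)
        return summed / self.num_sync_devices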
Example #3
    def _batch_unshuffle(self, x, idx_restore):
        if self.num_gpus > 1:
            if self.cfg.CONTRASTIVE.LOCAL_SHUFFLE_BN:
                x = du.cat_all_gather(x, local=True)
                gpu_idx = du.get_local_rank()
            else:
                x = du.cat_all_gather(x)
                gpu_idx = torch.distributed.get_rank()
        else:
            gpu_idx = 0

        idx = idx_restore[gpu_idx, :]
        x = x[idx]
        return x
Example #4
    def _simclr_precompute_pos_neg_mask_multi(self):
        # computed once at the beginning of training
        distributed = self.cfg.CONTRASTIVE.SIMCLR_DIST_ON
        if distributed:
            total_images = self.cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
            world_size = du.get_world_size()
            rank = du.get_rank()
        else:
            total_images = self.cfg.TRAIN.BATCH_SIZE
            world_size = du.get_local_size()
            rank = du.get_local_rank()
        local_orig_images = total_images // world_size
        local_crops = local_orig_images * self.num_crops

        pos_temps = []
        for d in np.arange(self.num_crops):
            pos_temp, neg_temp = [], []
            for i in range(world_size):
                if i == rank:
                    pos = np.eye(local_crops,
                                 k=d * local_orig_images) + np.eye(
                                     local_crops,
                                     k=-local_crops + d * local_orig_images)
                    neg = np.ones((local_crops, local_crops))
                else:
                    pos = np.zeros((local_crops, local_crops))
                    neg = np.zeros((local_crops, local_crops))
                pos_temp.append(pos)
                neg_temp.append(neg)
            pos_temps.append(np.hstack(pos_temp))
            neg_temp = np.hstack(neg_temp)

        pos_mask = []
        for i in range(self.num_crops - 1):
            pos_mask.append(torch.from_numpy(pos_temps[1 + i]))
        neg_mask = torch.from_numpy(neg_temp - sum(pos_temps))

        if self.num_gpus:
            for i in range(len(pos_mask)):
                pos_mask[i] = pos_mask[i].cuda(non_blocking=True)
            neg_mask = neg_mask.cuda(non_blocking=True)
        self.pos_mask, self.neg_mask = pos_mask, neg_mask
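
A minimal sketch of how the precomputed masks are typically consumed in a SimCLR-style loss. This is a hypothetical helper, not the repository's loss code: sim is assumed to be the temperature-scaled similarity matrix between the local crops and all gathered crops, pos_masks is the list built above, and neg_mask is the negative mask.

import torch


def simclr_loss_from_masks(sim, pos_masks, neg_mask):
    # sim: (local_crops, total_crops) similarities, already divided by the
    # temperature. pos_masks holds one mask per extra crop; neg_mask marks
    # the negatives for each anchor row.
    exp_sim = torch.exp(sim)
    neg = (exp_sim * neg_mask).sum(dim=1)
    loss = 0.0
    for pos_mask in pos_masks:
        pos = (exp_sim * pos_mask).sum(dim=1)
        loss = loss - torch.log(pos / (pos + neg)).mean()
    return loss / len(pos_masks)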
Example #5
    def _batch_shuffle(self, x):
        if len(x) == 2:
            another_crop = True
        else:
            another_crop = False
        if another_crop:
            x, x_crop = x[0], x[1]
        else:
            x = x[0]

        world_size = self.cfg.NUM_GPUS * self.cfg.NUM_SHARDS
        if self.num_gpus > 1:
            if self.cfg.CONTRASTIVE.LOCAL_SHUFFLE_BN:
                x = du.cat_all_gather(x, local=True)
                if another_crop:
                    x_crop = du.cat_all_gather(x_crop, local=True)
                world_size = du.get_local_size()
                gpu_idx = du.get_local_rank()
            else:
                x = du.cat_all_gather(x)
                if another_crop:
                    x_crop = du.cat_all_gather(x_crop)
                gpu_idx = torch.distributed.get_rank()

        idx_randperm = torch.randperm(x.shape[0]).cuda()
        if self.num_gpus > 1:
            torch.distributed.broadcast(idx_randperm, src=0)
        else:
            gpu_idx = 0
        idx_randperm = idx_randperm.view(world_size, -1)
        x = x[idx_randperm[gpu_idx, :]]
        if another_crop:
            x_crop = x_crop[idx_randperm[gpu_idx, :]]

        idx_restore = torch.argsort(idx_randperm.view(-1))
        idx_restore = idx_restore.view(world_size, -1)
        if another_crop:
            return [x, x_crop], idx_restore
        else:
            return [x], idx_restore
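
_batch_shuffle and _batch_unshuffle are intended to be used as a pair around the momentum (key) encoder, MoCo-style, so that per-GPU BatchNorm statistics cannot leak which samples belong together. A hypothetical usage sketch from inside the model's forward pass; x_key and self.momentum_encoder are assumed names, not the repository's exact attributes.

with torch.no_grad():
    # Shuffle the key batch across GPUs before the momentum encoder ...
    keys, idx_restore = self._batch_shuffle([x_key])
    keys = self.momentum_encoder(keys[0])  # assumed key/momentum encoder
    # ... then restore the original sample order before computing the loss.
    keys = self._batch_unshuffle(keys, idx_restore)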
Example #6
    def train_epoch(self,
                    train_loader,
                    model,
                    optimizer,
                    train_meter,
                    cur_epoch,
                    cfg,
                    writer=None):
        """
        Perform the video training for one epoch.
        Args:
            train_loader (loader): video training loader.
            model (model): the video model to train.
            optimizer (optim): the optimizer to perform optimization on the model's
                parameters.
            train_meter (TrainMeter): training meters to log the training performance.
            cur_epoch (int): current epoch of training.
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
            writer (TensorboardWriter, optional): TensorboardWriter object
                to write Tensorboard logs.
        """
        # Enable train mode.
        model.train()
        train_meter.iter_tic()
        data_size = len(train_loader)
        start = time.time()
        btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        rankE = os.environ.get("RANK", None)
        worldE = os.environ.get("WORLD_SIZE", None)
        dSize = data_size * btch
        self.logger.info(
            "Train Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
            .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                    du.get_rank(), rankE, du.get_world_size(), worldE))
        tot = 0
        first = True
        predsAll = []
        labelsAll = []

        for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
            # Transfer the data to the current GPU device.
            tot += len(labels)
            if isinstance(inputs, (list, )):
                if first:
                    self.logger.info(
                        "rank {} LEN {}  {} shape Slow {} Fast {} {} tot {}".
                        format(du.get_rank(), len(labels), len(inputs),
                               inputs[0].shape, inputs[1].shape,
                               labels[0].shape, tot))
                    first = False
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                if first:
                    self.logger.info(
                        "rank {} LEN {} shape {} {} tot {}".format(
                            du.get_rank(), len(labels), inputs.shape,
                            labels[0].shape, tot))
                    first = False
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()

            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            # Update the learning rate.
            lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                    cfg)
            optim.set_lr(optimizer, lr)
            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

            else:
                # Perform the forward pass.
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

            # Check for NaN loss.
            misc.check_nan_losses(loss)

            # Perform the backward pass.
            optimizer.zero_grad()
            loss.backward()
            # Update the parameters.
            optimizer.step()

            if cfg.DETECTION.ENABLE:
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce([loss])[0]
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(None, None, None, loss, lr)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )
                ite = data_size * cur_epoch + cur_iter
                if du.is_master_proc():
                    self.logger.log_row(name='TrainLoss',
                                        iter=ite,
                                        loss=loss,
                                        description="train loss")
                    self.logger.log_row(name='TrainLr',
                                        iter=ite,
                                        lr=lr,
                                        description="train learn rate")

            else:
                top1_err, top5_err = None, None
                if cfg.DATA.MULTI_LABEL:
                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        [loss] = du.all_reduce([loss])
                    loss = loss.item()
                else:
                    # Binary classifier - save preds / labels for metrics
                    if cfg.MODEL.NUM_CLASSES == 2:
                        predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                        labelsAll.extend(labels.detach().cpu().numpy())
                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))
                    top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                          for x in num_topks_correct]

                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        loss, top1_err, top5_err = du.all_reduce(
                            [loss, top1_err, top5_err])

                    # Copy the stats from GPU to CPU (sync point).
                    loss, top1_err, top5_err = (
                        loss.item(),
                        top1_err.item(),
                        top5_err.item(),
                    )

                train_meter.iter_toc()
                # Update and log stats.
                # self.logger.info("UPDATING stat {} {} {}".format(inputs[0].size(0), cfg.NUM_GPUS, inputs[0].size(0) * cfg.NUM_GPUS))
                train_meter.update_stats(top1_err, top5_err, loss, lr,
                                         inputs[0].size(0) * cfg.NUM_GPUS)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr,
                            "Train/Top1_err": top1_err,
                            "Train/Top5_err": top5_err,
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )

            stats = train_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                               labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'TrainIter')
            train_meter.iter_tic()

        if du.is_master_proc() and cfg.LOG_MODEL_INFO:
            misc.log_model_info(model, cfg, use_train_input=True)
        # Log epoch stats.
        gathered = du.all_gather([
            torch.tensor(predsAll).to(torch.device("cuda")),
            torch.tensor(labelsAll).to(torch.device("cuda"))
        ])
        stats = train_meter.log_epoch_stats(cur_epoch,
                                            gathered[0].detach().cpu().numpy(),
                                            gathered[1].detach().cpu().numpy())
        ite = (cur_epoch + 1) * dSize
        self.plotStats(stats, ite, 'TrainEpoch')
        train_meter.reset()
        end = time.time()
        el = end - start
        totAll = du.all_reduce([torch.tensor(tot).cuda()], average=False)
        tSum = totAll[0].item()
        elT = torch.tensor(el).cuda()
        elMax = du.all_reduce([elT], op=dist.ReduceOp.MAX,
                              average=False)[0].item()
        jobRate = tSum / elMax
        self.logger.info(
            "totSampCnt {} workerSampCnt {}  eTimeMax {} eTimeWorker {}  SampPerSecJob {:.1f} SampPerSecWorker {:.1f}"
            .format(tSum, tot, elMax, el, jobRate, tot / el))
        return jobRate
Example #7
    def eval_epoch(self,
                   val_loader,
                   model,
                   val_meter,
                   cur_epoch,
                   cfg,
                   writer=None):
        """
        Evaluate the model on the val set.
        Args:
            val_loader (loader): data loader to provide validation data.
            model (model): model to evaluate the performance.
            val_meter (ValMeter): meter instance to record and calculate the metrics.
            cur_epoch (int): number of the current epoch of training.
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
            writer (TensorboardWriter, optional): TensorboardWriter object
                to write Tensorboard logs.
        """

        # Evaluation mode enabled. The running stats would not be updated.
        model.eval()
        data_size = len(val_loader)
        btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        rankE = os.environ.get("RANK", None)
        worldE = os.environ.get("WORLD_SIZE", None)
        dSize = data_size * btch
        self.logger.info(
            "Val Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
            .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                    du.get_rank(), rankE, du.get_world_size(), worldE))

        val_meter.iter_tic()
        predsAll = []
        labelsAll = []
        data_size = len(val_loader)

        for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                          dim=0)
                    metadata = torch.cat(du.all_gather_unaligned(metadata),
                                         dim=0)

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                       metadata.cpu())

            else:
                preds = model(inputs)

                if cfg.DATA.MULTI_LABEL:
                    if cfg.NUM_GPUS > 1:
                        preds, labels = du.all_gather([preds, labels])
                else:
                    if cfg.MODEL.NUM_CLASSES == 2:
                        predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                        labelsAll.extend(labels.detach().cpu().numpy())

                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))

                    # Combine the errors across the GPUs.
                    top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                          for x in num_topks_correct]
                    if cfg.NUM_GPUS > 1:
                        top1_err, top5_err = du.all_reduce(
                            [top1_err, top5_err])

                    # Copy the errors from GPU to CPU (sync point).
                    top1_err, top5_err = top1_err.item(), top5_err.item()

                    val_meter.iter_toc()
                    # Update and log stats.
                    val_meter.update_stats(top1_err, top5_err,
                                           inputs[0].size(0) * cfg.NUM_GPUS)
                    # write to tensorboard format if available.
                    if writer is not None:
                        writer.add_scalars(
                            {
                                "Val/Top1_err": top1_err,
                                "Val/Top5_err": top5_err
                            },
                            global_step=len(val_loader) * cur_epoch + cur_iter,
                        )

                    if du.is_master_proc():
                        ite = len(val_loader) * cur_epoch + cur_iter
                        self.logger.log_row(name='ValTop1',
                                            iter=ite,
                                            lr=top1_err,
                                            description="Top 1 Err")
                        self.logger.log_row(name='ValTop5',
                                            iter=ite,
                                            lr=top5_err,
                                            description="Top 5 Err")

                val_meter.update_predictions(preds, labels)

            stats = val_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                             labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'ValIter')

            val_meter.iter_tic()

        # Log epoch stats.
        gathered = du.all_gather([
            torch.tensor(predsAll).to(torch.device("cuda")),
            torch.tensor(labelsAll).to(torch.device("cuda"))
        ])
        stats = val_meter.log_epoch_stats(cur_epoch,
                                          gathered[0].detach().cpu().numpy(),
                                          gathered[1].detach().cpu().numpy())
        ite = (cur_epoch + 1) * dSize
        self.plotStats(stats, ite, 'ValEpoch')

        # write to tensorboard format if available.
        if writer is not None:
            if cfg.DETECTION.ENABLE:
                writer.add_scalars({"Val/mAP": val_meter.full_map},
                                   global_step=cur_epoch)
            all_preds_cpu = [
                pred.clone().detach().cpu() for pred in val_meter.all_preds
            ]
            all_labels_cpu = [
                label.clone().detach().cpu() for label in val_meter.all_labels
            ]
            # plotScatter(all_preds_cpu, all_labels_cpu, "Epoch_{}".format(cur_epoch))
            # writer.plot_eval(
            #     preds=all_preds_cpu, labels=all_labels_cpu, global_step=cur_epoch
            # )
        val_meter.reset()
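
A hypothetical outer loop driving the two epoch routines above; trainer, start_epoch, and cfg.SOLVER.MAX_EPOCH are assumed names used only for illustration.

for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
    # Train for one epoch; train_epoch returns the job-wide samples/sec rate.
    samples_per_sec = trainer.train_epoch(
        train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer
    )
    # Evaluate on the validation set with the matching meter.
    trainer.eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)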