def setup_logging(output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for
    the master process, and suppress logging for the non-master processes.
    """
    # Set up the logging format.
    _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s"

    if du.is_master_proc():
        # Enable logging for the master process.
        logging.root.handlers = []
    else:
        # Suppress logging for non-master processes.
        _suppress_print()

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.propagate = False
    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    if du.is_master_proc():
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(plain_formatter)
        logger.addHandler(ch)

    if output_dir is not None and du.is_master_proc(du.get_world_size()):
        filename = os.path.join(output_dir, "stdout.log")
        fh = logging.StreamHandler(_cached_log_stream(filename))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
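
# Hedged, self-contained sketch of the master-only pattern above (the real
# function relies on the project's `du` helpers and `_cached_log_stream`;
# `RANK` here is just the usual torch.distributed environment variable, and
# all names below are local to this sketch):
def _demo_master_only_logging():
    import logging
    import os
    import sys

    is_master = int(os.environ.get("RANK", 0)) == 0
    logger = logging.getLogger("demo")
    logger.setLevel(logging.INFO)
    logger.propagate = False
    if is_master:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setFormatter(
            logging.Formatter("[%(asctime)s][%(levelname)s] %(message)s",
                              datefmt="%m/%d %H:%M:%S"))
        logger.addHandler(ch)
    # Non-master processes keep no handlers, so this prints only on master.
    logger.info("visible only on the master process")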
def forward(self, input):
    if get_world_size() == 1 or not self.training:
        return super().forward(input)

    assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
    # input.size(1) already returns a Python int, so no .item() call is
    # needed (or valid) here.
    C = input.shape[1]
    mean = torch.mean(input, dim=[0, 2, 3, 4])
    meansqr = torch.mean(input * input, dim=[0, 2, 3, 4])

    # All-reduce the per-channel statistics across workers, then average.
    vec = torch.cat([mean, meansqr], dim=0)
    vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

    mean, meansqr = torch.split(vec, C)
    var = meansqr - mean * mean
    self.running_mean += self.momentum * (mean.detach() - self.running_mean)
    self.running_var += self.momentum * (var.detach() - self.running_var)

    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1, 1, 1, 1)
    bias = bias.reshape(1, -1, 1, 1, 1)
    return input * scale + bias
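
# Hedged sanity check (all names local to this sketch, not part of the
# module): on a single process the all-reduce is a no-op, so the fused
# scale/bias algebra above must agree with normalizing directly by the
# batch statistics.
def _check_naive_sync_bn_math():
    import torch

    x = torch.randn(4, 3, 2, 8, 8)  # (N, C, T, H, W)
    eps = 1e-5
    weight = torch.ones(3)   # gamma
    shift = torch.zeros(3)   # beta

    mean = x.mean(dim=[0, 2, 3, 4])
    var = (x * x).mean(dim=[0, 2, 3, 4]) - mean * mean
    invstd = torch.rsqrt(var + eps)

    scale = (weight * invstd).reshape(1, -1, 1, 1, 1)
    bias = (shift - mean * weight * invstd).reshape(1, -1, 1, 1, 1)
    out = x * scale + bias

    # Direct normalization with the same statistics.
    ref = (x - mean.reshape(1, -1, 1, 1, 1)) * invstd.reshape(1, -1, 1, 1, 1)
    assert torch.allclose(out, ref, atol=1e-5)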
def _simclr_precompute_pos_neg_mask_multi(self):
    # Computed once at the beginning of training.
    distributed = self.cfg.CONTRASTIVE.SIMCLR_DIST_ON
    if distributed:
        total_images = self.cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        world_size = du.get_world_size()
        rank = du.get_rank()
    else:
        total_images = self.cfg.TRAIN.BATCH_SIZE
        world_size = du.get_local_size()
        rank = du.get_local_rank()
    local_orig_images = total_images // world_size
    local_crops = local_orig_images * self.num_crops

    pos_temps = []
    for d in np.arange(self.num_crops):
        pos_temp, neg_temp = [], []
        for i in range(world_size):
            if i == rank:
                pos = np.eye(local_crops, k=d * local_orig_images) + np.eye(
                    local_crops, k=-local_crops + d * local_orig_images)
                neg = np.ones((local_crops, local_crops))
            else:
                pos = np.zeros((local_crops, local_crops))
                neg = np.zeros((local_crops, local_crops))
            pos_temp.append(pos)
            neg_temp.append(neg)
        pos_temps.append(np.hstack(pos_temp))
        neg_temp = np.hstack(neg_temp)

    pos_mask = []
    for i in range(self.num_crops - 1):
        pos_mask.append(torch.from_numpy(pos_temps[1 + i]))
    neg_mask = torch.from_numpy(neg_temp - sum(pos_temps))

    if self.num_gpus:
        for i in range(len(pos_mask)):
            pos_mask[i] = pos_mask[i].cuda(non_blocking=True)
        neg_mask = neg_mask.cuda(non_blocking=True)
    self.pos_mask, self.neg_mask = pos_mask, neg_mask
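
# Hedged standalone illustration of the mask construction above, for
# world_size == 1, rank == 0, num_crops == 2 and four images per process
# (all names local to this sketch): pos marks the other crop of the same
# image, neg marks everything except the positives and the diagonal.
def _demo_simclr_masks():
    import numpy as np

    local_orig_images, num_crops = 4, 2
    local_crops = local_orig_images * num_crops  # 8
    pos_temps = [
        np.eye(local_crops, k=d * local_orig_images)
        + np.eye(local_crops, k=-local_crops + d * local_orig_images)
        for d in range(num_crops)
    ]  # pos_temps[0] is the identity (d == 0)
    neg = np.ones((local_crops, local_crops))
    pos_mask = pos_temps[1]          # positives: the matching other crop
    neg_mask = neg - sum(pos_temps)  # negatives: all remaining pairs
    print(pos_mask.shape, neg_mask.shape)  # (8, 8) (8, 8)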
def distributed_sinkhorn(self, Q, nmb_iters):
    with torch.no_grad():
        sum_Q = torch.sum(Q)
        du.all_reduce([sum_Q], average=False)
        Q /= sum_Q

        u = torch.zeros(Q.shape[0]).cuda(non_blocking=True)
        r = torch.ones(Q.shape[0]).cuda(non_blocking=True) / Q.shape[0]
        c = torch.ones(Q.shape[1]).cuda(non_blocking=True) / (
            du.get_world_size() * Q.shape[1])

        curr_sum = torch.sum(Q, dim=1)
        du.all_reduce([curr_sum], average=False)

        for _ in range(nmb_iters):
            u = curr_sum
            Q *= (r / u).unsqueeze(1)
            Q *= (c / torch.sum(Q, dim=0)).unsqueeze(0)
            curr_sum = torch.sum(Q, dim=1)
            du.all_reduce([curr_sum], average=False)
        return (Q / torch.sum(Q, dim=0, keepdim=True)).t().float()
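
# Hedged single-process sketch of the same Sinkhorn-Knopp normalization with
# the all-reduce calls removed (names local to this sketch): after a few
# iterations the row sums of Q approach r and the column sums approach c.
def _sinkhorn_sketch(Q, nmb_iters=3):
    import torch

    Q = Q / Q.sum()
    K, B = Q.shape
    r = torch.ones(K) / K  # target prototype (row) marginal
    c = torch.ones(B) / B  # target sample (column) marginal
    for _ in range(nmb_iters):
        Q *= (r / Q.sum(dim=1)).unsqueeze(1)  # rescale rows toward r
        Q *= (c / Q.sum(dim=0)).unsqueeze(0)  # rescale columns toward c
    # As above, return one assignment distribution per sample.
    return (Q / Q.sum(dim=0, keepdim=True)).t()

# e.g. _sinkhorn_sketch(torch.rand(10, 4)) yields a (4, 10) matrix whose rows
# each sum to 1.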
def log_toc(self, key, **kwargs):
    if len(self.tocs[key]) + 1 > len(self.tics[key]):
        warnings.warn(f"Trying to log a toc to {key} without a matching tic")
        warnings.warn(f"Last Toc at {self._last_toc(key)}.")
        warnings.warn(f"Last Tic at {self._last_tic(key)}.")
        warnings.warn(f"New Toc at {time.time()}.")
    self.tocs[key].append(time.time())
    self._log_iter(key)
    if "shape" in kwargs:
        if not isinstance(kwargs["shape"], torch.Size):
            raise NotImplementedError("shape must be a torch.Size")
        if len(kwargs["shape"]) != 5:
            raise NotImplementedError(
                "only 5-dim (N, C, T, H, W) shapes are supported")
        # Record the global number of instances for this interval.
        self.num_instances[key].append(kwargs["shape"][0] *
                                       du.get_world_size())
    else:
        self.num_instances[key].append(None)
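
# Hedged usage sketch: log_toc pairs with a companion log_tic (assumed here,
# not shown) that appends start times to self.tics[key]; the 5-dim shape is
# expected to be (N, C, T, H, W) so num_instances records the global batch
# size N * world_size. Typical call pattern:
#
#   timer.log_tic("forward")
#   out = model(clips)                           # clips: (N, C, T, H, W)
#   timer.log_toc("forward", shape=clips.shape)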
def train_epoch(self, train_loader, model, optimizer, train_meter, cur_epoch,
                cfg, writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the
            model's parameters.
        train_meter (TrainMeter): training meters to log the training
            performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object to
            write Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    start = time.time()

    btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
    rankE = os.environ.get("RANK", None)
    worldE = os.environ.get("WORLD_SIZE", None)
    dSize = data_size * btch
    self.logger.info(
        "Train Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} "
        "world {} {}".format(cur_epoch, data_size, btch, dSize,
                             du.get_local_rank(), du.get_rank(), rankE,
                             du.get_world_size(), worldE))

    tot = 0
    first = True
    predsAll = []
    labelsAll = []
    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        tot += len(labels)
        if isinstance(inputs, (list,)):
            if first:
                self.logger.info(
                    "rank {} LEN {} {} shape Slow {} Fast {} {} tot {}".format(
                        du.get_rank(), len(labels), len(inputs),
                        inputs[0].shape, inputs[1].shape, labels[0].shape,
                        tot))
                first = False
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            if first:
                self.logger.info("rank {} LEN {} shape {} {} tot {}".format(
                    du.get_rank(), len(labels), inputs.shape,
                    labels[0].shape, tot))
                first = False
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
        else:
            # Perform the forward pass.
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # Check for NaN losses.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # Write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr},
                    global_step=data_size * cur_epoch + cur_iter,
                )
            ite = data_size * cur_epoch + cur_iter
            if du.is_master_proc():
                self.logger.log_row(name='TrainLoss', iter=ite, loss=loss,
                                    description="train loss")
                self.logger.log_row(name='TrainLr', iter=ite, lr=lr,
                                    description="train learn rate")
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Binary classifier - save preds / labels for metrics.
                if cfg.MODEL.NUM_CLASSES == 2:
                    predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                    labelsAll.extend(labels.detach().cpu().numpy())
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            # self.logger.info("UPDATING stat {} {} {}".format(
            #     inputs[0].size(0), cfg.NUM_GPUS,
            #     inputs[0].size(0) * cfg.NUM_GPUS))
            train_meter.update_stats(top1_err, top5_err, loss, lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS)
            # Write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )
            stats = train_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                               labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'TrainIter')
        train_meter.iter_tic()

    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Log epoch stats.
    gathered = du.all_gather([
        torch.tensor(predsAll).to(torch.device("cuda")),
        torch.tensor(labelsAll).to(torch.device("cuda"))
    ])
    stats = train_meter.log_epoch_stats(cur_epoch,
                                        gathered[0].detach().cpu().numpy(),
                                        gathered[1].detach().cpu().numpy())
    ite = (cur_epoch + 1) * dSize
    self.plotStats(stats, ite, 'TrainEpoch')
    train_meter.reset()

    end = time.time()
    el = end - start
    totAll = du.all_reduce([torch.tensor(tot).cuda()], average=False)
    tSum = totAll[0].item()
    elT = torch.tensor(el).cuda()
    elMax = du.all_reduce([elT], op=dist.ReduceOp.MAX,
                          average=False)[0].item()
    jobRate = tSum / elMax
    self.logger.info(
        "totSampCnt {} workerSampCnt {} eTimeMax {} eTimeWorker {} "
        "SampPerSecJob {:.1f} SampPerSecWorker {:.1f}".format(
            tSum, tot, elMax, el, jobRate, tot / el))
    return jobRate
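
# Hedged stand-in for metrics.topks_correct as used above (assumed behavior:
# for each k it returns the number of samples whose true label appears in the
# top-k predictions, so (1.0 - correct / N) * 100.0 is the top-k error):
def _topks_correct_sketch(preds, labels, ks):
    import torch

    _, top = preds.topk(max(ks), dim=1, largest=True, sorted=True)
    hits = top.eq(labels.view(-1, 1).expand_as(top))
    return [hits[:, :k].any(dim=1).float().sum() for k in ks]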
def eval_epoch(self, val_loader, model, val_meter, cur_epoch, cfg,
               writer=None):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the
            metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object to
            write Tensorboard log.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    data_size = len(val_loader)
    btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
    rankE = os.environ.get("RANK", None)
    worldE = os.environ.get("WORLD_SIZE", None)
    dSize = data_size * btch
    self.logger.info(
        "Val Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} "
        "world {} {}".format(cur_epoch, data_size, btch, dSize,
                             du.get_local_rank(), du.get_rank(), rankE,
                             du.get_world_size(), worldE))
    val_meter.iter_tic()
    predsAll = []
    labelsAll = []
    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
            preds = preds.cpu()
            ori_boxes = meta["ori_boxes"].cpu()
            metadata = meta["metadata"].cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                   metadata.cpu())
        else:
            preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Binary classifier - save preds / labels for metrics.
                if cfg.MODEL.NUM_CLASSES == 2:
                    predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                    labelsAll.extend(labels.detach().cpu().numpy())
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(top1_err, top5_err,
                                       inputs[0].size(0) * cfg.NUM_GPUS)
                # Write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {"Val/Top1_err": top1_err, "Val/Top5_err": top5_err},
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )
                if du.is_master_proc():
                    ite = len(val_loader) * cur_epoch + cur_iter
                    self.logger.log_row(name='ValTop1', iter=ite, lr=top1_err,
                                        description="Top 1 Err")
                    self.logger.log_row(name='ValTop5', iter=ite, lr=top5_err,
                                        description="Top 5 Err")

            val_meter.update_predictions(preds, labels)

        stats = val_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                         labelsAll)
        ite = dSize * cur_epoch + btch * (cur_iter + 1)
        self.plotStats(stats, ite, 'ValIter')
        val_meter.iter_tic()

    # Log epoch stats.
    gathered = du.all_gather([
        torch.tensor(predsAll).to(torch.device("cuda")),
        torch.tensor(labelsAll).to(torch.device("cuda"))
    ])
    stats = val_meter.log_epoch_stats(cur_epoch,
                                      gathered[0].detach().cpu().numpy(),
                                      gathered[1].detach().cpu().numpy())
    ite = (cur_epoch + 1) * dSize
    self.plotStats(stats, ite, 'ValEpoch')

    # Write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars({"Val/mAP": val_meter.full_map},
                               global_step=cur_epoch)
        all_preds_cpu = [
            pred.clone().detach().cpu() for pred in val_meter.all_preds
        ]
        all_labels_cpu = [
            label.clone().detach().cpu() for label in val_meter.all_labels
        ]
        # plotScatter(all_preds_cpu, all_labels_cpu,
        #             "Epoch_{}".format(cur_epoch))
        # writer.plot_eval(
        #     preds=all_preds_cpu, labels=all_labels_cpu,
        #     global_step=cur_epoch
        # )

    val_meter.reset()
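
# Hedged sketch: with cfg.MODEL.NUM_CLASSES == 2, predsAll holds the
# positive-class scores and labelsAll the binary targets, so the gathered
# epoch-level arrays support threshold-free metrics. sklearn is an assumption
# of this sketch, not an import of the module above.
def _epoch_auc_sketch(preds_np, labels_np):
    from sklearn.metrics import roc_auc_score

    return roc_auc_score(labels_np, preds_np)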
def __init__(self, cfg):
    super(ContrastiveModel, self).__init__()
    # Construct the model.
    self.backbone = _MODEL_TYPES[cfg.MODEL.ARCH](cfg)
    self.type = cfg.CONTRASTIVE.TYPE
    self.T = cfg.CONTRASTIVE.T
    self.dim = cfg.CONTRASTIVE.DIM
    self.length = cfg.CONTRASTIVE.LENGTH
    self.k = cfg.CONTRASTIVE.QUEUE_LEN
    self.mmt = cfg.CONTRASTIVE.MOMENTUM
    self.momentum_annealing = cfg.CONTRASTIVE.MOMENTUM_ANNEALING
    self.duration = 1
    self.cfg = cfg
    self.num_gpus = cfg.NUM_GPUS
    self.l2_norm = Normalize()
    self.knn_num_imgs = 0
    self.knn_on = cfg.CONTRASTIVE.KNN_ON
    self.train_labels = np.zeros((0,), dtype=np.int32)
    self.num_pos = 2
    self.num_crops = (self.cfg.DATA.TRAIN_CROP_NUM_TEMPORAL *
                      self.cfg.DATA.TRAIN_CROP_NUM_SPATIAL)
    self.nce_loss_fun = losses.get_loss_func("contrastive_loss")(
        reduction="mean")
    assert self.cfg.MODEL.LOSS_FUNC == "contrastive_loss"
    self.softmax = nn.Softmax(dim=1).cuda()

    if self.type == "mem":
        self.mem_type = cfg.CONTRASTIVE.MEM_TYPE
        if self.mem_type == "1d":
            self.memory = Memory1D(self.length, self.duration, self.dim, cfg)
        else:
            self.memory = Memory(self.length, self.duration, self.dim, cfg)
        self.examplar_type = "video"
        self.interp = cfg.CONTRASTIVE.INTERP_MEMORY
    elif self.type == "self":
        pass
    elif self.type in ("moco", "byol"):
        # MoCo components.
        self.backbone_hist = _MODEL_TYPES[cfg.MODEL.ARCH](cfg)
        for p in self.backbone_hist.parameters():
            p.requires_grad = False
        self.register_buffer("ptr", torch.tensor([0]))
        self.ptr.requires_grad = False
        stdv = 1.0 / math.sqrt(self.dim / 3)
        self.register_buffer(
            "queue_x",
            torch.rand(self.k, self.dim).mul_(2 * stdv).add_(-stdv),
        )
        self.register_buffer("iter", torch.zeros([1], dtype=torch.long))
        self._batch_shuffle_on = (
            False if ("sync" in cfg.BN.NORM_TYPE
                      and cfg.BN.NUM_SYNC_DEVICES == cfg.NUM_GPUS)
            or self.type == "byol" else True)
    elif self.type == "swav":
        self.swav_use_public_code = True
        if self.swav_use_public_code:
            self.swav_prototypes = nn.Linear(
                self.dim, 1000, bias=False)  # for orig implementation
        else:
            self.swav_prototypes = nn.Parameter(
                torch.randn((self.dim, 1000), dtype=torch.float))
        self.swav_eps_sinkhorn = 0.05
        self.swav_use_the_queue = False
        # Optionally starts a queue.
        if self.cfg.CONTRASTIVE.SWAV_QEUE_LEN > 0:
            self.register_buffer(
                "queue_swav",
                torch.zeros(
                    2,  # = args.crops_for_assign
                    self.cfg.CONTRASTIVE.SWAV_QEUE_LEN // du.get_world_size(),
                    self.dim,
                ),
            )
    elif self.type == "simclr":
        self._simclr_precompute_pos_neg_mask_multi()
        self.simclr_dist_on = cfg.CONTRASTIVE.SIMCLR_DIST_ON

    # self.knn_mem = Memory1D(self.length, 1, self.dim, cfg)  # does not work
    if self.knn_on:
        self.knn_mem = Memory(self.length, 1, self.dim, cfg)
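
# Hedged sketch of how the "ptr" and "queue_x" buffers registered above are
# typically used in a MoCo-style dequeue/enqueue (this mirrors the common
# MoCo recipe; the repository's own update method may differ in detail):
@torch.no_grad()
def _moco_enqueue_sketch(queue_x, ptr, keys):
    # keys: (batch, dim) momentum-encoder features; queue_x: (K, dim).
    k = keys.shape[0]
    idx = (int(ptr[0]) +
           torch.arange(k, device=queue_x.device)) % queue_x.shape[0]
    queue_x[idx] = keys                            # overwrite oldest entries
    ptr[0] = (int(ptr[0]) + k) % queue_x.shape[0]  # advance ring pointer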