def benchmark_data(cfg: AttrDict, split: str = "train"):
    """
    Benchmark dataloading speed for one dataset split.

    Builds the dataset/dataloader from ``cfg``, runs a warmup phase of
    WARMUP_ITERS batches, then times MAX_ITERS batches and logs images/sec
    and ms/img. BENCHMARK_ROUNDS additional timed passes are run to expose
    run-to-run fluctuation.

    Args:
        cfg (AttrDict): experiment config providing DATA/<split> settings,
            SEED_VALUE, MULTI_PROCESSING_METHOD, etc.
        split (str): dataset split name, e.g. "train" or "test".
    """
    split = split.upper()
    total_images = MAX_ITERS * cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    timer = Timer()
    dataset = build_dataset(cfg, split)

    try:
        device = torch.device("cuda" if cfg.MACHINE.DEVICE == "gpu" else "cpu")
    except AttributeError:
        # Older configs may not define MACHINE.DEVICE; default to GPU.
        device = torch.device("cuda")

    # Gives sampler same seed for entire distributed group as per pytorch
    # documentation.
    sampler_seed = cfg.SEED_VALUE
    dataloader = get_loader(
        dataset=dataset,
        dataset_config=cfg["DATA"][split],
        num_dataloader_workers=cfg.DATA.NUM_DATALOADER_WORKERS,
        pin_memory=False,
        multi_processing_method=cfg.MULTI_PROCESSING_METHOD,
        device=device,
        sampler_seed=sampler_seed,
    )

    # Fairstore data sampler would require setting the start iter before it
    # can start.
    if hasattr(dataloader.sampler, "set_start_iter"):
        dataloader.sampler.set_start_iter(0)

    # Initial warmup measured as warmup time.
    timer.reset()
    data_iterator = iter(dataloader)
    # BUG FIX: the warmup loop previously ran a hard-coded 10 iterations and
    # recorded only the first batch's latency, while the log line claimed to
    # cover WARMUP_ITERS batches. Loop WARMUP_ITERS times and time the whole
    # phase so the logged message is accurate.
    for _ in range(WARMUP_ITERS):  # warmup
        next(data_iterator)
    # The total number of seconds since the start/reset of the timer.
    warmup_time = timer.seconds()
    logging.info(f"Warmup time {WARMUP_ITERS} batches: {warmup_time} seconds")

    # Measure the number of images per sec over MAX_ITERS iterations.
    timer = Timer()
    for _ in tqdm.trange(MAX_ITERS):
        next(data_iterator)
    time_elapsed = timer.seconds()
    logging.info(
        f"iters: {MAX_ITERS}; images: {total_images}; time: {time_elapsed} seconds; "
        f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
        f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")

    # Run benchmark for a few more rounds to catch fluctuations.
    for round_idx in range(BENCHMARK_ROUNDS):
        timer = Timer()
        for _ in tqdm.trange(MAX_ITERS):
            next(data_iterator)
        time_elapsed = timer.seconds()
        logging.info(
            f"round: {round_idx}: iters: {MAX_ITERS}; images: {total_images}; "
            f"time: {time_elapsed} seconds; "
            f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
            f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")

    del data_iterator
    del dataloader
def benchmark_data(args):
    """
    Benchmark the detectron2 training dataloader: loader construction time,
    first-batch startup time, steady-state throughput, and RAM usage, plus
    several repeat rounds to observe variance.
    """
    cfg = setup(args)

    load_timer = Timer()
    data_loader = build_detection_train_loader(cfg)
    logger.info("Initialize loader using {} seconds.".format(load_timer.seconds()))

    load_timer.reset()
    batch_iter = iter(data_loader)
    startup_time = None
    for warmup_idx in range(10):  # warmup
        next(batch_iter)
        if warmup_idx == 0:
            # Time to produce the very first batch (worker startup included).
            startup_time = load_timer.seconds()

    def _timed_round():
        # Pull 1000 batches and log throughput for this round.
        round_timer = Timer()
        max_iter = 1000
        for _ in tqdm.trange(max_iter):
            next(batch_iter)
        logger.info("{} iters ({} images) in {} seconds.".format(
            max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, round_timer.seconds()))

    _timed_round()
    logger.info("Startup time: {} seconds".format(startup_time))
    vram = psutil.virtual_memory()
    logger.info("RAM Usage: {:.2f}/{:.2f} GB".format(
        (vram.total - vram.available) / 1024**3, vram.total / 1024**3))

    # test for a few more rounds
    for _ in range(10):
        _timed_round()
def benchmark_data(args):
    """
    Benchmark the detectron2 training dataloader while logging a RAM summary
    (via RAM_msg) after spawning and before each repeat round.
    """
    cfg = setup(args)
    logger.info("After spawning " + RAM_msg())

    load_timer = Timer()
    data_loader = build_detection_train_loader(cfg)
    logger.info("Initialize loader using {} seconds.".format(load_timer.seconds()))

    load_timer.reset()
    batch_iter = iter(data_loader)
    startup_time = None
    for warmup_idx in range(10):  # warmup
        next(batch_iter)
        if warmup_idx == 0:
            # Time to produce the very first batch (worker startup included).
            startup_time = load_timer.seconds()
    logger.info("Startup time: {} seconds".format(startup_time))

    def _timed_round():
        # Pull 1000 batches and log throughput for this round.
        round_timer = Timer()
        max_iter = 1000
        for _ in tqdm.trange(max_iter):
            next(batch_iter)
        logger.info("{} iters ({} images) in {} seconds.".format(
            max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, round_timer.seconds()))

    _timed_round()

    # test for a few more rounds
    for k in range(10):
        logger.info(f"Iteration {k} " + RAM_msg())
        _timed_round()
class TestMeter(object):
    """
    Collects per-image PSNR/SSIM results during testing, writes them to CSV,
    optionally saves output images, and tracks average forward time.
    """

    def __init__(self, cfg):
        """
        Args:
            cfg: test config providing TEST.OUTPUT_DIR, TEST.DATASET and
                TEST.SAVE_IMG.
        """
        self.cfg = cfg
        self.forward_timer = Timer()
        self.total_time = 0
        self.cnt = 0
        # score[vid][image_id] = (psnr, ssim)
        self.score = dict()
        self.output_dir = Join(cfg.TEST.OUTPUT_DIR, cfg.TEST.DATASET)
        self.save_img = cfg.TEST.SAVE_IMG
        # exist_ok=True avoids the check-then-create race of the previous
        # os.path.exists() guard.
        os.makedirs(self.output_dir, exist_ok=True)
        self.score_csv = open(Join(self.output_dir, "score.csv"), 'w')
        self.score_csv.write("vid, image_id, psnr, ssim\n")

    def forward_tic(self):
        """
        Start to record time.
        """
        self.forward_timer.reset()

    def forward_toc(self):
        """
        Stop to record time.
        """
        self.forward_timer.pause()
        self.total_time += self.forward_timer.seconds()
        self.cnt += 1

    def log_img_result(self, img_out, vid, img_id, psnr, ssim):
        """
        Record one image's PSNR/SSIM in memory and in the CSV, and optionally
        save the image (converted RGB -> BGR for cv2.imwrite).
        """
        if vid not in self.score:  # membership test directly on the dict
            self.score[vid] = {}
        # log score
        self.score[vid][img_id] = (psnr, ssim)
        self.score_csv.write("{},{},{},{}\n".format(vid, img_id, psnr, ssim))
        # save img
        if self.save_img:
            img_out = cv2.cvtColor(img_out, cv2.COLOR_RGB2BGR)
            cv2.imwrite(Join(self.output_dir, img_id), img_out)

    def log_average_score(self):
        """
        Compute per-video mean PSNR/SSIM, write them to videos_scores.csv.

        Returns:
            dict: {vid: (mean_psnr, mean_ssim)}.
        """
        score_per_vid = {}
        for vid in self.score.keys():
            psnrs = [x[0] for x in self.score[vid].values()]
            ssims = [x[1] for x in self.score[vid].values()]
            score_per_vid[vid] = (np.mean(psnrs), np.mean(ssims))
        with open(Join(self.output_dir, 'videos_scores.csv'), 'w') as f:
            f.write('video_id, psnr, ssim\n')
            for vid in self.score.keys():
                f.write("{},{},{}\n".format(vid, score_per_vid[vid][0],
                                            score_per_vid[vid][1]))
        return score_per_vid

    def close(self):
        """
        Flush and close the per-image score CSV. New method: the handle was
        previously left open for the lifetime of the process.
        """
        if not self.score_csv.closed:
            self.score_csv.close()

    def speed(self):
        """
        Returns:
            tuple: (total forward seconds, average seconds per forward call).
        """
        return self.total_time, self.total_time / self.cnt
def test_timer(self):
    # A freshly started timer accumulates wall-clock time.
    stopwatch = Timer()
    time.sleep(0.5)
    self.assertTrue(0.99 > stopwatch.seconds() >= 0.5)

    # While paused, the elapsed time must not grow.
    stopwatch.pause()
    time.sleep(0.5)
    self.assertTrue(0.99 > stopwatch.seconds() >= 0.5)

    # After resume, time accumulates on top of the earlier ~0.5s.
    stopwatch.resume()
    time.sleep(0.5)
    self.assertTrue(1.49 > stopwatch.seconds() >= 1.0)

    # Reset drops the accumulated time back to ~0.
    stopwatch.reset()
    self.assertTrue(0.49 > stopwatch.seconds() >= 0)
def test_timer(self) -> None:
    """
    Exercise the Timer lifecycle: it accumulates time while running,
    freezes while paused, continues after resume, and starts over after
    reset.
    """
    clock = Timer()
    time.sleep(0.5)
    self.assertTrue(0.99 > clock.seconds() >= 0.5)

    clock.pause()
    time.sleep(0.5)
    # Paused: elapsed time must be unchanged.
    self.assertTrue(0.99 > clock.seconds() >= 0.5)

    clock.resume()
    time.sleep(0.5)
    # Resumed: total elapsed is the two running intervals combined.
    self.assertTrue(1.49 > clock.seconds() >= 1.0)

    clock.reset()
    self.assertTrue(0.49 > clock.seconds() >= 0)
def test_model(epoch):
    """ Evaluate the model on the test set """
    # NOTE(review): relies on module-level globals (model, criterion, device,
    # test_dataloader, cfg, logger, writer) rather than parameters — confirm
    # they are defined in the enclosing script.
    model.eval()
    test_metrics = {"loss": [], "acc": []}
    timer = Timer()
    for batch_i, (X, y) in enumerate(test_dataloader):
        batch_i += 1  # switch to 1-based batch numbering for logging
        image_sequences = Variable(X.to(device), requires_grad=False)
        labels = Variable(y, requires_grad=False).to(device)
        with torch.no_grad():
            # Reset LSTM hidden state
            model.lstm.reset_hidden_state()
            # Get sequence predictions
            predictions = model(image_sequences)
        # Compute metrics
        loss = criterion(predictions, labels)
        acc = (predictions.detach().argmax(1) == labels
               ).cpu().numpy().mean()
        # Keep track of loss and accuracy
        test_metrics["loss"].append(loss.item())
        test_metrics["acc"].append(acc)
        # Determine approximate time left; the ETA assumes the last batch's
        # duration is representative of the remaining batches.
        batches_done = batch_i - 1
        batches_left = len(test_dataloader) - batches_done
        time_left = datetime.timedelta(seconds=batches_left * timer.seconds())
        time_iter = round(timer.seconds(), 3)
        timer.reset()  # per-batch timing: restart for the next batch
        # Log test performance
        logger.info(
            f'Testing - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(test_dataloader)}] [Loss: {np.mean(test_metrics["loss"]):.3f}] [Acc: {np.mean(test_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
        )
    writer.add_scalar("test/loss", np.mean(test_metrics["loss"]), epoch)
    writer.add_scalar("test/acc", np.mean(test_metrics["acc"]), epoch)
    model.train()
class EpochTimer:
    """Accumulates per-epoch wall-clock durations via tic/toc calls."""

    def __init__(self) -> None:
        self.timer = Timer()
        self.timer.reset()
        self.epoch_times = []

    def reset(self) -> None:
        """Restart the timer and discard all recorded epoch durations."""
        self.timer.reset()
        self.epoch_times = []

    def epoch_tic(self):
        """Mark the start of an epoch."""
        self.timer.reset()

    def epoch_toc(self):
        """Mark the end of an epoch and record its duration."""
        self.timer.pause()
        self.epoch_times.append(self.timer.seconds())

    def last_epoch_time(self):
        """Duration of the most recently recorded epoch."""
        assert self.epoch_times, "No epoch time has been recorded!"
        return self.epoch_times[-1]

    def avg_epoch_time(self):
        """Mean duration over all recorded epochs."""
        assert self.epoch_times, "No epoch time has been recorded!"
        return np.mean(self.epoch_times)

    def median_epoch_time(self):
        """Median duration over all recorded epochs."""
        assert self.epoch_times, "No epoch time has been recorded!"
        return np.median(self.epoch_times)
class IterationTimer(HookBase):
    """
    Hook that times each training step (between before_step and after_step)
    and, at the end of training, logs overall iteration speed plus the time
    attributed to other hooks.
    """

    def __init__(self, warmup_iter=3):
        """
        Args:
            warmup_iter (int): number of initial iterations excluded from the
                reported per-iteration statistics.
        """
        self._warmup_iter = warmup_iter
        self._step_timer = Timer()
        # Wall-clock anchor for the total training time (includes hook time).
        self._start_time = time.perf_counter()
        # Accumulates time while resumed in before_step; reset/paused during
        # warmup so early iterations are excluded.
        self._total_timer = Timer()

    def before_train(self):
        self._start_time = time.perf_counter()
        self._total_timer.reset()
        # Keep the total timer paused until the first before_step resumes it.
        self._total_timer.pause()

    def after_train(self):
        logger = logging.getLogger(__name__)
        total_time = time.perf_counter() - self._start_time
        total_time_minus_hooks = self._total_timer.seconds()
        # Time not captured by _total_timer is attributed to hooks.
        hook_time = total_time - total_time_minus_hooks

        # Iterations counted toward speed, excluding the warmup iterations.
        num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter

        if num_iter > 0 and total_time_minus_hooks > 0:
            logger.info(
                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
                    num_iter,
                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
                    total_time_minus_hooks / num_iter,
                )
            )

        logger.info(
            "Total training time: {} ({} on hooks)".format(
                str(datetime.timedelta(seconds=int(total_time))),
                str(datetime.timedelta(seconds=int(hook_time))),
            )
        )

    def before_step(self):
        self._step_timer.reset()
        self._total_timer.resume()

    def after_step(self):
        iter_done = self.trainer.iter - self.trainer.start_iter + 1
        if iter_done >= self._warmup_iter:
            # Past warmup: report this step's duration.
            sec = self._step_timer.seconds()
            self.trainer.storage.put_scalars(time=sec)
        else:
            # Still warming up: restart the anchors so warmup is excluded
            # from the overall statistics.
            self._start_time = time.perf_counter()
            self._total_timer.reset()
            self._total_timer.pause()
# learning_rate = 0.00075 # if epoch == 3: # learning_rate = 0.001 if epoch == 30: learning_rate = 0.0001 if epoch == 40: learning_rate = 0.00001 # optimizer = torch.optim.SGD(net.parameters(),lr=learning_rate*0.1,momentum=0.9,weight_decay=1e-4) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs)) print('Learning Rate for this epoch: {}'.format(learning_rate)) total_loss = 0. tt.reset() for i, (images, target) in enumerate(train_loader): # a,b=next(imgiter);images = Variable(a) ;target = Variable(b) # images = Variable(images) #torch.Size([4, 3, 448, 448]) # target = Variable(target) #torch.Size([4, 14, 14, 30]) if CHANNEL_LAST: images = images.to(memory_format=torch.channels_last) target = target.to(memory_format=torch.channels_last) if use_gpu: images, target = images.cuda(), target.cuda() pred = net(images) # torch.Size([4, 14, 14, 30]) loss = criterion(pred, target) # torch.Size([]) tloss = loss.item() # print(tloss)
class TrainMeter(object):
    """
    Measure training stats.

    Tracks windowed/aggregate loss and top-1/top-5 error, plus three timers:
    iter_timer covers a whole iteration, split into data_timer (dataloading)
    and net_timer (forward/backward) via iter_tic/data_toc/iter_toc.
    """

    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        # Total number of iterations over the whole training run.
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_total = 0.0
        self.lr = None
        # Current minibatch errors (smoothed over a window).
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Number of misclassified examples.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.output_dir = cfg.OUTPUT_DIR

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.loss_total = 0.0
        self.lr = None
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        # Data loading finished: stop the data timer and start the net timer.
        self.data_timer.pause()
        self.net_timer.reset()

    def update_stats(self, top1_err, top5_err, loss, lr, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss.add_value(loss)
        self.lr = lr
        self.loss_total += loss * mb_size
        self.num_samples += mb_size

        if not self._cfg.DATA.MULTI_LABEL:
            # Current minibatch stats
            self.mb_top1_err.add_value(top1_err)
            self.mb_top5_err.add_value(top5_err)
            # Aggregate stats
            self.num_top1_mis += top1_err * mb_size
            self.num_top5_mis += top5_err * mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        # Only log every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        # ETA = last iteration duration x remaining iterations.
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "dt": self.iter_timer.seconds(),
            "dt_data": self.data_timer.seconds(),
            "dt_net": self.net_timer.seconds(),
            "eta": eta,
            "loss": self.loss.get_win_median(),
            "lr": self.lr,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
        }
        if not self._cfg.DATA.MULTI_LABEL:
            stats["top1_err"] = self.mb_top1_err.get_win_median()
            stats["top5_err"] = self.mb_top5_err.get_win_median()
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "dt": self.iter_timer.seconds(),
            "dt_data": self.data_timer.seconds(),
            "dt_net": self.net_timer.seconds(),
            "eta": eta,
            "lr": self.lr,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
        }
        if not self._cfg.DATA.MULTI_LABEL:
            # Epoch-level errors/loss are sample-weighted averages.
            top1_err = self.num_top1_mis / self.num_samples
            top5_err = self.num_top5_mis / self.num_samples
            avg_loss = self.loss_total / self.num_samples
            stats["top1_err"] = top1_err
            stats["top5_err"] = top5_err
            stats["loss"] = avg_loss
        logging.log_json_stats(stats)
class TestMeter(object):
    """
    Perform the multi-view ensemble for testing: each video with an unique
    index will be sampled with multiple clips, and the predictions of the
    clips will be aggregated to produce the final prediction for the video.
    The accuracy is calculated with the given ground truth labels.
    """

    def __init__(
        self,
        num_videos,
        num_clips,
        num_cls,
        overall_iters,
        multi_label=False,
        ensemble_method="sum",
    ):
        """
        Construct tensors to store the predictions and labels. Expect to get
        num_clips predictions from each video, and calculate the metrics on
        num_videos videos.
        Args:
            num_videos (int): number of videos to test.
            num_clips (int): number of clips sampled from each video for
                aggregating the final prediction for the video.
            num_cls (int): number of classes for each prediction.
            overall_iters (int): overall iterations for testing.
            multi_label (bool): if True, use map as the metric.
            ensemble_method (str): method to perform the ensemble, options
                include "sum", and "max".
        """
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        self.num_clips = num_clips
        self.overall_iters = overall_iters
        self.multi_label = multi_label
        self.ensemble_method = ensemble_method
        # Initialize tensors.
        self.video_preds = torch.zeros((num_videos, num_cls))
        if multi_label:
            # Large negative init so untouched entries do not affect mAP.
            self.video_preds -= 1e10

        self.video_labels = (torch.zeros(
            (num_videos, num_cls)) if multi_label else torch.zeros(
                (num_videos)).long())
        # Number of clips aggregated so far for each video.
        self.clip_count = torch.zeros((num_videos)).long()
        self.topk_accs = []
        self.stats = {}

        # Reset metric.
        self.reset()

    def reset(self):
        """
        Reset the metric.
        """
        self.clip_count.zero_()
        self.video_preds.zero_()
        if self.multi_label:
            self.video_preds -= 1e10
        self.video_labels.zero_()

    def update_stats(self, preds, labels, clip_ids):
        """
        Collect the predictions from the current batch and perform on-the-flight
        summation as ensemble.
        Args:
            preds (tensor): predictions from the current batch. Dimension is
                N x C where N is the batch size and C is the channel size
                (num_cls).
            labels (tensor): the corresponding labels of the current batch.
                Dimension is N.
            clip_ids (tensor): clip indexes of the current batch, dimension is
                N.
        """
        for ind in range(preds.shape[0]):
            # Map each clip id back to its parent video index.
            vid_id = int(clip_ids[ind]) // self.num_clips
            if self.video_labels[vid_id].sum() > 0:
                # A label was already stored for this video; it must match.
                assert torch.equal(
                    self.video_labels[vid_id].type(torch.FloatTensor),
                    labels[ind].type(torch.FloatTensor),
                )
            self.video_labels[vid_id] = labels[ind]
            if self.ensemble_method == "sum":
                self.video_preds[vid_id] += preds[ind]
            elif self.ensemble_method == "max":
                self.video_preds[vid_id] = torch.max(self.video_preds[vid_id],
                                                     preds[ind])
            else:
                raise NotImplementedError(
                    "Ensemble Method {} is not supported".format(
                        self.ensemble_method))
            self.clip_count[vid_id] += 1

    def log_iter_stats(self, cur_iter):
        """
        Log the stats.
        Args:
            cur_iter (int): the current iteration of testing.
        """
        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "split": "test_iter",
            "cur_iter": "{}".format(cur_iter + 1),
            "eta": eta,
            "time_diff": self.iter_timer.seconds(),
        }
        logging.log_json_stats(stats)

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        # Data loading finished: stop the data timer and start the net timer.
        self.data_timer.pause()
        self.net_timer.reset()

    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) correspods to top-1 and top-5 accuracy.
        """
        if not all(self.clip_count == self.num_clips):
            # Some videos received a different number of clips than expected.
            logger.warning("clip count {} ~= num clips {}".format(
                ", ".join([
                    "{}: {}".format(i, k)
                    for i, k in enumerate(self.clip_count.tolist())
                ]),
                self.num_clips,
            ))

        self.stats = {"split": "test_final"}
        if self.multi_label:
            map = get_map(self.video_preds.cpu().numpy(),
                          self.video_labels.cpu().numpy())
            self.stats["map"] = map
        else:
            num_topks_correct = metrics.topks_correct(self.video_preds,
                                                      self.video_labels, ks)
            topks = [(x / self.video_preds.size(0)) * 100.0
                     for x in num_topks_correct]
            assert len({len(ks), len(topks)}) == 1
            for k, topk in zip(ks, topks):
                self.stats["top{}_acc".format(k)] = "{:.{prec}f}".format(
                    topk, prec=2)
        logging.log_json_stats(self.stats)
class ValMeter(object):
    """
    Measures validation stats: windowed top-1/top-5 errors per iteration and
    best-so-far errors across epochs.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        # Current minibatch errors (smoothed over a window).
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Min errors (over the full val set).
        self.min_top1_err = 100.0
        self.min_top5_err = 100.0
        # Number of misclassified examples.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def reset(self):
        """
        Reset the Meter.
        """
        self.iter_timer.reset()
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_err, top5_err, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            mb_size (int): mini batch size.
        """
        self.mb_top1_err.add_value(top1_err)
        self.mb_top5_err.add_value(top5_err)
        # Sample-weighted aggregates for epoch-level averages.
        self.num_top1_mis += top1_err * mb_size
        self.num_top5_mis += top5_err * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        # Only log every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "top1_err": self.mb_top1_err.get_win_median(),
            "top5_err": self.mb_top5_err.get_win_median(),
            "mem": int(np.ceil(mem_usage)),
        }
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        top1_err = self.num_top1_mis / self.num_samples
        top5_err = self.num_top5_mis / self.num_samples
        # Track best-so-far errors across epochs.
        self.min_top1_err = min(self.min_top1_err, top1_err)
        self.min_top5_err = min(self.min_top5_err, top5_err)
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "top1_err": top1_err,
            "top5_err": top5_err,
            "min_top1_err": self.min_top1_err,
            "min_top5_err": self.min_top5_err,
            "mem": int(np.ceil(mem_usage)),
        }
        logging.log_json_stats(stats)
        return self.min_top1_err
class EPICTrainMeter(object):
    """
    Measure training stats.

    Tracks verb/noun/action accuracies and losses. Throughout this class the
    index convention for the (top1_acc, top5_acc, loss) sequences is:
    [0] = verb, [1] = noun, [2] = action.
    """

    def __init__(self, summary_writer, epoch_iters, cfg):
        """
        Args:
            summary_writer (SummaryWriter): tensorboard writer used by the
                log_* methods.
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        # Total number of iterations over the whole training run.
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_total = 0.0
        self.loss_verb = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_verb_total = 0.0
        self.loss_noun = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_noun_total = 0.0
        self.lr = None
        # Current minibatch accuracies (smoothed over a window).
        self.mb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        # Number of correctly classified examples.
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0
        self.tb_writer: SummaryWriter = summary_writer

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.loss_total = 0.0
        self.loss_verb.reset()
        self.loss_verb_total = 0.0
        self.loss_noun.reset()
        self.loss_noun_total = 0.0
        self.lr = None
        self.mb_top1_acc.reset()
        self.mb_top5_acc.reset()
        self.mb_verb_top1_acc.reset()
        self.mb_verb_top5_acc.reset()
        self.mb_noun_top1_acc.reset()
        self.mb_noun_top5_acc.reset()
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_acc, top5_acc, loss, lr, mb_size):
        """
        Update the current stats.
        Args:
            top1_acc (sequence): (verb, noun, action) top1 accuracy rates.
            top5_acc (sequence): (verb, noun, action) top5 accuracy rates.
            loss (sequence): (verb, noun, action) loss values.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        # Current minibatch stats
        self.mb_verb_top1_acc.add_value(top1_acc[0])
        self.mb_verb_top5_acc.add_value(top5_acc[0])
        self.mb_noun_top1_acc.add_value(top1_acc[1])
        self.mb_noun_top5_acc.add_value(top5_acc[1])
        self.mb_top1_acc.add_value(top1_acc[2])
        self.mb_top5_acc.add_value(top5_acc[2])
        self.loss_verb.add_value(loss[0])
        self.loss_noun.add_value(loss[1])
        self.loss.add_value(loss[2])
        self.lr = lr
        # Aggregate stats (sample-weighted for epoch-level averages).
        self.num_verb_top1_cor += top1_acc[0] * mb_size
        self.num_verb_top5_cor += top5_acc[0] * mb_size
        self.num_noun_top1_cor += top1_acc[1] * mb_size
        self.num_noun_top5_cor += top5_acc[1] * mb_size
        self.num_top1_cor += top1_acc[2] * mb_size
        self.num_top5_cor += top5_acc[2] * mb_size
        self.loss_verb_total += loss[0] * mb_size
        self.loss_noun_total += loss[1] * mb_size
        self.loss_total += loss[2] * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        # Only log every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        # ETA = last iteration duration x remaining iterations.
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": self.mb_verb_top1_acc.get_win_median(),
            "verb_top5_acc": self.mb_verb_top5_acc.get_win_median(),
            "noun_top1_acc": self.mb_noun_top1_acc.get_win_median(),
            "noun_top5_acc": self.mb_noun_top5_acc.get_win_median(),
            "top1_acc": self.mb_top1_acc.get_win_median(),
            "top5_acc": self.mb_top5_acc.get_win_median(),
            "verb_loss": self.loss_verb.get_win_median(),
            "noun_loss": self.loss_noun.get_win_median(),
            "loss": self.loss.get_win_median(),
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats)
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        # Epoch-level accuracies/losses are sample-weighted averages.
        verb_top1_acc = self.num_verb_top1_cor / self.num_samples
        verb_top5_acc = self.num_verb_top5_cor / self.num_samples
        noun_top1_acc = self.num_noun_top1_cor / self.num_samples
        noun_top5_acc = self.num_noun_top5_cor / self.num_samples
        top1_acc = self.num_top1_cor / self.num_samples
        top5_acc = self.num_top5_cor / self.num_samples
        avg_loss_verb = self.loss_verb_total / self.num_samples
        avg_loss_noun = self.loss_noun_total / self.num_samples
        avg_loss = self.loss_total / self.num_samples
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": verb_top1_acc,
            "verb_top5_acc": verb_top5_acc,
            "noun_top1_acc": noun_top1_acc,
            "noun_top5_acc": noun_top5_acc,
            "top1_acc": top1_acc,
            "top5_acc": top5_acc,
            "verb_loss": avg_loss_verb,
            "noun_loss": avg_loss_noun,
            "loss": avg_loss,
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats, False)
        logging.log_json_stats(stats)
def benchmark_data(cfg):
    """
    Benchmark the train dataloader over BENCHMARK.NUM_EPOCHS epochs, logging
    per-LOG_PERIOD iteration times and system RAM usage, then per-epoch
    averages.

    Args:
        cfg (CfgNode): configs providing RNG_SEED, TRAIN.BATCH_SIZE and the
            BENCHMARK.* options.
    """
    # Set up environment.
    setup_environment()
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Benchmark data loading with config:")
    logger.info(pprint.pformat(cfg))

    def _ram_gb():
        # Returns (used, total) system RAM in GB.
        vram = psutil.virtual_memory()
        return (vram.total - vram.available) / 1024**3, vram.total / 1024**3

    construct_timer = Timer()
    dataloader = loader.construct_loader(cfg, "train")
    logger.info("Initialize loader using {:.2f} seconds.".format(
        construct_timer.seconds()))

    batch_size = cfg.TRAIN.BATCH_SIZE
    log_period = cfg.BENCHMARK.LOG_PERIOD
    epoch_times = []
    # Test for a few epochs.
    for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS):
        period_timer = Timer()
        epoch_timer = Timer()
        iter_times = []
        for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)):
            if cur_iter > 0 and cur_iter % log_period == 0:
                iter_times.append(period_timer.seconds())
                used_gb, total_gb = _ram_gb()
                logger.info(
                    "Epoch {}: {} iters ({} videos) in {:.2f} seconds. "
                    "RAM Usage: {:.2f}/{:.2f} GB.".format(
                        cur_epoch,
                        log_period,
                        log_period * batch_size,
                        iter_times[-1],
                        used_gb,
                        total_gb,
                    ))
                period_timer.reset()
        epoch_times.append(epoch_timer.seconds())
        used_gb, total_gb = _ram_gb()
        logger.info(
            "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. "
            "RAM Usage: {:.2f}/{:.2f} GB.".format(
                cur_epoch,
                len(dataloader),
                len(dataloader) * batch_size,
                epoch_times[-1],
                used_gb,
                total_gb,
            ))
        logger.info(
            "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} "
            "(avg/std) seconds.".format(
                cur_epoch,
                log_period,
                log_period * batch_size,
                np.mean(iter_times),
                np.std(iter_times),
            ))
    logger.info("On average every epoch ({} videos) takes {:.2f}/{:.2f} "
                "(avg/std) seconds.".format(
                    len(dataloader) * batch_size,
                    np.mean(epoch_times),
                    np.std(epoch_times),
                ))
class TrainMeter(object):
    """
    Measures training stats. Stat keys are created lazily from the first
    update_stats() call; each key is tracked with a windowed ScalarMeter.
    """

    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        # Total number of iterations over the whole training run.
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.log_period = cfg.LOG_PERIOD
        # Mapping key -> ScalarMeter; populated on the first update_stats().
        self.infos = None
        self.num_samples = 0

    def init(self, keys):
        """Create one windowed ScalarMeter per stat key."""
        self.infos = {key: ScalarMeter(self.log_period) for key in keys}

    def reset(self):
        """
        Reset the Meter.
        """
        # BUG FIX: guard against reset() being called before any stats were
        # recorded (self.infos is still None in that case).
        if self.infos is not None:
            for v in self.infos.values():
                v.reset()

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, info_dict):
        """
        Update the current stats.
        Args:
            info_dict (dict): mapping stat name -> value (tensor-like; must
                support .item() after the optional all-reduce).
        """
        # Lazily create the meters from the first batch of keys.
        if self.infos is None:
            self.init(info_dict.keys())
        # reduce from all gpus
        if self._cfg.NUM_GPUS > 1:
            for k, v in info_dict.items():
                info_dict[k] = du.all_reduce([v])
        # syncronize from gpu to cpu
        info_dict = {k: v.item() for k, v in info_dict.items()}
        # log value into scalar meter
        for k, v in info_dict.items():
            self.infos[k].add_value(v)

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        # Only log every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        # ETA = last iteration duration x remaining iterations.
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "mem": int(np.ceil(mem_usage)),
        }
        # BUG FIX: iterate .items(); iterating the dict directly yields only
        # keys, so the (k, v) unpacking raised at runtime.
        infos = {k: v.get_win_avg() for k, v in self.infos.items()}
        stats.update(infos)
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "mem": int(np.ceil(mem_usage)),
        }
        # BUG FIX: same .items() issue as in log_iter_stats.
        infos = {k: v.get_global_avg() for k, v in self.infos.items()}
        stats.update(infos)
        logging.log_json_stats(stats)
def do_train(cfg, model, resume=False):
    """
    Run the detectron2-style training loop for `model`.

    Args:
        cfg (CfgNode): detectron2 config node (SOLVER, TEST, DATALOADER, ...).
        model (nn.Module): the detection model; trained in-place.
        resume (bool): if True, resume optimizer/scheduler/iteration from the
            last checkpoint in cfg.OUTPUT_DIR instead of only loading weights.
    """
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    # resume_or_load returns the checkpoint dict; "iteration" is the last
    # finished iteration, so training restarts at iteration + 1 (0 if absent).
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS,
        resume=resume,
    ).get("iteration", -1) + 1)
    if cfg.SOLVER.RESET_ITER:
        # Ignore the checkpoint's iteration counter (e.g. when fine-tuning).
        logger.info('Reset loaded iteration. Start training from iteration 0.')
        start_iter = 0
    # TRAIN_ITER < 0 means "use the standard MAX_ITER"; otherwise override.
    max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    # Metric writers only on the main process to avoid duplicated logs.
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # Empty CUSTOM_AUG string selects the default augmentation pipeline.
    mapper = DatasetMapper(cfg, True) if cfg.INPUT.CUSTOM_AUG == '' else \
        DatasetMapper(cfg, True, augmentations=build_custom_augmentation(cfg, True))
    if cfg.DATALOADER.SAMPLER_TRAIN in [
            'TrainingSampler', 'RepeatFactorTrainingSampler'
    ]:
        data_loader = build_detection_train_loader(cfg, mapper=mapper)
    else:
        # Deferred import: only needed for the custom sampler path.
        from centernet.data.custom_dataset_dataloader import build_custom_train_loader
        data_loader = build_custom_train_loader(cfg, mapper=mapper)

    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        step_timer = Timer()
        data_timer = Timer()
        start_time = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            # Time spent waiting for the dataloader since the last reset.
            data_time = data_timer.seconds()
            storage.put_scalars(data_time=data_time)
            step_timer.reset()
            iteration = iteration + 1
            storage.step()
            loss_dict = model(data)

            losses = sum(loss for k, loss in loss_dict.items())
            # Abort early on NaN/Inf losses rather than corrupting weights.
            assert torch.isfinite(losses).all(), loss_dict

            # Reduce losses across workers for logging only; the backward
            # pass below uses the local (unreduced) losses.
            loss_dict_reduced = {k: v.item() \
                for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)

            step_time = step_timer.seconds()
            storage.put_scalars(time=step_time)
            # Restart the dataloader timer so the next iteration measures
            # only the time spent fetching data.
            data_timer.reset()
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                comm.synchronize()

            # Skip the first few iterations (warmup) before writing metrics.
            if iteration - start_iter > 5 and \
                (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

        total_time = time.perf_counter() - start_time
        logger.info("Total training time: {}".format(
            str(datetime.timedelta(seconds=int(total_time)))))
def main(cfg: DictConfig) -> None:
    """
    Train and periodically evaluate a CNN-LSTM video classifier.

    Args:
        cfg (DictConfig): hydra config with `dataset` and `train` sections.
    """
    # A logger for this file.
    # BUGFIX: this assignment was originally placed *after* the first
    # logger.info() call below; since `logger` is a local variable, that
    # raised UnboundLocalError whenever "debug" was present in the config.
    logger = logging.getLogger(__name__)

    # Allow optional "experiments"/"debug" sub-configs to override the base.
    if "experiments" in cfg.keys():
        cfg = OmegaConf.merge(cfg, cfg.experiments)
    if "debug" in cfg.keys():
        logger.info("Run script in debug")
        cfg = OmegaConf.merge(cfg, cfg.debug)

    # NOTE: hydra causes the python file to run in hydra.run.dir by default
    logger.info(f"Run script in {HydraConfig.get().run.dir}")

    writer = SummaryWriter(log_dir=cfg.train.tensorboard_dir)

    checkpoints_dir = Path(cfg.train.checkpoints_dir)
    if not checkpoints_dir.exists():
        checkpoints_dir.mkdir(parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    image_shape = (cfg.train.channels, cfg.train.image_height,
                   cfg.train.image_width)

    # NOTE: With hydra, the python file runs in hydra.run.dir by default, so
    # set the dataset path to a full path or an appropriate relative path
    dataset_path = Path(cfg.dataset.root) / cfg.dataset.frames
    split_path = Path(cfg.dataset.root) / cfg.dataset.split_file
    assert dataset_path.exists(), "Video image folder not found"
    assert (split_path.exists()
            ), "The file that describes the split of train/test not found."

    # Define training set
    train_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=True,
    )
    # Define train dataloader
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=True,
        num_workers=cfg.train.num_workers,
    )
    # Define test set
    test_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=False,
    )
    # Define test dataloader
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=False,
        num_workers=cfg.train.num_workers,
    )

    # Classification criterion
    criterion = nn.CrossEntropyLoss().to(device)

    # Define network
    model = CNNLSTM(
        num_classes=train_dataset.num_classes,
        latent_dim=cfg.train.latent_dim,
        lstm_layers=cfg.train.lstm_layers,
        hidden_dim=cfg.train.hidden_dim,
        bidirectional=cfg.train.bidirectional,
        attention=cfg.train.attention,
    )
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    checkpointer = Checkpointer(
        model,
        optimizer=optimizer,
        # scheduler=scheduler,
        save_dir=cfg.train.checkpoints_dir,
        save_to_disk=True,
    )

    # Resolve the starting epoch from (in priority order): a resumable
    # checkpoint, an explicit checkpoint file, or a fresh start.
    if cfg.train.resume:
        if not checkpointer.has_checkpoint():
            start_epoch = 0
        else:
            ckpt = checkpointer.resume_or_load("", resume=True)
            start_epoch = ckpt["epoch"]
            model.to(device)
            # Optimizer state tensors are loaded on CPU; move them to the
            # training device so Adam's buffers match the parameters.
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)
    elif cfg.train.checkpoint_model != "":
        ckpt = torch.load(cfg.train.checkpoint_model, map_location="cpu")
        model.load_state_dict(ckpt["model"])
        model.to(device)
        start_epoch = 0
    else:
        start_epoch = 0

    for epoch in range(start_epoch, cfg.train.num_epochs):
        epoch += 1  # switch to 1-based epoch numbering for logs/checkpoints
        epoch_metrics = {"loss": [], "acc": []}
        timer = Timer()
        for batch_i, (X, y) in enumerate(train_dataloader):
            batch_i += 1  # 1-based batch index for logging
            # Skip singleton batches (e.g. they break batch statistics).
            if X.size(0) == 1:
                continue

            image_sequences = Variable(X.to(device), requires_grad=True)
            labels = Variable(y.to(device), requires_grad=False)

            optimizer.zero_grad()

            # Reset LSTM hidden state
            model.lstm.reset_hidden_state()

            # Get sequence predictions
            predictions = model(image_sequences)

            # Compute metrics
            loss = criterion(predictions, labels)
            acc = (
                predictions.detach().argmax(1) == labels).cpu().numpy().mean()

            loss.backward()
            optimizer.step()

            # Keep track of epoch metrics
            epoch_metrics["loss"].append(loss.item())
            epoch_metrics["acc"].append(acc)

            # Determine approximate time left
            batches_done = (epoch - 1) * len(train_dataloader) + (batch_i - 1)
            batches_left = cfg.train.num_epochs * len(
                train_dataloader) - batches_done
            time_left = datetime.timedelta(seconds=batches_left *
                                           timer.seconds())
            time_iter = round(timer.seconds(), 3)
            timer.reset()

            logger.info(
                f'Training - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(train_dataloader)}] [Loss: {np.mean(epoch_metrics["loss"]):.3f}] [Acc: {np.mean(epoch_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
            )

            # Empty cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        writer.add_scalar("train/loss", np.mean(epoch_metrics["loss"]), epoch)
        writer.add_scalar("train/acc", np.mean(epoch_metrics["acc"]), epoch)

        def test_model(epoch):
            """ Evaluate the model on the test set """
            model.eval()
            test_metrics = {"loss": [], "acc": []}
            timer = Timer()
            for batch_i, (X, y) in enumerate(test_dataloader):
                batch_i += 1
                image_sequences = Variable(X.to(device), requires_grad=False)
                labels = Variable(y, requires_grad=False).to(device)

                with torch.no_grad():
                    # Reset LSTM hidden state
                    model.lstm.reset_hidden_state()
                    # Get sequence predictions
                    predictions = model(image_sequences)

                # Compute metrics
                loss = criterion(predictions, labels)
                acc = (predictions.detach().argmax(1) == labels
                       ).cpu().numpy().mean()

                # Keep track of loss and accuracy
                test_metrics["loss"].append(loss.item())
                test_metrics["acc"].append(acc)

                # Determine approximate time left
                batches_done = batch_i - 1
                batches_left = len(test_dataloader) - batches_done
                time_left = datetime.timedelta(seconds=batches_left *
                                               timer.seconds())
                time_iter = round(timer.seconds(), 3)
                timer.reset()

                # Log test performance
                logger.info(
                    f'Testing - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(test_dataloader)}] [Loss: {np.mean(test_metrics["loss"]):.3f}] [Acc: {np.mean(test_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
                )

            writer.add_scalar("test/loss", np.mean(test_metrics["loss"]),
                              epoch)
            writer.add_scalar("test/acc", np.mean(test_metrics["acc"]), epoch)
            model.train()

        # Evaluate the model on the test set
        test_model(epoch)

        # Save model checkpoint
        if epoch % cfg.train.checkpoint_interval == 0:
            checkpointer.save(f"checkpoint_{epoch:04}", epoch=epoch)

    writer.close()
class EPICValMeter(object):
    """
    Measures validation stats for EPIC-Kitchens-style verb/noun/action
    recognition: per-iteration windowed accuracies, running epoch totals,
    and best-so-far ("max") accuracies across epochs.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        # Current minibatch accuracies (smoothed over a window).
        self.mb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        # Max accuracies (over the full val set).
        self.max_top1_acc = 0.0
        self.max_top5_acc = 0.0
        self.max_verb_top1_acc = 0.0
        self.max_verb_top5_acc = 0.0
        self.max_noun_top1_acc = 0.0
        self.max_noun_top5_acc = 0.0
        # Number of correctly classified examples (accuracy * batch size,
        # accumulated so epoch accuracy = num_cor / num_samples).
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0
        # Full-epoch prediction/label buffers (see update_predictions).
        self.all_verb_preds = []
        self.all_verb_labels = []
        self.all_noun_preds = []
        self.all_noun_labels = []
        self.output_dir = cfg.OUTPUT_DIR

    def reset(self):
        """
        Reset the Meter.

        NOTE: the max_* accuracies are deliberately NOT reset so they track
        the best values across epochs.
        """
        self.iter_timer.reset()
        self.mb_top1_acc.reset()
        self.mb_top5_acc.reset()
        self.mb_verb_top1_acc.reset()
        self.mb_verb_top5_acc.reset()
        self.mb_noun_top1_acc.reset()
        self.mb_noun_top5_acc.reset()
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0
        self.all_verb_preds = []
        self.all_verb_labels = []
        self.all_noun_preds = []
        self.all_noun_labels = []

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        # End of data loading; start timing the network forward pass.
        self.data_timer.pause()
        self.net_timer.reset()

    def update_stats(self, top1_acc, top5_acc, mb_size):
        """
        Update the current stats.

        Args:
            top1_acc (float): top1 accuracy rate. Indexed as
                [0]=verb, [1]=noun, [2]=action.
            top5_acc (float): top5 accuracy rate. Same indexing.
            mb_size (int): mini batch size.
        """
        self.mb_verb_top1_acc.add_value(top1_acc[0])
        self.mb_verb_top5_acc.add_value(top5_acc[0])
        self.mb_noun_top1_acc.add_value(top1_acc[1])
        self.mb_noun_top5_acc.add_value(top5_acc[1])
        self.mb_top1_acc.add_value(top1_acc[2])
        self.mb_top5_acc.add_value(top5_acc[2])
        # Accumulate correct counts (accuracy * batch size) for epoch stats.
        self.num_verb_top1_cor += top1_acc[0] * mb_size
        self.num_verb_top5_cor += top5_acc[0] * mb_size
        self.num_noun_top1_cor += top1_acc[1] * mb_size
        self.num_noun_top5_cor += top5_acc[1] * mb_size
        self.num_top1_cor += top1_acc[2] * mb_size
        self.num_top5_cor += top5_acc[2] * mb_size
        self.num_samples += mb_size

    def update_predictions(self, preds, labels):
        """
        Update predictions and labels.

        Args:
            preds (tensor): model output predictions, [0]=verb, [1]=noun.
            labels (tensor): labels, same indexing.
        """
        # TODO: merge update_prediction with update_stats.
        self.all_verb_preds.append(preds[0])
        self.all_verb_labels.append(labels[0])
        self.all_noun_preds.append(preds[1])
        self.all_noun_labels.append(labels[1])

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.

        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": self.mb_verb_top1_acc.get_win_median(),
            "verb_top5_acc": self.mb_verb_top5_acc.get_win_median(),
            "noun_top1_acc": self.mb_noun_top1_acc.get_win_median(),
            "noun_top5_acc": self.mb_noun_top5_acc.get_win_median(),
            "top1_acc": self.mb_top1_acc.get_win_median(),
            "top5_acc": self.mb_top5_acc.get_win_median(),
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
        }
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.

        Args:
            cur_epoch (int): the number of current epoch.

        Returns:
            tuple: (is_best_epoch, dict of top-1 accuracies), where
            is_best_epoch is True iff the action top-1 accuracy strictly
            improved on the best seen so far.
        """
        verb_top1_acc = self.num_verb_top1_cor / self.num_samples
        verb_top5_acc = self.num_verb_top5_cor / self.num_samples
        noun_top1_acc = self.num_noun_top1_cor / self.num_samples
        noun_top5_acc = self.num_noun_top5_cor / self.num_samples
        top1_acc = self.num_top1_cor / self.num_samples
        top5_acc = self.num_top5_cor / self.num_samples
        self.max_verb_top1_acc = max(self.max_verb_top1_acc, verb_top1_acc)
        self.max_verb_top5_acc = max(self.max_verb_top5_acc, verb_top5_acc)
        self.max_noun_top1_acc = max(self.max_noun_top1_acc, noun_top1_acc)
        self.max_noun_top5_acc = max(self.max_noun_top5_acc, noun_top5_acc)
        # Must be computed before max_top1_acc is updated below.
        is_best_epoch = top1_acc > self.max_top1_acc
        self.max_top1_acc = max(self.max_top1_acc, top1_acc)
        self.max_top5_acc = max(self.max_top5_acc, top5_acc)
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "verb_top1_acc": verb_top1_acc,
            "verb_top5_acc": verb_top5_acc,
            "noun_top1_acc": noun_top1_acc,
            "noun_top5_acc": noun_top5_acc,
            "top1_acc": top1_acc,
            "top5_acc": top5_acc,
            "max_verb_top1_acc": self.max_verb_top1_acc,
            "max_verb_top5_acc": self.max_verb_top5_acc,
            "max_noun_top1_acc": self.max_noun_top1_acc,
            "max_noun_top5_acc": self.max_noun_top5_acc,
            "max_top1_acc": self.max_top1_acc,
            "max_top5_acc": self.max_top5_acc,
            "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
        }
        logging.log_json_stats(stats)

        return is_best_epoch, {"top1_acc": top1_acc,
                               "verb_top1_acc": verb_top1_acc,
                               "noun_top1_acc": noun_top1_acc}
class TrainMeter(object):
    """
    Measure training stats.

    Arbitrary scalar stats are passed as keyword arguments to
    `update_stats`; a windowed ScalarMeter is created per key on first use.
    (Commented-out legacy top1/top5/loss bookkeeping has been removed.)
    """

    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        # Total number of iterations over the whole training run.
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.lr = None
        self.num_samples = 0
        # Mapping: stat name -> ScalarMeter, lazily filled by update_stats.
        self.stats = {}

    def reset(self):
        """
        Reset the Meter.
        """
        self.lr = None
        self.num_samples = 0
        for k, v in self.stats.items():
            self.stats[k].reset()

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, lr, mb_size, **kwargs):
        """
        Update the current stats.

        Args:
            lr (float): learning rate.
            mb_size (int): mini batch size.
            **kwargs: arbitrary scalar stats to track (e.g. losses).
        """
        for k, v in kwargs.items():
            if k not in self.stats:
                self.stats[k] = ScalarMeter(self._cfg.LOG_PERIOD)
            self.stats[k].add_value(v)
        self.lr = lr
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.

        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1)
        )
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_diff": self.iter_timer.seconds(),
            "time_left": eta,
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        # Windowed medians for per-iteration logging.
        for k, v in self.stats.items():
            stats[k] = v.get_win_median()
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.

        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters
        )
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "time_left": eta,
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        # Global averages for per-epoch logging.
        for k, v in self.stats.items():
            stats[k] = v.get_global_avg()
        logging.log_json_stats(stats)
class EPICValMeter(object):
    """
    Measures validation stats (verb/noun/action accuracies) and mirrors the
    logged stats to a TensorBoard SummaryWriter.
    """

    def __init__(self, summary_writer, max_iter, cfg):
        """
        Args:
            summary_writer (SummaryWriter): TensorBoard writer for stats.
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        # Current minibatch accuracies (smoothed over a window).
        self.mb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_verb_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top1_acc = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_noun_top5_acc = ScalarMeter(cfg.LOG_PERIOD)
        # Max accuracies (over the full val set).
        self.max_top1_acc = 0.0
        self.max_top5_acc = 0.0
        self.max_verb_top1_acc = 0.0
        self.max_verb_top5_acc = 0.0
        self.max_noun_top1_acc = 0.0
        self.max_noun_top5_acc = 0.0
        # Number of correctly classified examples (accuracy * batch size).
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0
        self.tb_writer: SummaryWriter = summary_writer

    def reset(self):
        """
        Reset the Meter.

        NOTE: the max_* accuracies are deliberately NOT reset so they track
        the best values across epochs.
        """
        self.iter_timer.reset()
        self.mb_top1_acc.reset()
        self.mb_top5_acc.reset()
        self.mb_verb_top1_acc.reset()
        self.mb_verb_top5_acc.reset()
        self.mb_noun_top1_acc.reset()
        self.mb_noun_top5_acc.reset()
        self.num_top1_cor = 0
        self.num_top5_cor = 0
        self.num_verb_top1_cor = 0
        self.num_verb_top5_cor = 0
        self.num_noun_top1_cor = 0
        self.num_noun_top5_cor = 0
        self.num_samples = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_acc, top5_acc, mb_size):
        """
        Update the current stats.

        Args:
            top1_acc (float): top1 accuracy rate. Indexed as
                [0]=verb, [1]=noun, [2]=action.
            top5_acc (float): top5 accuracy rate. Same indexing.
            mb_size (int): mini batch size.
        """
        self.mb_verb_top1_acc.add_value(top1_acc[0])
        self.mb_verb_top5_acc.add_value(top5_acc[0])
        self.mb_noun_top1_acc.add_value(top1_acc[1])
        self.mb_noun_top5_acc.add_value(top5_acc[1])
        self.mb_top1_acc.add_value(top1_acc[2])
        self.mb_top5_acc.add_value(top5_acc[2])
        # Accumulate correct counts (accuracy * batch size) for epoch stats.
        self.num_verb_top1_cor += top1_acc[0] * mb_size
        self.num_verb_top5_cor += top5_acc[0] * mb_size
        self.num_noun_top1_cor += top1_acc[1] * mb_size
        self.num_noun_top5_cor += top5_acc[1] * mb_size
        self.num_top1_cor += top1_acc[2] * mb_size
        self.num_top5_cor += top5_acc[2] * mb_size
        self.num_samples += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.

        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        eta_sec = self.iter_timer.seconds() * (self.max_iter - cur_iter - 1)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "verb_top1_acc": self.mb_verb_top1_acc.get_win_median(),
            "verb_top5_acc": self.mb_verb_top5_acc.get_win_median(),
            "noun_top1_acc": self.mb_noun_top1_acc.get_win_median(),
            "noun_top5_acc": self.mb_noun_top5_acc.get_win_median(),
            "top1_acc": self.mb_top1_acc.get_win_median(),
            "top5_acc": self.mb_top5_acc.get_win_median(),
            "mem": int(np.ceil(mem_usage)),
        }
        # Mirror the JSON stats to TensorBoard as well.
        log_to_tensorboard(self.tb_writer, stats)
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.

        Args:
            cur_epoch (int): the number of current epoch.

        Returns:
            bool: True iff the action top-1 accuracy strictly improved on
            the best seen so far.
        """
        verb_top1_acc = self.num_verb_top1_cor / self.num_samples
        verb_top5_acc = self.num_verb_top5_cor / self.num_samples
        noun_top1_acc = self.num_noun_top1_cor / self.num_samples
        noun_top5_acc = self.num_noun_top5_cor / self.num_samples
        top1_acc = self.num_top1_cor / self.num_samples
        top5_acc = self.num_top5_cor / self.num_samples
        self.max_verb_top1_acc = max(self.max_verb_top1_acc, verb_top1_acc)
        self.max_verb_top5_acc = max(self.max_verb_top5_acc, verb_top5_acc)
        self.max_noun_top1_acc = max(self.max_noun_top1_acc, noun_top1_acc)
        self.max_noun_top5_acc = max(self.max_noun_top5_acc, noun_top5_acc)
        # Must be computed before max_top1_acc is updated below.
        is_best_epoch = top1_acc > self.max_top1_acc
        self.max_top1_acc = max(self.max_top1_acc, top1_acc)
        self.max_top5_acc = max(self.max_top5_acc, top5_acc)
        mem_usage = misc.gpu_mem_usage()
        stats = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "verb_top1_acc": verb_top1_acc,
            "verb_top5_acc": verb_top5_acc,
            "noun_top1_acc": noun_top1_acc,
            "noun_top5_acc": noun_top5_acc,
            "top1_acc": top1_acc,
            "top5_acc": top5_acc,
            "max_verb_top1_acc": self.max_verb_top1_acc,
            "max_verb_top5_acc": self.max_verb_top5_acc,
            "max_noun_top1_acc": self.max_noun_top1_acc,
            "max_noun_top5_acc": self.max_noun_top5_acc,
            "max_top1_acc": self.max_top1_acc,
            "max_top5_acc": self.max_top5_acc,
            "mem": int(np.ceil(mem_usage)),
        }
        log_to_tensorboard(self.tb_writer, stats, False)
        logging.log_json_stats(stats)

        return is_best_epoch
class ValMeter(object):
    """
    Tracks validation statistics for one epoch.

    Scalar stats are supplied as keyword arguments to `update_stats`; a
    windowed ScalarMeter is created per stat name the first time it is seen.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        self.iter_timer = Timer()
        self.num_samples = 0
        # Mapping: stat name -> ScalarMeter.
        self.stats = {}

    def reset(self):
        """Reset the timer, the sample count, and every tracked stat."""
        self.iter_timer.reset()
        self.num_samples = 0
        for meter in self.stats.values():
            meter.reset()

    def iter_tic(self):
        """Begin timing the current iteration."""
        self.iter_timer.reset()

    def iter_toc(self):
        """Finish timing the current iteration."""
        self.iter_timer.pause()

    def update_stats(self, mb_size, **kwargs):
        """
        Record stats for one minibatch.

        Args:
            mb_size (int): mini batch size.
            **kwargs: arbitrary scalar stats to track.
        """
        self.num_samples += mb_size
        for name, value in kwargs.items():
            meter = self.stats.get(name)
            if meter is None:
                meter = self.stats[name] = ScalarMeter(self._cfg.LOG_PERIOD)
            meter.add_value(value)

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.

        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        # Only emit a log line every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        remaining_iters = self.max_iter - cur_iter - 1
        eta = str(datetime.timedelta(
            seconds=int(self.iter_timer.seconds() * remaining_iters)))
        entry = {
            "_type": "val_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_diff": self.iter_timer.seconds(),
            "time_left": eta,
            "mem": int(np.ceil(misc.gpu_mem_usage())),
        }
        # Report windowed medians per iteration.
        entry.update({name: meter.get_win_median()
                      for name, meter in self.stats.items()})
        logging.log_json_stats(entry)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.

        Args:
            cur_epoch (int): the number of current epoch.
        """
        entry = {
            "_type": "val_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "mem": int(np.ceil(misc.gpu_mem_usage())),
        }
        # Report global averages per epoch.
        entry.update({name: meter.get_global_avg()
                      for name, meter in self.stats.items()})
        logging.log_json_stats(entry)
class TestMeter(object):
    """
    Perform the multi-view ensemble for testing: each video with an unique
    index will be sampled with multiple clips, and the predictions of the
    clips will be aggregated to produce the final prediction for the video.
    The accuracy is calculated with the given ground truth labels.
    """

    def __init__(self, num_videos, num_clips, num_cls, overall_iters):
        """
        Construct tensors to store the predictions and labels. Expect to get
        num_clips predictions from each video, and calculate the metrics on
        num_videos videos.

        Args:
            num_videos (int): number of videos to test.
            num_clips (int): number of clips sampled from each video for
                aggregating the final prediction for the video.
            num_cls (int): number of classes for each prediction.
            overall_iters (int): overall iterations for testing.
        """
        self.iter_timer = Timer()
        self.num_clips = num_clips
        self.overall_iters = overall_iters
        # Per-video accumulators: summed clip scores, labels, and clip counts.
        self.video_preds = torch.zeros((num_videos, num_cls))
        self.video_labels = torch.zeros((num_videos)).long()
        self.clip_count = torch.zeros((num_videos)).long()
        # Reset metric.
        self.reset()

    def reset(self):
        """Zero out all per-video accumulators."""
        self.clip_count.zero_()
        self.video_preds.zero_()
        self.video_labels.zero_()

    def update_stats(self, preds, labels, clip_ids):
        """
        Collect the predictions from the current batch and perform
        on-the-flight summation as ensemble.

        Args:
            preds (tensor): predictions from the current batch. Dimension is
                N x C where N is the batch size and C is the channel size
                (num_cls).
            labels (tensor): the corresponding labels of the current batch.
                Dimension is N.
            clip_ids (tensor): clip indexes of the current batch, dimension
                is N.
        """
        for row in range(preds.shape[0]):
            # Clip ids are laid out contiguously per video.
            video_idx = int(clip_ids[row]) // self.num_clips
            self.video_labels[video_idx] = labels[row]
            self.video_preds[video_idx] += preds[row]
            self.clip_count[video_idx] += 1

    def log_iter_stats(self, cur_iter):
        """
        Log the stats.

        Args:
            cur_iter (int): the current iteration of testing.
        """
        remaining = self.overall_iters - cur_iter
        eta = str(datetime.timedelta(
            seconds=int(self.iter_timer.seconds() * remaining)))
        entry = {
            "split": "test_iter",
            "cur_iter": "{}".format(cur_iter + 1),
            "eta": eta,
            "time_diff": self.iter_timer.seconds(),
        }
        logging.log_json_stats(entry)

    def iter_tic(self):
        """Begin timing the current iteration."""
        self.iter_timer.reset()

    def iter_toc(self):
        """Finish timing the current iteration."""
        self.iter_timer.pause()

    def finalize_metrics(self, ks=(1, 5)):
        """
        Calculate and log the final ensembled metrics.

        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) correspods to top-1 and top-5 accuracy.
        """
        # Warn when some videos received fewer/more clips than expected.
        if not all(self.clip_count == self.num_clips):
            logger.warning(
                "clip count {} ~= num clips {}".format(
                    self.clip_count, self.num_clips
                )
            )
            logger.warning(self.clip_count)

        num_topks_correct = metrics.topks_correct(
            self.video_preds, self.video_labels, ks
        )
        num_videos = self.video_preds.size(0)
        topks = [(correct / num_videos) * 100.0 for correct in num_topks_correct]
        assert len({len(ks), len(topks)}) == 1

        entry = {"split": "test_final"}
        for k, topk in zip(ks, topks):
            entry["top{}_acc".format(k)] = "{:.{prec}f}".format(topk, prec=2)
        logging.log_json_stats(entry)
class TrainMeter(object):
    """
    Tracks training statistics: mse/entropy/combined losses (each in a
    windowed ScalarMeter), the current learning rate, and iteration timing.
    """

    def __init__(self, epoch_iters, cfg):
        """
        :param epoch_iters: iters in one epoch
        :param cfg:
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        self.mse_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.entropy_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.combine_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.iter_timer = Timer()
        self.lr = None
        # Total iterations across the full training run.
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters

    def reset(self):
        """
        reset meter
        :return:
        """
        self.lr = None
        for meter in (self.mse_loss, self.entropy_loss, self.combine_loss):
            meter.reset()

    def iter_start(self):
        """
        start to recode time
        :return:
        """
        self.iter_timer.reset()

    def iter_stop(self):
        """
        stop recode time
        :return:
        """
        self.iter_timer.pause()

    def update_stats(self, mse_loss, entropy_loss, combine_loss, lr, mb_size):
        """Record the three loss values and the current learning rate."""
        self.mse_loss.add_value(mse_loss)
        self.entropy_loss.add_value(entropy_loss)
        self.combine_loss.add_value(combine_loss)
        self.lr = lr

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats for cur iteration
        :param cur_epoch:
        :param cur_iter:
        :return:
        """
        # Only log every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        iters_left = self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1)
        eta = str(datetime.timedelta(
            seconds=int(self.iter_timer.seconds() * iters_left)))
        entry = {
            "_type": "train_iter",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time": self.iter_timer.seconds(),
            "eta": eta,
            "mse_loss": self.mse_loss.get_win_median(),
            "entropy_loss": self.entropy_loss.get_win_median(),
            "combine_loss": self.combine_loss.get_win_median(),
            "lr": self.lr,
            "gpu": "{:.2f}GB".format(torch.cuda.max_memory_allocated() / 1024**3)
        }
        logging.log_json_stats(entry)

    def log_epoch_stats(self, cur_epoch):
        """
        :param cur_epoch:
        :return:
        """
        # NOTE(review): these are *window* averages, not averages over the
        # whole epoch — confirm this is intended before relying on them.
        entry = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "mse_loss": self.mse_loss.get_win_avg(),
            "entropy_loss": self.entropy_loss.get_win_avg(),
            "combine_loss": self.combine_loss.get_win_avg(),
            "gpu_mem": "{:.2f} GB".format(torch.cuda.max_memory_allocated() / 1024**3),
        }
        logging.log_json_stats(entry)
class IterationTimer(HookBase):
    """
    Track the time spent for each iteration (each run_step call in the trainer).
    Print a summary in the end of training.

    This hook uses the time between the call to its :meth:`before_step`
    and :meth:`after_step` methods.
    Under the convention that :meth:`before_step` of all hooks should only
    take negligible amount of time, the :class:`IterationTimer` hook should be
    placed at the beginning of the list of hooks to obtain accurate timing.
    """

    def __init__(self, warmup_iter=3):
        """
        Args:
            warmup_iter (int): the number of iterations at the beginning to
                exclude from timing.
        """
        self._warmup_iter = warmup_iter
        # Times a single run_step call (reset in before_step).
        self._step_timer = Timer()
        # Wall-clock start of training; reset again in before_train.
        self._start_time = time.perf_counter()
        # Accumulates time spent inside steps only; kept paused between
        # after_step and the next before_step so hook overhead is excluded.
        self._total_timer = Timer()

    def before_train(self):
        # Restart the wall clock and the in-step accumulator; the accumulator
        # stays paused until the first before_step resumes it.
        self._start_time = time.perf_counter()
        self._total_timer.reset()
        self._total_timer.pause()

    def after_train(self):
        """Log overall speed (excluding warmup and hook time) and total time."""
        logger = logging.getLogger(__name__)
        total_time = time.perf_counter() - self._start_time
        # Time spent inside run_step calls only.
        total_time_minus_hooks = self._total_timer.seconds()
        hook_time = total_time - total_time_minus_hooks
        # Iterations actually timed (warmup iterations excluded).
        num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter
        if num_iter > 0 and total_time_minus_hooks > 0:
            # Speed is meaningful only after warmup
            # NOTE this format is parsed by grep in some scripts
            logger.info(
                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
                    num_iter,
                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
                    total_time_minus_hooks / num_iter,
                )
            )
        logger.info(
            "Total training time: {} ({} on hooks)".format(
                str(datetime.timedelta(seconds=int(total_time))),
                str(datetime.timedelta(seconds=int(hook_time))),
            )
        )

    def before_step(self):
        # Start timing this step and resume the in-step accumulator.
        self._step_timer.reset()
        self._total_timer.resume()

    def after_step(self):
        # +1 because we're in after_step
        iter_done = self.trainer.iter - self.trainer.start_iter + 1
        if iter_done >= self._warmup_iter:
            sec = self._step_timer.seconds()
            self.trainer.storage.put_scalars(time=sec)
        else:
            # Still warming up: discard everything measured so far so the
            # summary in after_train excludes warmup iterations.
            self._start_time = time.perf_counter()
            self._total_timer.reset()

        # Pause until the next before_step so hook time is not accumulated.
        self._total_timer.pause()
class TestMeter(object):
    """
    Perform the multi-view ensemble for testing: each video with an unique index
    will be sampled with multiple clips, and the predictions of the clips will
    be aggregated to produce the final prediction for the video.
    The accuracy is calculated with the given ground truth labels.
    """

    def __init__(self, num_videos, num_clips, num_cls, overall_iters, isDemo):
        """
        Construct tensors to store the predictions and labels. Expect to get
        num_clips predictions from each video, and calculate the metrics on
        num_videos videos.
        Args:
            num_videos (int): number of videos to test.
            num_clips (int): number of clips sampled from each video for
                aggregating the final prediction for the video.
            num_cls (int): number of classes for each prediction.
            overall_iters (int): overall iterations for testing.
            isDemo (bool): when True, finalize_metrics also dumps a per-video
                probability array for one class to ./tmp/probability.npy.
        """
        self.iter_timer = Timer()
        self.num_clips = num_clips
        self.overall_iters = overall_iters
        # Initialize tensors.
        # Summed clip predictions per video (num_videos x num_cls).
        self.video_preds = torch.zeros((num_videos, num_cls))
        self.video_labels = torch.zeros((num_videos)).long()
        # How many clips have been aggregated for each video so far.
        self.clip_count = torch.zeros((num_videos)).long()
        # Reset metric.
        self.reset()
        self.isDemo = isDemo

    def reset(self):
        """
        Reset the metric.
        """
        self.clip_count.zero_()
        self.video_preds.zero_()
        self.video_labels.zero_()

    def update_stats(self, preds, labels, clip_ids):
        """
        Collect the predictions from the current batch and perform on-the-flight
        summation as ensemble.
        Args:
            preds (tensor): predictions from the current batch. Dimension is
                N x C where N is the batch size and C is the channel size
                (num_cls).
            labels (tensor): the corresponding labels of the current batch.
                Dimension is N.
            clip_ids (tensor): clip indexes of the current batch, dimension is
                N.
        """
        #print(preds,labels)
        for ind in range(preds.shape[0]):
            # Clip ids are assumed contiguous per video, so integer division
            # by num_clips recovers the owning video index.
            vid_id = int(clip_ids[ind]) // self.num_clips
            self.video_labels[vid_id] = labels[ind]
            self.video_preds[vid_id] += preds[ind]
            self.clip_count[vid_id] += 1

    def log_iter_stats(self, cur_iter):
        """
        Log the stats.
        Args:
            cur_iter (int): the current iteration of testing.
        """
        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        # NOTE(review): the logging call below is commented out, so this
        # method currently computes eta/stats but emits nothing.
        stats = {
            "split": "test_iter",
            "cur_iter": "{}".format(cur_iter + 1),
            #"eta": eta,
            #"time_diff": self.iter_timer.seconds(),
        }
        #logging.log_json_stats(stats)

    def iter_tic(self):
        # Start timing a test iteration.
        self.iter_timer.reset()

    def iter_toc(self):
        # Stop timing a test iteration.
        self.iter_timer.pause()

    def finalize_metrics(self, ks=(1, 2)):
        """
        Calculate and log the final ensembled metrics.
        ks (tuple): list of top-k values for topk_accuracies. For example,
            ks = (1, 5) correspods to top-1 and top-5 accuracy.
        """
        if self.isDemo:
            preds_numpy = self.video_preds.clone()
            normalize = np.array(softmax(preds_numpy.cpu().numpy()))
            # presumably class index 21 is "jogging" in this label map —
            # TODO confirm against the dataset label file.
            jogging_label = 21
            sort_p = []
            for p in normalize:
                sort_p.append(sorted(p, reverse=True))
            propability = np.transpose(
                np.array(softmax(preds_numpy.cpu().numpy())))
            for i, v in enumerate(propability[jogging_label]):
                top1_v = sort_p[i][0]
                top2_v = sort_p[i][1]
                # NOTE(review): exact float equality is used to test whether
                # the jogging probability is among the top-2 of this video —
                # fragile if values repeat; confirm intended.
                if v == top1_v or v == top2_v:
                    propability[jogging_label][
                        i] = propability[jogging_label][i] / (top1_v + top2_v)
            cwd = os.getcwd()
            tmp_dir = os.path.join(cwd, "tmp")
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            out_dir = os.path.join(tmp_dir, "probability.npy")
            np.save(out_dir, propability[jogging_label])
        # Warn when some videos received fewer/more clips than expected.
        if not all(self.clip_count == self.num_clips):
            logger.warning("clip count {} ~= num clips {}".format(
                self.clip_count, self.num_clips))
            logger.warning(self.clip_count)
        num_topks_correct = metrics.topks_correct(self.video_preds,
                                                  self.video_labels, ks)
        topks = [(x / self.video_preds.size(0)) * 100.0
                 for x in num_topks_correct]
        #binary = [
        #    (x / self.video_preds.size(0)) * 100.0 for x in binary_correct
        #]
        assert len({len(ks), len(topks)}) == 1
        # NOTE(review): stats is populated but never logged or returned here —
        # confirm whether a log_json_stats call was dropped.
        stats = {"split": "test_final"}
        for k, topk in zip(ks, topks):
            stats["top{}_acc".format(k)] = "{:.{prec}f}".format(topk, prec=2)
class AVAMeter(object):
    """
    Measure the AVA train, val, and test stats.
    """

    def __init__(self, overall_iters, cfg, mode):
        """
        overall_iters (int): the overall number of iterations of one epoch.
        cfg (CfgNode): configs.
        mode (str): `train`, `val`, or `test` mode.
        """
        self.cfg = cfg
        self.lr = None
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        # Whether val-mode evaluation should use the full ground truth
        # instead of the mini subset.
        self.full_ava_test = cfg.AVA.FULL_TEST_ON_VAL
        self.mode = mode
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        # Accumulators used in train mode (kept separate from val/test).
        self.all_preds_train = []
        self.all_ori_boxes_train = []
        self.all_metadata_train = []
        # Accumulators used in val/test mode; consumed by finalize_metrics.
        self.all_preds = []
        self.all_ori_boxes = []
        self.all_metadata = []
        self.overall_iters = overall_iters
        self.categories, self.class_whitelist = read_labelmap(
            os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.LABEL_MAP_FILE))
        gt_filename = os.path.join(cfg.AVA.ANNOTATION_DIR,
                                   cfg.AVA.GROUNDTRUTH_FILE)
        self.full_groundtruth = read_csv(gt_filename, self.class_whitelist)
        self.mini_groundtruth = get_ava_mini_groundtruth(self.full_groundtruth)
        _, self.video_idx_to_name = ava_helper.load_image_lists(
            cfg, mode == "train")
        self.output_dir = cfg.OUTPUT_DIR

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        Log the stats.
        Args:
            cur_epoch (int): the current epoch.
            cur_iter (int): the current iteration.
        """
        if (cur_iter + 1) % self.cfg.LOG_PERIOD != 0:
            return
        # ETA = seconds/iter * iterations remaining in this epoch.
        eta_sec = self.iter_timer.seconds() * (self.overall_iters - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        if self.mode == "train":
            stats = {
                "_type": "{}_iter".format(self.mode),
                "cur_epoch": "{}".format(cur_epoch + 1),
                "cur_iter": "{}".format(cur_iter + 1),
                "eta": eta,
                "dt": self.iter_timer.seconds(),
                "dt_data": self.data_timer.seconds(),
                "dt_net": self.net_timer.seconds(),
                "mode": self.mode,
                "loss": self.loss.get_win_median(),
                "lr": self.lr,
            }
        elif self.mode == "val":
            # Same as train stats but without loss/lr.
            stats = {
                "_type": "{}_iter".format(self.mode),
                "cur_epoch": "{}".format(cur_epoch + 1),
                "cur_iter": "{}".format(cur_iter + 1),
                "eta": eta,
                "dt": self.iter_timer.seconds(),
                "dt_data": self.data_timer.seconds(),
                "dt_net": self.net_timer.seconds(),
                "mode": self.mode,
            }
        elif self.mode == "test":
            # Test stats have no epoch notion.
            stats = {
                "_type": "{}_iter".format(self.mode),
                "cur_iter": "{}".format(cur_iter + 1),
                "eta": eta,
                "dt": self.iter_timer.seconds(),
                "dt_data": self.data_timer.seconds(),
                "dt_net": self.net_timer.seconds(),
                "mode": self.mode,
            }
        else:
            raise NotImplementedError("Unknown mode: {}".format(self.mode))
        logging.log_json_stats(stats)

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        # Data loading finished: stop the data timer, start the net timer.
        self.data_timer.pause()
        self.net_timer.reset()

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.all_preds = []
        self.all_ori_boxes = []
        self.all_metadata = []

    def update_stats(self, preds, ori_boxes, metadata, loss=None, lr=None):
        """
        Update the current stats.
        Args:
            preds (tensor): prediction embedding.
            ori_boxes (tensor): original boxes (x1, y1, x2, y2).
            metadata (tensor): metadata of the AVA data.
            loss (float): loss value.
            lr (float): learning rate.
        """
        if self.mode in ["val", "test"]:
            self.all_preds.append(preds)
            self.all_ori_boxes.append(ori_boxes)
            self.all_metadata.append(metadata)
        if self.mode in ["train"]:
            self.all_preds_train.append(preds)
            self.all_ori_boxes_train.append(ori_boxes)
            self.all_metadata_train.append(metadata)
        if loss is not None:
            self.loss.add_value(loss)
        if lr is not None:
            self.lr = lr

    def finalize_metrics(self, log=True):
        """
        Calculate and log the final AVA metrics.
        """
        all_preds = torch.cat(self.all_preds, dim=0)
        all_ori_boxes = torch.cat(self.all_ori_boxes, dim=0)
        all_metadata = torch.cat(self.all_metadata, dim=0)
        # Full ground truth for test (or val when FULL_TEST_ON_VAL is set),
        # the mini subset otherwise.
        if self.mode == "test" or (self.full_ava_test and self.mode == "val"):
            groundtruth = self.full_groundtruth
        else:
            groundtruth = self.mini_groundtruth
        self.full_map = evaluate_ava(
            all_preds,
            all_ori_boxes,
            all_metadata.tolist(),
            self.class_whitelist,
            self.categories,
            groundtruth=groundtruth,
            video_idx_to_name=self.video_idx_to_name,
        )
        if log:
            stats = {"mode": self.mode, "map": self.full_map}
            logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        if self.mode in ["val", "test"]:
            # Compute the mAP without double-logging; it is emitted below.
            self.finalize_metrics(log=False)
            stats = {
                "_type": "{}_epoch".format(self.mode),
                "cur_epoch": "{}".format(cur_epoch + 1),
                "mode": self.mode,
                "map": self.full_map,
                "gpu_mem": "{:.2f}G".format(misc.gpu_mem_usage()),
                "RAM": "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage()),
            }
            logging.log_json_stats(stats)
def benchmark_data_loading(cfg): """ Benchmark the speed of data loading in PySlowFast. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. setup_environment() # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # Print config. logger.info("Benchmark data loading with config:") logger.info(pprint.pformat(cfg)) timer = Timer() dataloader = loader.construct_loader(cfg, "train") logger.info("Initialize loader using {:.2f} seconds.".format( timer.seconds())) # Total batch size across different machines. batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS log_period = cfg.BENCHMARK.LOG_PERIOD epoch_times = [] # Test for a few epochs. for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): timer = Timer() timer_epoch = Timer() iter_times = [] for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): if cur_iter > 0 and cur_iter % log_period == 0: iter_times.append(timer.seconds()) ram_usage, ram_total = misc.cpu_mem_usage() logger.info( "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " "RAM Usage: {:.2f}/{:.2f} GB.".format( cur_epoch, log_period, log_period * batch_size, iter_times[-1], ram_usage, ram_total, )) timer.reset() epoch_times.append(timer_epoch.seconds()) ram_usage, ram_total = misc.cpu_mem_usage() logger.info( "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. " "RAM Usage: {:.2f}/{:.2f} GB.".format( cur_epoch, len(dataloader), len(dataloader) * batch_size, epoch_times[-1], ram_usage, ram_total, )) logger.info( "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " "(avg/std) seconds.".format( cur_epoch, log_period, log_period * batch_size, np.mean(iter_times), np.std(iter_times), )) logger.info("On average every epoch ({} videos) takes {:.2f}/{:.2f} " "(avg/std) seconds.".format( len(dataloader) * batch_size, np.mean(epoch_times), np.std(epoch_times), ))
class ValMeter(object):
    """
    Measures validation stats.

    Windowed top-1/top-5 error meters feed the per-iteration log, running
    totals give exact epoch-level error rates, and collected predictions
    and labels support mAP in the multi-label case.
    """

    def __init__(self, max_iter, cfg):
        """
        Args:
            max_iter (int): the max number of iteration of the current epoch.
            cfg (CfgNode): configs.
        """
        self._cfg = cfg
        self.max_iter = max_iter
        # Timers: whole iteration, data loading, and network forward pass.
        self.iter_timer = Timer()
        self.data_timer = Timer()
        self.net_timer = Timer()
        # Minibatch errors, smoothed over the last LOG_PERIOD values.
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Lowest errors seen over any full validation pass so far.
        self.min_top1_err = 100.0
        self.min_top5_err = 100.0
        # Weighted miss counts and number of samples seen this epoch.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.all_preds = []
        self.all_labels = []
        self.output_dir = cfg.OUTPUT_DIR

    def reset(self):
        """Clear the per-epoch state: timer, meters, and accumulators."""
        self.iter_timer.reset()
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = self.num_top5_mis = self.num_samples = 0
        self.all_preds, self.all_labels = [], []

    def iter_tic(self):
        """Begin timing an iteration and its data-loading phase."""
        self.iter_timer.reset()
        self.data_timer.reset()

    def iter_toc(self):
        """Finish timing an iteration and its network phase."""
        self.iter_timer.pause()
        self.net_timer.pause()

    def data_toc(self):
        """Data loading done: stop the data timer, start the net timer."""
        self.data_timer.pause()
        self.net_timer.reset()

    def update_stats(self, top1_err, top5_err, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate.
            top5_err (float): top5 error rate.
            mb_size (int): mini batch size.
        """
        self.mb_top1_err.add_value(top1_err)
        self.mb_top5_err.add_value(top5_err)
        # Weight per-batch rates by batch size so epoch rates are exact.
        self.num_top1_mis += top1_err * mb_size
        self.num_top5_mis += top5_err * mb_size
        self.num_samples += mb_size

    def update_predictions(self, preds, labels):
        """
        Update predictions and labels.
        Args:
            preds (tensor): model output predictions.
            labels (tensor): labels.
        """
        # TODO: merge update_prediction with update_stats.
        self.all_preds.append(preds)
        self.all_labels.append(labels)

    def log_iter_stats(self, cur_epoch, cur_iter):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
        """
        # Emit a record only every LOG_PERIOD iterations.
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        iters_remaining = self.max_iter - cur_iter - 1
        eta_sec = self.iter_timer.seconds() * iters_remaining
        stats = {"_type": "val_iter"}
        stats["epoch"] = "{}/{}".format(cur_epoch + 1,
                                        self._cfg.SOLVER.MAX_EPOCH)
        stats["iter"] = "{}/{}".format(cur_iter + 1, self.max_iter)
        stats["time_diff"] = self.iter_timer.seconds()
        stats["eta"] = str(datetime.timedelta(seconds=int(eta_sec)))
        stats["gpu_mem"] = "{:.2f}G".format(misc.gpu_mem_usage())
        # Top-k errors only apply to single-label data.
        if not self._cfg.DATA.MULTI_LABEL:
            stats["top1_err"] = self.mb_top1_err.get_win_median()
            stats["top5_err"] = self.mb_top5_err.get_win_median()
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        stats = {"_type": "val_epoch"}
        stats["epoch"] = "{}/{}".format(cur_epoch + 1,
                                        self._cfg.SOLVER.MAX_EPOCH)
        stats["time_diff"] = self.iter_timer.seconds()
        stats["gpu_mem"] = "{:.2f}G".format(misc.gpu_mem_usage())
        stats["RAM"] = "{:.2f}/{:.2f}G".format(*misc.cpu_mem_usage())
        if self._cfg.DATA.MULTI_LABEL:
            stats["map"] = get_map(
                torch.cat(self.all_preds).cpu().numpy(),
                torch.cat(self.all_labels).cpu().numpy(),
            )
        else:
            seen = self.num_samples
            top1_err = self.num_top1_mis / seen
            top5_err = self.num_top5_mis / seen
            self.min_top1_err = min(self.min_top1_err, top1_err)
            self.min_top5_err = min(self.min_top5_err, top5_err)
            stats["top1_err"] = top1_err
            stats["top5_err"] = top5_err
            stats["min_top1_err"] = self.min_top1_err
            stats["min_top5_err"] = self.min_top5_err
        logging.log_json_stats(stats)
class TrainMeter(object):
    """
    Measure training stats for a GAN-style model with a Generator (G) and a
    Discriminator (D).

    Windowed values of each loss (via ScalarMeter) feed the per-iteration
    log, while the ``*_total`` accumulators (loss weighted by mini-batch
    size) yield exact per-epoch averages in log_epoch_stats.
    """

    def __init__(self, epoch_iters, cfg):
        """
        Args:
            epoch_iters (int): the overall number of iterations of one epoch.
            cfg (CfgNode): configs; reads LOG_PERIOD and SOLVER.MAX_EPOCH.
        """
        self._cfg = cfg
        self.epoch_iters = epoch_iters
        # Total number of iterations over the whole run (for ETA).
        self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        # Windowed loss meters.
        # BUGFIX: self.loss / self.lr / self.loss_total were referenced by
        # update_stats but never initialized, raising AttributeError.
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_D = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_G = ScalarMeter(cfg.LOG_PERIOD)
        self.appe_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.flow_loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_G_three_part = ScalarMeter(cfg.LOG_PERIOD)
        # Epoch accumulators: sum of loss * mb_size.
        self.loss_total = 0.0
        self.loss_D_total = 0.0
        self.loss_G_total = 0.0
        self.appe_loss_total = 0.0
        self.flow_loss_total = 0.0
        self.loss_G_three_part_total = 0.0
        self.lr = None
        self.lr_G = None
        self.lr_D = None
        # Number of misclassified examples / samples seen.
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.num_samples_G = 0
        self.num_samples_D = 0

    def reset(self):
        """
        Reset the Meter.
        """
        self.loss.reset()
        self.loss_D.reset()
        self.loss_G.reset()
        self.appe_loss.reset()
        self.flow_loss.reset()
        self.loss_G_three_part.reset()
        self.loss_total = 0.0
        self.loss_D_total = 0.0
        self.loss_G_total = 0.0
        self.appe_loss_total = 0.0
        self.flow_loss_total = 0.0
        self.loss_G_three_part_total = 0.0
        self.lr = None
        self.lr_G = None
        self.lr_D = None
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.num_samples_D = 0
        self.num_samples_G = 0

    def iter_tic(self):
        """
        Start to record time.
        """
        self.iter_timer.reset()

    def iter_toc(self):
        """
        Stop to record time.
        """
        self.iter_timer.pause()

    def update_stats(self, top1_err, top5_err, loss, lr, mb_size):
        """
        Update the current stats.
        Args:
            top1_err (float): top1 error rate (currently unused).
            top5_err (float): top5 error rate (currently unused).
            loss (float): loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss.add_value(loss)
        self.lr = lr
        self.loss_total += loss * mb_size
        self.num_samples += mb_size

    def update_stats_G(self, loss_G, appe_loss, flow_loss, loss_G_three_part,
                       lr, mb_size):
        """
        Update the current stats of the Generator.
        Args:
            loss_G (float): generator loss value.
            appe_loss (float): appearance loss value.
            flow_loss (float): flow loss value.
            loss_G_three_part (float): combined three-part generator loss.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss_G.add_value(loss_G)
        self.appe_loss.add_value(appe_loss)
        self.flow_loss.add_value(flow_loss)
        self.loss_G_three_part.add_value(loss_G_three_part)
        # BUGFIX: lr_G was never assigned anywhere, so it was logged as None.
        self.lr_G = lr
        # BUGFIX: the three totals below used `=` instead of `+=`, so
        # log_epoch_stats divided a single mini-batch total by the cumulative
        # num_samples_G, producing wrong epoch averages.
        self.loss_G_total += loss_G * mb_size
        self.appe_loss_total += appe_loss * mb_size
        self.flow_loss_total += flow_loss * mb_size
        self.loss_G_three_part_total += loss_G_three_part * mb_size
        self.num_samples_G += mb_size

    def update_stats_D(self, loss_D, lr, mb_size):
        """
        Update the current stats of D .
        Args:
            loss_D (float): discriminator loss value.
            lr (float): learning rate.
            mb_size (int): mini batch size.
        """
        self.loss_D.add_value(loss_D)
        self.lr_D = lr
        self.loss_D_total += loss_D * mb_size
        self.num_samples_D += mb_size

    def log_iter_stats(self, cur_epoch, cur_iter, mode):
        """
        log the stats of the current iteration.
        Args:
            cur_epoch (int): the number of current epoch.
            cur_iter (int): the number of current iteration.
            mode (str): "D"/"Discriminator" or "G"/"Generator".
        """
        if (cur_iter + 1) % self._cfg.LOG_PERIOD != 0:
            return
        # ETA = seconds/iter * iterations left in the whole run.
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1))
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        # stats in D or G
        if mode in ["D", "Discriminator"]:
            stats = {
                "_type": "train_iter",
                "epoch": "{}/{}".format(cur_epoch + 1,
                                        self._cfg.SOLVER.MAX_EPOCH),
                "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
                "time_diff": self.iter_timer.seconds(),
                "eta": eta,
                "loss_D": self.loss_D.get_win_median(),
                "lr_D": self.lr_D,
                "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
            }
        elif mode in ["G", "Generator"]:
            stats = {
                "_type": "train_iter",
                "epoch": "{}/{}".format(cur_epoch + 1,
                                        self._cfg.SOLVER.MAX_EPOCH),
                "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
                "time_diff": self.iter_timer.seconds(),
                "eta": eta,
                "loss_G": self.loss_G.get_win_median(),
                "appe_loss": self.appe_loss.get_win_median(),
                "flow_loss": self.flow_loss.get_win_median(),
                "three_part_loss_G": self.loss_G_three_part.get_win_median(),
                "lr_G": self.lr_G,
                "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
            }
        else:
            raise NotImplementedError("Does not support state")
        logging.log_json_stats(stats)

    def log_epoch_stats(self, cur_epoch):
        """
        Log the stats of the current epoch.
        Args:
            cur_epoch (int): the number of current epoch.
        """
        eta_sec = self.iter_timer.seconds() * (
            self.MAX_EPOCH - (cur_epoch + 1) * self.epoch_iters)
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        # Exact epoch averages: accumulated weighted totals / sample counts.
        stats = {
            "_type": "train_epoch",
            "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
            "time_diff": self.iter_timer.seconds(),
            "eta": eta,
            "lr_D": self.lr_D,
            "loss_D": self.loss_D_total / self.num_samples_D,
            "lr_G": self.lr_G,
            "loss_G": self.loss_G_total / self.num_samples_G,
            "appe_loss": self.appe_loss_total / self.num_samples_G,
            "flow_loss": self.flow_loss_total / self.num_samples_G,
            "total_G_loss": self.loss_G_three_part_total / self.num_samples_G,
            "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
            "RAM": "{:.2f}/{:.2f} GB".format(*misc.cpu_mem_usage()),
        }
        logging.log_json_stats(stats)