def inference(cfg): # # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=False) cu.load_test_checkpoint(cfg, model) # Create video loaders. video_loader = loader.construct_loader(cfg, "test") # Create saver saver = Saver(cfg.DATA.PATH_TO_DATA_DIR, video_loader.dataset) model.eval() for i, (inputs, index) in tqdm(enumerate(video_loader), total=len(video_loader)): for i in range(len(inputs)): inputs[i] = inputs[i].cuda(non_blocking=True) index = index.cuda() feats = model(inputs) # Gather all the predictions across all the devices to perform ensemble. if cfg.NUM_GPUS > 1: feats, index = du.all_gather([feats, index]) saver.save(feats, index) saver.merge()
def saveOnnxModel(self): model = build_model(self.cfg) optimizer = optim.construct_optimizer(model, self.cfg) start_epoch = cu.load_train_checkpoint(self.cfg, model, optimizer, self.logger) self.cfg.TRAIN['BATCH_SIZE'] = self.cfg.ONNX.BATCH_SIZE dl = loader.construct_loader(self.cfg, "train") inputs, labels, _, _ = next(iter(dl)) if isinstance(inputs, (list, )): for i in range(len(inputs)): inputs[i].to(self.onnxDevice) model.to(torch.device(self.onnxDevice)) model.eval() onnxPath, _ = self.getOnnxModelPath() with torch.no_grad(): torch.onnx.export( model, inputs, onnxPath, opset_version=self.cfg.ONNX.OPSET_VER, verbose=True, input_names=self.cfg.ONNX.INPUT_NAMES, output_names=self.cfg.ONNX.OUTPUT_NAMES, ) self.logger.info("Exported {}".format(onnxPath))
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=False) cu.load_test_checkpoint(cfg, model) # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0 test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( test_loader.dataset.num_videos % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0) # Create meters for multi-view testing. test_meter = TestMeter( test_loader.dataset.num_videos // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), cfg.DATA.MULTI_LABEL, cfg.DATA.ENSEMBLE_METHOD, ) # Set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None # # Perform multi-view test on the entire dataset. test_meter = perform_test(test_loader, model, test_meter, cfg, writer) if writer is not None: writer.close()
def test_from_train(model, cfg, cnt=-1): test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( len(test_loader.dataset) % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0 ) # Create meters for multi-view testing. if cfg.TEST.DATASET == 'epickitchens': test_meter = EPICTestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) else: test_meter = TestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) # # Perform multi-view test on the entire dataset. scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) filename_root = cfg.EPICKITCHENS.TEST_LIST.split('.')[0] if cnt >= 0: file_name = '{}_{}_{}.pkl'.format(filename_root, cnt, cfg.MODEL.MODEL_NAME) else: file_name = '{}_{}_{}.pkl'.format(filename_root, 'test_only', cfg.MODEL.MODEL_NAME) file_path = os.path.join(scores_path, file_name) logger.info(file_path) pickle.dump([], open(file_path, 'wb+')) preds, labels, metadata = perform_test(test_loader, model, test_meter, cfg) if du.is_master_proc(): if cfg.TEST.DATASET == 'epickitchens': results = {'verb_output': preds[0], 'noun_output': preds[1], 'verb_gt': labels[0], 'noun_gt': labels[1], 'narration_id': metadata} scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) pickle.dump(results, open(file_path, 'wb'))
def build_trainer(cfg): """ Build training model and its associated tools, including optimizer, dataloaders and meters. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py Returns: model (nn.Module): training model. optimizer (Optimizer): optimizer. train_loader (DataLoader): training data loader. val_loader (DataLoader): validatoin data loader. precise_bn_loader (DataLoader): training data loader for computing precise BN. train_meter (TrainMeter): tool for measuring training stats. val_meter (ValMeter): tool for measuring validation stats. """ # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=True) # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Create the video train and val loaders. train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") precise_bn_loader = loader.construct_loader(cfg, "train", is_precise_bn=True) # Create meters. train_meter = TrainMeter(len(train_loader), cfg) val_meter = ValMeter(len(val_loader), cfg) return ( model, optimizer, train_loader, val_loader, precise_bn_loader, train_meter, val_meter, )
def train(): # Create model. model = build_model(configs.backbone) # Construct the optimizer. optimizer = torch.optim.SGD(model.parameters(), 0.1, 0.9, weight_decay=0.0001, nesterov=True) start_epoch = cu.load(model, optimizer) # Create the video train and val loaders. train_loader = loader.construct_loader('train') val_loader = loader.construct_loader('val') # Train. for epoch in range(start_epoch, configs.max_epoch): train_epoch(train_loader, model, optimizer, epoch) eval_epoch(val_loader, model, epoch) cu.save(model, optimizer, epoch)
def visualize(cfg): """ Perform layer weights and activations visualization on the model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ if cfg.TENSORBOARD.ENABLE and cfg.TENSORBOARD.MODEL_VIS.ENABLE: # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Model Visualization with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, is_train=False) cu.load_test_checkpoint(cfg, model) # Create video testing loaders. vis_loader = loader.construct_loader(cfg, "test") logger.info( "Visualize model for {} data points".format(len(vis_loader)) ) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE # Set up writer for logging to Tensorboard format. if du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None # Run visualization on the model run_visualization(vis_loader, model, cfg, writer) if writer is not None: writer.close()
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) cu.load_test_checkpoint(cfg, model) # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) # Create meters for loss tracking test_meter = TrainMeter(test_loader.dataset.num_videos, cfg) # Set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS ): writer = tb.TensorboardWriter(cfg) else: writer = None # # Perform multi-view test on the entire dataset. test_meter = perform_test(test_loader, model, test_meter, cfg, writer) if writer is not None: writer.close()
def infer(cfg): # Setup logging format. logging.setup_logging() # Print config. logger.info("Infer with config:") logger.info(cfg) # Build the SlowFast model and print its statistics model = build_model(cfg) if du.is_master_proc(): misc.log_model_info(model, cfg, is_train=False) # load weights if cfg.INFERENCE.WEIGHTS_FILE_PATH != "": cu.load_checkpoint(cfg.INFERENCE.WEIGHTS_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.INFERENCE.WEIGHTS_TYPE == "caffe2") else: raise FileNotFoundError("Model weights file could not be found") inference_loader = loader.construct_loader(cfg, "inference") perform_inference(inference_loader, model, cfg)
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) out_str_prefix = "lin" if cfg.MODEL.DETACH_FINAL_FC else "" if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=False) if (cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel" and cfg.CONTRASTIVE.KNN_ON): train_loader = loader.construct_loader(cfg, "train") out_str_prefix = "knn" if hasattr(model, "module"): model.module.init_knn_labels(train_loader) else: model.init_knn_labels(train_loader) cu.load_test_checkpoint(cfg, model) # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0 test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( test_loader.dataset.num_videos % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0) # Create meters for multi-view testing. test_meter = TestMeter( test_loader.dataset.num_videos // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES if not cfg.TASK == "ssl" else cfg.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM, len(test_loader), cfg.DATA.MULTI_LABEL, cfg.DATA.ENSEMBLE_METHOD, ) # Set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None # # Perform multi-view test on the entire dataset. test_meter = perform_test(test_loader, model, test_meter, cfg, writer) if writer is not None: writer.close() result_string = ( "_a{}{}{} Top1 Acc: {} Top5 Acc: {} MEM: {:.2f} dataset: {}{}" "".format( out_str_prefix, cfg.TEST.DATASET[0], test_meter.stats["top1_acc"], test_meter.stats["top1_acc"], test_meter.stats["top5_acc"], misc.gpu_mem_usage(), cfg.TEST.DATASET[0], cfg.MODEL.NUM_CLASSES, )) logger.info("testing done: {}".format(result_string)) return result_string
def train(cfg): """ Train a video model for many epochs on train set and evaluate it on val set. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg) # Print config. logger.info("Train with config:") logger.info(pprint.pformat(cfg)) if du.get_rank()==0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS): writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR) else: writer = None if du.get_rank()==0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS) and not cfg.DEBUG: tags = [] if 'TAGS' in cfg and cfg.TAGS !=[]: tags=list(cfg.TAGS) neptune.set_project('Serre-Lab/motion') ###################### overrides = sys.argv[1:] overrides_dict = {} for i in range(len(overrides)//2): overrides_dict[overrides[2*i]] = overrides[2*i+1] overrides_dict['dir'] = cfg.OUTPUT_DIR ###################### if 'NEP_ID' in cfg and cfg.NEP_ID != "": session = Session() project = session.get_project(project_qualified_name='Serre-Lab/motion') nep_experiment = project.get_experiments(id=cfg.NEP_ID)[0] else: nep_experiment = neptune.create_experiment (name=cfg.NAME, params=overrides_dict, tags=tags) else: nep_experiment=None # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc(num_gpus=cfg.NUM_GPUS): misc.log_model_info(model, cfg, is_train=True) # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Load a checkpoint to resume training if applicable. if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR): logger.info("Load from last checkpoint.") last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) checkpoint_epoch = cu.load_checkpoint( last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer ) start_epoch = checkpoint_epoch + 1 elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": logger.info("Load from given checkpoint file.") checkpoint_epoch = cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, optimizer, inflation=cfg.TRAIN.CHECKPOINT_INFLATE, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) start_epoch = checkpoint_epoch + 1 else: start_epoch = 0 # Create the video train and val loaders. train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") # Create meters. if cfg.DETECTION.ENABLE: train_meter = AVAMeter(len(train_loader), cfg, mode="train") val_meter = AVAMeter(len(val_loader), cfg, mode="val") else: train_meter = TrainMeter(len(train_loader), cfg) val_meter = ValMeter(len(val_loader), cfg) # Perform the training loop. logger.info("Start epoch: {}".format(start_epoch + 1)) for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH): # Shuffle the dataset. loader.shuffle_dataset(train_loader, cur_epoch) # Train for one epoch. train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, writer, nep_experiment, cfg) # Compute precise BN stats. # if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0: # calculate_and_update_precise_bn( # train_loader, model, cfg.BN.NUM_BATCHES_PRECISE # ) # Save a checkpoint. if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD): cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg) # Evaluate the model on validation set. if misc.is_eval_epoch(cfg, cur_epoch): eval_epoch(val_loader, model, val_meter, cur_epoch, nep_experiment, cfg) if du.get_rank()==0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS) and not cfg.DEBUG: nep_experiment.log_metric('epoch', cur_epoch)
writer = None # # Perform multi-view test on the entire dataset. perform_test(test_loader, model, test_meter, cfg, writer) if writer is not None: writer.close() from slowfast.utils.misc import launch_job from slowfast.utils.parser import load_config, parse_args import numpy as np import torch import slowfast.utils.checkpoint as cu import slowfast.utils.distributed as du import slowfast.utils.logging as logging import slowfast.utils.misc as misc import slowfast.visualization.tensorboard_vis as tb from slowfast.datasets import loader from slowfast.models import build_model from slowfast.utils.meters import AVAMeter, TestMeter logger = logging.get_logger(__name__) args = parse_args() cfg = load_config(args) test_loader = loader.construct_loader(cfg, "test")
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, is_train=False) # Load a checkpoint to test if applicable. if cfg.TEST.CHECKPOINT_FILE_PATH != "": cu.load_checkpoint( cfg.TEST.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2", ) elif cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1) elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current # checkpoint folder, try to load checkpoint from # TRAIN.CHECKPOINT_FILE_PATH and test it. cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) else: # raise NotImplementedError("Unknown way to load checkpoint.") logger.info("Testing with random initialization. Only for debugging.") # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) assert (len(test_loader.dataset) % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0) # Create meters for multi-view testing. test_meter = TestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), cfg.DATA.MULTI_LABEL, cfg.DATA.ENSEMBLE_METHOD, ) # Set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None # # Perform multi-view test on the entire dataset. perform_test(test_loader, model, test_meter, cfg, writer) if writer is not None: writer.close()
def benchmark_data_loading(cfg): """ Benchmark the speed of data loading in PySlowFast. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. setup_environment() # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # Print config. logger.info("Benchmark data loading with config:") logger.info(pprint.pformat(cfg)) timer = Timer() dataloader = loader.construct_loader(cfg, "train") logger.info("Initialize loader using {:.2f} seconds.".format( timer.seconds())) # Total batch size across different machines. batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS log_period = cfg.BENCHMARK.LOG_PERIOD epoch_times = [] # Test for a few epochs. for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): timer = Timer() timer_epoch = Timer() iter_times = [] for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): if cur_iter > 0 and cur_iter % log_period == 0: iter_times.append(timer.seconds()) ram_usage, ram_total = misc.cpu_mem_usage() logger.info( "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " "RAM Usage: {:.2f}/{:.2f} GB.".format( cur_epoch, log_period, log_period * batch_size, iter_times[-1], ram_usage, ram_total, )) timer.reset() epoch_times.append(timer_epoch.seconds()) ram_usage, ram_total = misc.cpu_mem_usage() logger.info( "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. " "RAM Usage: {:.2f}/{:.2f} GB.".format( cur_epoch, len(dataloader), len(dataloader) * batch_size, epoch_times[-1], ram_usage, ram_total, )) logger.info( "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " "(avg/std) seconds.".format( cur_epoch, log_period, log_period * batch_size, np.mean(iter_times), np.std(iter_times), )) logger.info("On average every epoch ({} videos) takes {:.2f}/{:.2f} " "(avg/std) seconds.".format( len(dataloader) * batch_size, np.mean(epoch_times), np.std(epoch_times), ))
def test(cfg, cnt=-1): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # # Perform multi-view test on the entire dataset. scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) filename_root = cfg.EPICKITCHENS.TEST_LIST.split('.')[0] if cnt >= 0: file_name = '{}_{}_{}.pkl'.format(filename_root, cnt, cfg.MODEL.MODEL_NAME) else: file_name = '{}_{}_{}.pkl'.format(filename_root, 'test_only', cfg.MODEL.MODEL_NAME) file_path = os.path.join(scores_path, file_name) logger.info(file_path) # Print config. # if cnt < 0: # logger.info("Test with config:") # logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) if cfg.EPICKITCHENS.USE_BBOX: model.module.load_weight_slowfast() # if du.is_master_proc(): # misc.log_model_info(model, cfg, is_train=False) # Load a checkpoint to test if applicable. if cfg.TEST.CHECKPOINT_FILE_PATH != "": logger.info("Load from given checkpoint file.") cu.load_checkpoint( cfg.TEST.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2", ) elif cu.has_checkpoint(cfg.OUTPUT_DIR): logger.info("Load from last checkpoint.") last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1) elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current # checkpoint folder, try to load checkpint from # TRAIN.CHECKPOINT_FILE_PATH and test it. cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) else: # raise NotImplementedError("Unknown way to load checkpoint.") logger.info("Testing with random initialization. Only for debugging.") # Create video testing loaders. if cfg.TEST.EXTRACT_FEATURES_MODE != "" and cfg.TEST.EXTRACT_FEATURES_MODE in ["test","train","val"]: test_loader = loader.construct_loader(cfg, cfg.TEST.EXTRACT_FEATURES_MODE) else: test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( len(test_loader.dataset) % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0 ) # Create meters for multi-view testing. if cfg.TEST.DATASET == 'epickitchens': test_meter = EPICTestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) else: test_meter = TestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) pickle.dump([], open(file_path, 'wb+')) if cfg.TEST.EXTRACT_FEATURES: preds, labels, metadata, x_feat_list = perform_test(test_loader, model, test_meter, cfg) else: preds, labels, metadata = perform_test(test_loader, model, test_meter, cfg) if du.is_master_proc(): if cfg.TEST.DATASET == 'epickitchens': results = {'verb_output': preds[0], 'noun_output': preds[1], 'verb_gt': labels[0], 'noun_gt': labels[1], 'narration_id': metadata} scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) pickle.dump(results, open(file_path, 'wb')) if cfg.TEST.EXTRACT_FEATURES: pid = cfg.EPICKITCHENS.FEATURE_VID.split("_")[0] if not os.path.exists(os.path.join(cfg.TEST.EXTRACT_FEATURES_PATH, pid)): os.mkdir(os.path.join(cfg.TEST.EXTRACT_FEATURES_PATH, pid)) if not cfg.TEST.EXTRACT_MSTCN_FEATURES and cfg.TEST.EXTRACT_FEATURES: arr_slow = torch.cat(x_feat_list[0], dim=0).numpy() arr_fast = torch.cat(x_feat_list[1], dim=0).numpy() print(arr_slow.shape, arr_fast.shape) fpath_feat = os.path.join(cfg.TEST.EXTRACT_FEATURES_PATH, pid, '{}.pkl'.format(cfg.EPICKITCHENS.FEATURE_VID)) with open(fpath_feat,'wb+') as f: pickle.dump([arr_slow, arr_fast], f) elif cfg.TEST.EXTRACT_MSTCN_FEATURES and cfg.TEST.EXTRACT_FEATURES: fpath_feat = os.path.join(cfg.TEST.EXTRACT_FEATURES_PATH, pid, '{}.npy'.format(cfg.EPICKITCHENS.FEATURE_VID)) with open(fpath_feat,'wb+') as f: arr = torch.cat(x_feat_list, dim=0).numpy() print(arr.shape) np.save(f, arr)
def feature_extract(kwargs): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ cfg = kwargs.pop('cfg') path_to_video_list = kwargs.pop('path_to_video_list') path_to_video_dir = kwargs.pop('path_to_video_dir') path_to_feat_dir = kwargs.pop('path_to_feat_dir') num_features = kwargs.pop('num_features') if not os.path.isdir(path_to_feat_dir): os.makedirs(path_to_feat_dir) # Setup logging format. logging.setup_logging() # Print config. logger.info("Extract feature with config:") logger.info(cfg) # Build the video model and print model statistics. model = model_builder.build_model(cfg, feature_extraction=True) if du.is_master_proc(): misc.log_model_info(model) # Load a checkpoint to test if applicable. if cfg.TEST.CHECKPOINT_FILE_PATH != "": cu.load_checkpoint( cfg.TEST.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2", ) elif cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1) elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current # checkpoint folder, try to load checkpint from # TRAIN.CHECKPOINT_FILE_PATH and test it. cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) else: # raise NotImplementedError("Unknown way to load checkpoint.") logger.info("Extracting features with random initialization. Only for debugging.") feature_extract_fn = perform_bbox_feature_extract if cfg.DETECTION.ENABLE else perform_feature_extract # Create video feature extraction loaders. if osp.isfile(path_to_video_list): path_to_videos = [] with open(path_to_video_list, 'r') as f: video_list = json.load(f) for video_name in video_list: path_to_video = osp.join(path_to_video_dir, video_name) if osp.isfile(path_to_video): path_to_videos.append(path_to_video) else: path_to_videos = glob.glob(osp.join(path_to_video_dir, '*')) for video_idx, path_to_video in enumerate(tqdm(path_to_videos)): path_to_feature = osp.join(path_to_feat_dir, osp.splitext(osp.basename(path_to_video))[0] + '.json') if osp.isfile(path_to_feature) and json.load(open(path_to_feature))['num_features'] == num_features: continue video_extraction_loader = loader.construct_loader(cfg, path_to_video) video_data = { 'num_features': 0, 'video_features': {} } if len(video_extraction_loader) > 0: video_features = feature_extract_fn(video_extraction_loader, model, cfg) assert all([feature is not None for feature in video_features]), 'Missing some features !' video_data['num_features'] = len(video_features) video_data['video_features'] = video_features if video_data['num_features'] != num_features: print('Warning! Video %s has %d features.' % (path_to_video, video_data['num_features'])) with open(path_to_feature, 'w') as f: json.dump(video_data, f)
def train(cfg): """ Train a video model for many epochs on train set and evaluate it on val set. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Init multigrid. multigrid = None if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE: multigrid = MultigridSchedule() cfg = multigrid.init_multigrid(cfg) if cfg.MULTIGRID.LONG_CYCLE: cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0) # Print config. logger.info("Train with config:") logger.info(pprint.pformat(cfg)) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=True) # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Create a GradScaler for mixed precision training scaler = torch.cuda.amp.GradScaler(enabled=cfg.TRAIN.MIXED_PRECISION) # Load a checkpoint to resume training if applicable. if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR): logger.info("Load from last checkpoint.") last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR, task=cfg.TASK) if last_checkpoint is not None: checkpoint_epoch = cu.load_checkpoint( last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer, scaler if cfg.TRAIN.MIXED_PRECISION else None, ) start_epoch = checkpoint_epoch + 1 elif "ssl_eval" in cfg.TASK: last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR, task="ssl") checkpoint_epoch = cu.load_checkpoint( last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer, scaler if cfg.TRAIN.MIXED_PRECISION else None, epoch_reset=True, clear_name_pattern=cfg.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN, ) start_epoch = checkpoint_epoch + 1 else: start_epoch = 0 elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": logger.info("Load from given checkpoint file.") checkpoint_epoch = cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, optimizer, scaler if cfg.TRAIN.MIXED_PRECISION else None, inflation=cfg.TRAIN.CHECKPOINT_INFLATE, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", epoch_reset=cfg.TRAIN.CHECKPOINT_EPOCH_RESET, clear_name_pattern=cfg.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN, ) start_epoch = checkpoint_epoch + 1 else: start_epoch = 0 # Create the video train and val loaders. train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") precise_bn_loader = (loader.construct_loader( cfg, "train", is_precise_bn=True) if cfg.BN.USE_PRECISE_STATS else None) # if ( # cfg.TASK == "ssl" # and cfg.MODEL.MODEL_NAME == "ContrastiveModel" # and cfg.CONTRASTIVE.KNN_ON # ): # if hasattr(model, "module"): # model.module.init_knn_labels(train_loader) # else: # model.init_knn_labels(train_loader) # Create meters. if cfg.DETECTION.ENABLE: train_meter = AVAMeter(len(train_loader), cfg, mode="train") val_meter = AVAMeter(len(val_loader), cfg, mode="val") else: train_meter = TrainMeter(1e6, cfg) val_meter = ValMeter(1e6, cfg) # set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None # Perform the training loop. logger.info("Start epoch: {}".format(start_epoch + 1)) epoch_timer = EpochTimer() for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH): if cur_epoch > 0 and cfg.DATA.LOADER_CHUNK_SIZE > 0: num_chunks = math.ceil(cfg.DATA.LOADER_CHUNK_OVERALL_SIZE / cfg.DATA.LOADER_CHUNK_SIZE) skip_rows = (cur_epoch) % num_chunks * cfg.DATA.LOADER_CHUNK_SIZE logger.info( f"=================+++ num_chunks {num_chunks} skip_rows {skip_rows}" ) cfg.DATA.SKIP_ROWS = skip_rows logger.info(f"|===========| skip_rows {skip_rows}") train_loader = loader.construct_loader(cfg, "train") loader.shuffle_dataset(train_loader, cur_epoch) if cfg.MULTIGRID.LONG_CYCLE: cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch) if changed: ( model, optimizer, train_loader, val_loader, precise_bn_loader, train_meter, val_meter, ) = build_trainer(cfg) # Load checkpoint. if cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR, task=cfg.TASK) assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint else: last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH logger.info("Load from {}".format(last_checkpoint)) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer) # Shuffle the dataset. loader.shuffle_dataset(train_loader, cur_epoch) if hasattr(train_loader.dataset, "_set_epoch_num"): train_loader.dataset._set_epoch_num(cur_epoch) # Train for one epoch. epoch_timer.epoch_tic() train_epoch( train_loader, model, optimizer, scaler, train_meter, cur_epoch, cfg, writer, ) epoch_timer.epoch_toc() logger.info( f"Epoch {cur_epoch} takes {epoch_timer.last_epoch_time():.2f}s. Epochs " f"from {start_epoch} to {cur_epoch} take " f"{epoch_timer.avg_epoch_time():.2f}s in average and " f"{epoch_timer.median_epoch_time():.2f}s in median.") logger.info( f"For epoch {cur_epoch}, each iteraction takes " f"{epoch_timer.last_epoch_time()/len(train_loader):.2f}s in average. " f"From epoch {start_epoch} to {cur_epoch}, each iteraction takes " f"{epoch_timer.avg_epoch_time()/len(train_loader):.2f}s in average." ) is_checkp_epoch = (cu.is_checkpoint_epoch( cfg, cur_epoch, None if multigrid is None else multigrid.schedule, ) or cur_epoch == cfg.SOLVER.MAX_EPOCH - 1) is_eval_epoch = misc.is_eval_epoch( cfg, cur_epoch, None if multigrid is None else multigrid.schedule) # Compute precise BN stats. if ((is_checkp_epoch or is_eval_epoch) and cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0): calculate_and_update_precise_bn( precise_bn_loader, model, min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)), cfg.NUM_GPUS > 0, ) _ = misc.aggregate_sub_bn_stats(model) # Save a checkpoint. if is_checkp_epoch: cu.save_checkpoint( cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg, scaler if cfg.TRAIN.MIXED_PRECISION else None, ) # Evaluate the model on validation set. if is_eval_epoch: eval_epoch( val_loader, model, val_meter, cur_epoch, cfg, train_loader, writer, ) if writer is not None: writer.close() result_string = "Top1 Acc: {:.2f} Top5 Acc: {:.2f} MEM: {:.2f}" "".format( 100 - val_meter.min_top1_err, 100 - val_meter.min_top5_err, misc.gpu_mem_usage(), ) logger.info("training done: {}".format(result_string)) return result_string
def evalOnnxOne(self, dataSet): self.cfg.TRAIN['BATCH_SIZE'] = 1 dl = loader.construct_loader(self.cfg, dataSet) meter = ValMeter(len(dl), self.cfg, self.logger) totTime = 0 results = [] predsAll = [] labelsAll = [] GB = 1024**3 meta = {'id': ['s'] * 100} sess, inNames, outNames = self.loadOnnx() for cur_iter, (frames, labels, idx, meta) in enumerate(dl): # if sess is None or cur_iter % 50000 == 0: # if sess is not None: # del sess # sess, inNames, outNames = self.loadOnnx() startTime = time.time() meter.iter_tic() idx = torch.tensor(cur_iter) # assert isinstance(frames, (list,)), 'Expect a list as input' # assert len(inNames) == len(frames), "Num Onnx inputs {} [{}] Data has {}".format(len(inNames), inNames, len(frames)) inputs = { n: d.numpy().astype(np.float32) for n, d in zip(inNames, frames) } pred = sess.run([outNames], inputs).pop().squeeze() # extract the track start and end from meta data # m = re.search('.*tr_([\d]+)_st_([\d]+)_end_([\d]+).pt', meta['id'][0]) # tr, st, end = [int(x) for x in m.groups()] meter.iter_toc() results.append((meta['id'], pred, labels)) if np.isnan(pred).any(): self.logger.info("NAN Iter {} Idx {} {} {}".format( cur_iter, idx.item(), meta, pred)) continue if labels != 0.0 and labels != 1.0: self.logger.info( "Bad Label Iter {} Idx {} {} Label {}".format( cur_iter, idx.item(), meta, labels)) continue # num_topks_correct = metrics.topks_correct(torch.tensor(pred), labels, (1, min(5, self.cfg.MODEL.NUM_CLASSES))) # top1_err, _ = [(1.0 - x / pred.size(0)) * 100.0 for x in num_topks_correct ] meter.update_stats(0, 0, 1) meter.update_predictions(pred, labels) labels = labels.numpy().astype(np.float32)[0] predsAll.append(pred[-1]) labelsAll.append(labels) meter.log_iter_stats(0, cur_iter, predsAll, labelsAll) totTime += time.time() - startTime if len(results) % self.cfg.LOG_PERIOD == 0: c = len(results) mm = psutil.virtual_memory() ss = psutil.swap_memory() self.logger.info( "{} {} eTime {:.4f} sec avg {:.4f} sec CPU {} used GB {:.3f} AVAIL {:.3f} FREE {:.3f} SWAP Tot {:.3f} SWAP USED {:.3f} SWAP Free {:.3f} " .format( dataSet, c, totTime, totTime / max(c, 1), psutil.cpu_percent(), mm.used / GB, mm.available / GB, mm.free / GB, ss.total / GB, ss.used / GB, ss.free / GB, )) c = len(results) eTime = totTime / max(c, 1) self.logger.info("{} {} eTime {} sec avg {} sec".format( dataSet, c, totTime, eTime)) meter.log_epoch_stats(0, predsAll, labelsAll) if len(self.cfg.ONNX.SAVE_PREDS_PATH) > 0: df = pd.DataFrame(results, columns=['clip', 'pred', 'label']) dfPath = createFullPathTree( self.cfg.OUTPUT_DIR, addFilenameSuffix(self.cfg.ONNX.SAVE_PREDS_PATH, dataSet)) ensureDir(os.path.dirname(dfPath)) self.logger.info("Saving predictions results {}".format(dfPath)) df.to_pickle(dfPath) if self.cfg.ONNX.PREC_RECALL_PLOT: precision, recall, thresholds = precision_recall_curve( labelsAll, predsAll) self.plotPrCurve( dataSet, 'PR AUC', recall, precision, xLabel='Recall', yLabel='Precision', ) if self.cfg.ONNX.ROC_PLOT: fpr, tpr, rocThresholds = roc_curve(labelsAll, predsAll) self.plotPrCurve(dataSet, 'ROC AUC', fpr, tpr, xLabel='FPR', yLabel='TPR')
def train(cfg): """ Train a video model for many epochs on train set and evaluate it on val set. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # Print config. logger.info("Train with config:") logger.info(pprint.pformat(cfg)) # Build the video model and print model statistics. model = model_builder.build_model(cfg) if du.is_master_proc(): misc.log_model_info(model) # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Load a checkpoint to resume training if applicable. if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR): logger.info("Load from last checkpoint.") last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer) start_epoch = checkpoint_epoch + 1 elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": logger.info("Load from given checkpoint file.") checkpoint_epoch = cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, optimizer, inflation=cfg.TRAIN.CHECKPOINT_INFLATE, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) start_epoch = checkpoint_epoch + 1 else: start_epoch = 0 # Create the video train and val loaders. train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") # Create meters. if cfg.DETECTION.ENABLE: train_meter = AVAMeter(len(train_loader), cfg, mode="train") val_meter = AVAMeter(len(val_loader), cfg, mode="val") else: train_meter = TrainMeter(len(train_loader), cfg) val_meter = ValMeter(len(val_loader), cfg) # Perform the training loop. logger.info("Start epoch: {}".format(start_epoch + 1)) for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH): # Shuffle the dataset. loader.shuffle_dataset(train_loader, cur_epoch) # Train for one epoch. train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg) # Compute precise BN stats. if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0: calculate_and_update_precise_bn(train_loader, model, cfg.BN.NUM_BATCHES_PRECISE) # Save a checkpoint. if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD): cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg) # Evaluate the model on validation set. if misc.is_eval_epoch(cfg, cur_epoch): eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
def train(cfg): """ Train an audio model for many epochs on train set and evaluate it on val set. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Train with config:") logger.info(pprint.pformat(cfg)) # Build the audio model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg) if cfg.BN.FREEZE: model.module.freeze_fn( 'bn_parameters') if cfg.NUM_GPUS > 1 else model.freeze_fn( 'bn_parameters') # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Load a checkpoint to resume training if applicable. start_epoch = cu.load_train_checkpoint(cfg, model, optimizer) # Create the audio train and val loaders. if cfg.TRAIN.DATASET != 'epickitchens' or not cfg.EPICKITCHENS.TRAIN_PLUS_VAL: train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") precise_bn_loader = (loader.construct_loader(cfg, "train") if cfg.BN.USE_PRECISE_STATS else None) else: train_loader = loader.construct_loader(cfg, "train+val") val_loader = loader.construct_loader(cfg, "val") precise_bn_loader = (loader.construct_loader(cfg, "train+val") if cfg.BN.USE_PRECISE_STATS else None) # Create meters. if cfg.TRAIN.DATASET == 'epickitchens': train_meter = EPICTrainMeter(len(train_loader), cfg) val_meter = EPICValMeter(len(val_loader), cfg) else: train_meter = TrainMeter(len(train_loader), cfg) val_meter = ValMeter(len(val_loader), cfg) # set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None if cfg.WANDB.ENABLE and du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS): wandb_log = True if cfg.TRAIN.AUTO_RESUME and cfg.WANDB.RUN_ID != "": wandb.init(project='slowfast', config=cfg, sync_tensorboard=True, resume=cfg.WANDB.RUN_ID) else: wandb.init(project='slowfast', config=cfg, sync_tensorboard=True) wandb.watch(model) else: wandb_log = False # Perform the training loop. logger.info("Start epoch: {}".format(start_epoch + 1)) for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH): # Shuffle the dataset. loader.shuffle_dataset(train_loader, cur_epoch) # Train for one epoch. train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer, wandb_log) is_checkp_epoch = cu.is_checkpoint_epoch( cfg, cur_epoch, ) is_eval_epoch = misc.is_eval_epoch( cfg, cur_epoch, ) # Compute precise BN stats. if ((is_checkp_epoch or is_eval_epoch) and cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0): calculate_and_update_precise_bn( precise_bn_loader, model, min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)), cfg.NUM_GPUS > 0, ) _ = misc.aggregate_sub_bn_stats(model) # Save a checkpoint. if is_checkp_epoch: cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg) # Evaluate the model on validation set. if is_eval_epoch: is_best_epoch, _ = eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer, wandb_log) if is_best_epoch: cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg, is_best_epoch=is_best_epoch) if writer is not None: writer.close()
def benchmark_data(cfg): # Set up environment. setup_environment() # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # Print config. logger.info("Benchmark data loading with config:") logger.info(pprint.pformat(cfg)) timer = Timer() dataloader = loader.construct_loader(cfg, "train") logger.info("Initialize loader using {:.2f} seconds.".format( timer.seconds())) batch_size = cfg.TRAIN.BATCH_SIZE log_period = cfg.BENCHMARK.LOG_PERIOD epoch_times = [] # Test for a few epochs. for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): timer = Timer() timer_epoch = Timer() iter_times = [] for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): if cur_iter > 0 and cur_iter % log_period == 0: iter_times.append(timer.seconds()) vram = psutil.virtual_memory() logger.info( "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " "RAM Usage: {:.2f}/{:.2f} GB.".format( cur_epoch, log_period, log_period * batch_size, iter_times[-1], (vram.total - vram.available) / 1024**3, vram.total / 1024**3, )) timer.reset() epoch_times.append(timer_epoch.seconds()) vram = psutil.virtual_memory() logger.info( "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. " "RAM Usage: {:.2f}/{:.2f} GB.".format( cur_epoch, len(dataloader), len(dataloader) * batch_size, epoch_times[-1], (vram.total - vram.available) / 1024**3, vram.total / 1024**3, )) logger.info( "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " "(avg/std) seconds.".format( cur_epoch, log_period, log_period * batch_size, np.mean(iter_times), np.std(iter_times), )) logger.info("On average every epoch ({} videos) takes {:.2f}/{:.2f} " "(avg/std) seconds.".format( len(dataloader) * batch_size, np.mean(epoch_times), np.std(epoch_times), ))
def test(cfg): """ Perform multi-view testing on the pretrained audio model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Test with config:") logger.info(cfg) # Build the audio model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg) cu.load_test_checkpoint(cfg, model) # Create audio testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) assert ( len(test_loader.dataset) % cfg.TEST.NUM_ENSEMBLE_VIEWS == 0 ) # Create meters for multi-view testing. if cfg.TEST.DATASET == 'epickitchens': test_meter = EPICTestMeter( len(test_loader.dataset) // cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.MODEL.NUM_CLASSES, len(test_loader), cfg.DATA.ENSEMBLE_METHOD, ) else: test_meter = TestMeter( len(test_loader.dataset) // cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.MODEL.NUM_CLASSES[0], len(test_loader), cfg.DATA.MULTI_LABEL, cfg.DATA.ENSEMBLE_METHOD, ) # Set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS ): writer = tb.TensorboardWriter(cfg) else: writer = None # # Perform multi-view test on the entire dataset. test_meter, preds, preds_clips, labels, metadata = perform_test(test_loader, model, test_meter, cfg, writer) if du.is_master_proc(): if cfg.TEST.DATASET == 'epickitchens': results = {'verb_output': preds[0], 'noun_output': preds[1], 'narration_id': metadata} scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) file_path = os.path.join(scores_path, cfg.EPICKITCHENS.TEST_SPLIT+'.pkl') pickle.dump(results, open(file_path, 'wb')) else: if cfg.TEST.DATASET == 'vggsound': get_stats(preds, labels) results = {'scores': preds, 'labels': labels} scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) file_path = os.path.join(scores_path, 'test.pkl') pickle.dump(results, open(file_path, 'wb')) if writer is not None: writer.close()
def visualize(cfg): """ Perform layer weights and activations visualization on the model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ if cfg.TENSORBOARD.ENABLE and (cfg.TENSORBOARD.MODEL_VIS.ENABLE or cfg.TENSORBOARD.WRONG_PRED_VIS.ENABLE): # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Print config. logger.info("Model Visualization with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) model.eval() if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=False) cu.load_test_checkpoint(cfg, model) # Create video testing loaders. vis_loader = loader.construct_loader(cfg, "test") if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0 # Set up writer for logging to Tensorboard format. if du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None if cfg.TENSORBOARD.PREDICTIONS_PATH != "": assert not cfg.DETECTION.ENABLE, "Detection is not supported." logger.info( "Visualizing class-level performance from saved results...") if writer is not None: with g_pathmgr.open(cfg.TENSORBOARD.PREDICTIONS_PATH, "rb") as f: preds, labels = pickle.load(f, encoding="latin1") writer.plot_eval(preds, labels) if cfg.TENSORBOARD.MODEL_VIS.ENABLE: if cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE: assert ( not cfg.DETECTION.ENABLE ), "Detection task is currently not supported for Grad-CAM visualization." if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH: assert ( len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST) == 1 ), "The number of chosen CNN layers must be equal to the number of pathway(s), given {} layer(s).".format( len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST)) elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH: assert ( len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST) == 2 ), "The number of chosen CNN layers must be equal to the number of pathway(s), given {} layer(s).".format( len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST)) else: raise NotImplementedError( "Model arch {} is not in {}".format( cfg.MODEL.ARCH, cfg.MODEL.SINGLE_PATHWAY_ARCH + cfg.MODEL.MULTI_PATHWAY_ARCH, )) logger.info("Visualize model analysis for {} iterations".format( len(vis_loader))) # Run visualization on the model run_visualization(vis_loader, model, cfg, writer) if cfg.TENSORBOARD.WRONG_PRED_VIS.ENABLE: logger.info("Visualize Wrong Predictions for {} iterations".format( len(vis_loader))) perform_wrong_prediction_vis(vis_loader, model, cfg) if writer is not None: writer.close()
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc(): misc.log_model_info(model, cfg, is_train=False) # Load a checkpoint to test if applicable. if cfg.TEST.CHECKPOINT_FILE_PATH != "": cu.load_checkpoint( cfg.TEST.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2", ) elif cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1) elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current # checkpoint folder, try to load checkpint from # TRAIN.CHECKPOINT_FILE_PATH and test it. cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) else: # raise NotImplementedError("Unknown way to load checkpoint.") logger.info("Testing with random initialization. Only for debugging.") # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( len(test_loader.dataset) % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0) # Create meters for multi-view testing. if cfg.TEST.DATASET == 'epickitchens': test_meter = EPICTestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) else: test_meter = TestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) # # Perform multi-view test on the entire dataset. preds, labels, metadata = perform_test(test_loader, model, test_meter, cfg) if du.is_master_proc(): if cfg.TEST.DATASET == 'epickitchens': results = { 'scores': { 'verb': preds[0], 'noun': preds[1] }, 'labels': { 'verb': labels[0], 'noun': labels[1] }, 'narration_id': metadata } scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') if not os.path.exists(scores_path): os.makedirs(scores_path) file_path = os.path.join(scores_path, cfg.EPICKITCHENS.TEST_SPLIT + '.pkl') pickle.dump(results, open(file_path, 'wb'))
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = model_builder.build_model(cfg) if du.is_master_proc(): misc.log_model_info(model) # Load a checkpoint to test if applicable. if cfg.TEST.CHECKPOINT_FILE_PATH != "": cu.load_checkpoint( cfg.TEST.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2", ) elif cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1) elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current # checkpoint folder, try to load checkpint from # TRAIN.CHECKPOINT_FILE_PATH and test it. cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) else: # raise NotImplementedError("Unknown way to load checkpoint.") logger.info("Testing with random initialization. Only for debugging.") # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( len(test_loader.dataset) % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0 ) # Create meters for multi-view testing. test_meter = TestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), ) # # Perform multi-view test on the entire dataset. perform_test(test_loader, model, test_meter, cfg)
def train(cfg): """ Train a video model for many epochs on train set and evaluate it on val set. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() if du.is_master_proc(): if cfg.ENABLE_WANDB: wandb.login() wandb.init(project='bbox', entity='slowfast') # Print config. # logger.info("Train with config:") # logger.info(pprint.pformat(cfg)) # Build the video model and print model statistics. model = build_model(cfg) if cfg.EPICKITCHENS.USE_BBOX and not cu.has_checkpoint(cfg.OUTPUT_DIR): slow_fast_model = SlowFast(cfg) if cfg.EPICKITCHENS.USE_BBOX and cfg.EPICKITCHENS.LOAD_SLOWFAST_PRETRAIN: if cfg.EPICKITCHENS.SLOWFAST_PRETRAIN_CHECKPOINT_FILE_PATH != "": _ = cu.load_checkpoint( cfg.EPICKITCHENS.SLOWFAST_PRETRAIN_CHECKPOINT_FILE_PATH, slow_fast_model, False, optimizer=None, inflation=cfg.TRAIN.CHECKPOINT_INFLATE, convert_from_caffe2=False, ) # cfg.TRAIN.CHECKPOINT_TYPE == "caffe2" logger.info("Load from slowfast.") if cfg.NUM_GPUS > 1: model.module.load_weight_slowfast(slow_fast_model) else: model.load_weight_slowfast(slow_fast_model) # if du.is_master_proc(): # misc.log_model_info(model, cfg, is_train=True) if cfg.BN.FREEZE: model.freeze_fn('bn_parameters') if cfg.EPICKITCHENS.USE_BBOX and cfg.EPICKITCHENS.FREEZE_BACKBONE: model.freeze_fn('slowfast_bbox') # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Load a checkpoint to resume training if applicable. if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR): logger.info("Load from last checkpoint.") last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer) start_epoch = checkpoint_epoch + 1 elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "" and not cfg.TRAIN.FINETUNE: logger.info("Load from given checkpoint file.") checkpoint_epoch = cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, optimizer, inflation=cfg.TRAIN.CHECKPOINT_INFLATE, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) start_epoch = checkpoint_epoch + 1 elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "" and cfg.TRAIN.FINETUNE: logger.info("Load from given checkpoint file. Finetuning.") _ = cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=cfg.TRAIN.CHECKPOINT_INFLATE, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) start_epoch = 0 else: start_epoch = 0 # Create the video train and val loaders. if cfg.TRAIN.DATASET != 'epickitchens' or not cfg.EPICKITCHENS.TRAIN_PLUS_VAL: train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") logger.info("Train loader size: {}".format(len(train_loader))) logger.info("Val loader size: {}".format(len(val_loader))) else: train_loader = loader.construct_loader(cfg, "train+val") val_loader = loader.construct_loader(cfg, "val") # Create meters. if cfg.DETECTION.ENABLE: train_meter = AVAMeter(len(train_loader), cfg, mode="train") val_meter = AVAMeter(len(val_loader), cfg, mode="val") else: if cfg.TRAIN.DATASET == 'epickitchens': train_meter = EPICTrainMeterSimple(len(train_loader), cfg) val_meter = EPICValMeterSimple(len(val_loader), cfg) else: train_meter = TrainMeter(len(train_loader), cfg) val_meter = ValMeter(len(val_loader), cfg) # Perform the training loop. logger.info("Start epoch: {}".format(start_epoch + 1)) cnt = 0 # eval_epoch(val_loader, model, val_meter, 0, cfg, cnt) # test_from_train(model, cfg, cnt=cnt) for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH): # Shuffle the dataset. loader.shuffle_dataset(train_loader, cur_epoch) # Train for one epoch. cnt = train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, cnt) # Compute precise BN stats. if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0: calculate_and_update_precise_bn(train_loader, model, cfg.BN.NUM_BATCHES_PRECISE) # Save a checkpoint. if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD): cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg) # Evaluate the model on validation set. if misc.is_eval_epoch(cfg, cur_epoch): is_best_epoch = eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, cnt) if is_best_epoch: cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg, is_best_epoch=is_best_epoch)
def train(cfg): """ Train a video model for many epochs on train set and evaluate it on val set. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set up environment. du.init_distributed_training(cfg) # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging(cfg.OUTPUT_DIR) # Init multigrid. multigrid = None if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE: multigrid = MultigridSchedule() cfg = multigrid.init_multigrid(cfg) if cfg.MULTIGRID.LONG_CYCLE: cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0) # Print config. logger.info("Train with config:") logger.info(pprint.pformat(cfg)) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc() and cfg.LOG_MODEL_INFO: misc.log_model_info(model, cfg, use_train_input=True) # Construct the optimizer. optimizer = optim.construct_optimizer(model, cfg) # Load a checkpoint to resume training if applicable. start_epoch = cu.load_train_checkpoint(cfg, model, optimizer) # Create the video train and val loaders. train_loader = loader.construct_loader(cfg, "train") val_loader = loader.construct_loader(cfg, "val") precise_bn_loader = (loader.construct_loader( cfg, "train", is_precise_bn=True) if cfg.BN.USE_PRECISE_STATS else None) # Create meters. if cfg.DETECTION.ENABLE: train_meter = AVAMeter(len(train_loader), cfg, mode="train") val_meter = AVAMeter(len(val_loader), cfg, mode="val") else: train_meter = TrainMeter(len(train_loader), cfg) val_meter = ValMeter(len(val_loader), cfg) # set up writer for logging to Tensorboard format. if cfg.TENSORBOARD.ENABLE and du.is_master_proc( cfg.NUM_GPUS * cfg.NUM_SHARDS): writer = tb.TensorboardWriter(cfg) else: writer = None # Perform the training loop. logger.info("Start epoch: {}".format(start_epoch + 1)) for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH): if cfg.MULTIGRID.LONG_CYCLE: cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch) if changed: ( model, optimizer, train_loader, val_loader, precise_bn_loader, train_meter, val_meter, ) = build_trainer(cfg) # Load checkpoint. if cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint else: last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH logger.info("Load from {}".format(last_checkpoint)) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer) # Shuffle the dataset. loader.shuffle_dataset(train_loader, cur_epoch) # Train for one epoch. train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer) is_checkp_epoch = (cu.is_checkpoint_epoch( cfg, cur_epoch, None if multigrid is None else multigrid.schedule, )) is_eval_epoch = misc.is_eval_epoch( cfg, cur_epoch, None if multigrid is None else multigrid.schedule) # Compute precise BN stats. if ((is_checkp_epoch or is_eval_epoch) and cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0): calculate_and_update_precise_bn( precise_bn_loader, model, min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)), cfg.NUM_GPUS > 0, ) _ = misc.aggregate_sub_bn_stats(model) # Save a checkpoint. if is_checkp_epoch: cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg) # Evaluate the model on validation set. if is_eval_epoch: eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer) if writer is not None: writer.close()
def test(cfg): """ Perform multi-view testing on the pretrained video model. Args: cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py """ # Set random seed from configs. np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Setup logging format. logging.setup_logging() # ADD Demo if cfg.TEST.DEMO_PATH != "": generate_subclip(cfg) # Print config. logger.info("Test with config:") logger.info(cfg) # Build the video model and print model statistics. model = build_model(cfg) if du.is_master_proc(): misc.log_model_info(model, cfg, is_train=False) # Load a checkpoint to test if applicable. if cfg.TEST.CHECKPOINT_FILE_PATH != "": cu.load_checkpoint( cfg.TEST.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2", ) elif cu.has_checkpoint(cfg.OUTPUT_DIR): last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR) cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1) elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "": # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current # checkpoint folder, try to load checkpint from # TRAIN.CHECKPOINT_FILE_PATH and test it. cu.load_checkpoint( cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, None, inflation=False, convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2", ) else: # raise NotImplementedError("Unknown way to load checkpoint.") logger.info("Testing with random initialization. Only for debugging.") # Create video testing loaders. test_loader = loader.construct_loader(cfg, "test") logger.info("Testing model for {} iterations".format(len(test_loader))) if cfg.DETECTION.ENABLE: assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE test_meter = AVAMeter(len(test_loader), cfg, mode="test") else: assert ( len(test_loader.dataset) % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) == 0) # Create meters for multi-view testing. test_meter = TestMeter( len(test_loader.dataset) // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, cfg.MODEL.NUM_CLASSES, len(test_loader), (cfg.TEST.DEMO_PATH != ""), ) # # Perform multi-view test on the entire dataset. perform_test(test_loader, model, test_meter, cfg) if cfg.TEST.DEMO_PATH != "": end_time_dir = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, "end_time.npy") end_time_array = np.load(end_time_dir) cwd = os.getcwd() probability_dir = os.path.join(cwd, "tmp/probability.npy") p_array = np.load(probability_dir) print(end_time_array) print(p_array) end_time_dir = os.path.join(cfg.DATA.PATH_TO_DATA_DIR) if end_time_dir.split("/")[-1] == "demo": try: shutil.rmtree(end_time_dir) except OSError as e: print("Error: %s - %s." % (e.filename, e.strerror)) json_list = [] for i in range(len(end_time_array)): json_list.append([end_time_array[i], p_array[i]]) print(json_list) with open('timeLabel.json', 'w') as f: json.dump({"jogging": str(json_list)}, f) plt.plot(end_time_array, p_array, 'ro-') plt.axis( [0, end_time_array[len(end_time_array) - 1] + 0.5, -0.01, 1.01]) plt.xlabel('time(s)') plt.ylabel('probability') plt.suptitle("Jogging/Running") plt.savefig("fig.png")