def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it
    on val set.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Seed both RNGs so runs are reproducible.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Logging setup, then dump the active config for the record.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Model, optimizer, and (possibly resumed) starting epoch.
    model = build_clevrer_model(cfg)
    optimizer = AdamW(model.parameters(), lr=cfg.SOLVER.BASE_LR, eps=1e-8)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Data loaders for both splits.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Linear LR schedule over the whole run, no warmup.
    total_steps = len(train_loader) * cfg.SOLVER.MAX_EPOCH
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps,
    )

    # Stat trackers for the train and val phases.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # One pass over the training set.
        train_epoch(
            train_loader, model, optimizer, scheduler, train_meter, epoch, cfg
        )
        should_checkpoint = cu.is_checkpoint_epoch(cfg, epoch, None)
        should_eval = misc.is_eval_epoch(cfg, epoch, None)
        # Persist weights/optimizer state on checkpoint epochs.
        if should_checkpoint:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, epoch, cfg)
        # Run validation on eval epochs.
        if should_eval:
            eval_epoch(val_loader, model, val_meter, epoch, cfg)
def test_train_iter2():
    """With LOG_PERIOD=1 the meter logs every iter; the reported stats
    match the most recent update."""
    cfg = get_cfg()
    cfg.LOG_PERIOD = 1
    meter = ClevrerTrainMeter(5, cfg)

    updates = [
        dict(top1_err=40.0, top5_err=20.0, mc_opt_err=55.0, mc_q_err=78.0,
             loss_des=6.3, loss_mc=0.2),
        dict(top1_err=41.0, top5_err=21.0, mc_opt_err=56.0, mc_q_err=86.0,
             loss_des=6.7, loss_mc=1.2),
    ]
    for kw in updates:
        meter.update_stats(lr=0.1, mb_size_des=10, mb_size_mc=10, **kw)

    stats = meter.log_iter_stats(cur_epoch=1, cur_iter=1)
    expected = dict(top1_err=41.0, top5_err=21.0, mc_opt_err=56.0,
                    mc_q_err=86.0, loss_des=6.7, loss_mc=1.2)
    for key, value in expected.items():
        assert stats[key] == value
def build_trainer(cfg):
    """
    Construct everything a training run needs: model, optimizer, the
    train/val/precise-BN data loaders, and the stat meters.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    Returns:
        model (nn.Module): training model.
        optimizer (Optimizer): optimizer.
        train_loader (DataLoader): training data loader.
        val_loader (DataLoader): validation data loader.
        precise_bn_loader (DataLoader): training data loader for
            computing precise BN.
        train_meter (ClevrerTrainMeter): tool for measuring training stats.
        val_meter (ClevrerValMeter): tool for measuring validation stats.
    """
    # Model and its optimizer.
    model = build_clevrer_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)

    # Loaders: regular train/val plus a precise-BN pass over train.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(
        cfg, "train", is_precise_bn=True
    )

    return (
        model,
        optimizer,
        train_loader,
        val_loader,
        precise_bn_loader,
        ClevrerTrainMeter(len(train_loader), cfg),
        ClevrerValMeter(len(val_loader), cfg),
    )
def test_train_iter4():
    """With LOG_PERIOD=2, iter 2 produces no stats and iter 3 reports
    the average (15.0) of the 10.0 and 20.0 updates."""
    cfg = get_cfg()
    cfg.LOG_PERIOD = 2
    meter = ClevrerTrainMeter(5, cfg)

    def push(t1, t5, opt, q, l_des, l_mc):
        meter.update_stats(
            top1_err=t1, top5_err=t5, mc_opt_err=opt, mc_q_err=q,
            loss_des=l_des, loss_mc=l_mc,
            lr=0.1, mb_size_des=10, mb_size_mc=10)

    push(40.0, 20.0, 55.0, 78.0, 6.3, 0.2)
    push(10.0, 10.0, 10.0, 10.0, 10.0, 10.0)
    assert meter.log_iter_stats(cur_epoch=1, cur_iter=2) is None

    push(20.0, 20.0, 20.0, 20.0, 20.0, 20.0)
    stats = meter.log_iter_stats(cur_epoch=0, cur_iter=3)
    for key in ('top1_err', 'top5_err', 'mc_opt_err', 'mc_q_err',
                'loss_des', 'loss_mc'):
        assert stats[key] == 15.0
def test_train_iter5():
    """With LOG_PERIOD=3, iter 1 is silent and iter 5 reports 20.0
    (the mean of the 30.0, 10.0 and 20.0 updates)."""
    cfg = get_cfg()
    cfg.LOG_PERIOD = 3
    meter = ClevrerTrainMeter(5, cfg)

    def push(value):
        # Every metric gets the same value in this scenario.
        meter.update_stats(
            top1_err=value, top5_err=value, mc_opt_err=value, mc_q_err=value,
            loss_des=value, loss_mc=value,
            lr=0.1, mb_size_des=10, mb_size_mc=10)

    push(30.0)
    push(10.0)
    assert meter.log_iter_stats(cur_epoch=1, cur_iter=1) is None

    push(20.0)
    stats = meter.log_iter_stats(cur_epoch=0, cur_iter=5)
    assert stats['top1_err'] == 20.0, print(stats['top1_err'])
    for key in ('top5_err', 'mc_opt_err', 'mc_q_err', 'loss_des', 'loss_mc'):
        assert stats[key] == 20.0
def test_train_epoch_only_des():
    """When every update has mb_size_mc=0, epoch stats contain only the
    descriptive metrics and omit all multiple-choice entries."""
    cfg = get_cfg()
    cfg.LOG_PERIOD = 3
    meter = ClevrerTrainMeter(5, cfg)

    def push(value):
        # All metrics share one value; the MC mini-batch size is zero.
        meter.update_stats(
            top1_err=value, top5_err=value, mc_opt_err=value, mc_q_err=value,
            loss_des=value, loss_mc=value,
            lr=0.1, mb_size_des=10, mb_size_mc=0)

    for value in (30.0, 10.0, 20.0):
        push(value)

    stats = meter.log_epoch_stats(cur_epoch=1)
    assert stats['top1_err'] == 20.0, print(stats['top1_err'])
    assert stats['top5_err'] == 20.0
    assert stats['loss_des'] == 20.0
    for absent in ('mc_opt_err', 'mc_q_err', 'loss_mc'):
        assert absent not in stats
def test_implementation_des(cfg):
    """
    Simulates a train and val epoch to check if the gradients are being
    updated, metrics are being calculated correctly

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Seed RNGs so the simulated epoch is reproducible.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Test implementation")

    # Model + optimizer, resuming from a checkpoint when one exists.
    model = build_clevrer_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # This harness only understands the descriptive-question dataset.
    if cfg.TRAIN.DATASET != 'Clevrer_des':
        print("This train script does not support your dataset: -{}-. Only Clevrer_des".format(cfg.TRAIN.DATASET))
        exit()

    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))

    # Snapshot the weights, run a single epoch, then diff each parameter
    # against the snapshot to confirm training actually updates it.
    snapshot = copy.deepcopy(model)
    cur_epoch = start_epoch
    train_epoch(
        train_loader, model, optimizer, train_meter, cur_epoch, cfg,
        test_imp=True,
    )
    print("Check how much parameters changed")
    for (old_name, old_p), (new_name, new_p) in zip(
            snapshot.named_parameters(), model.named_parameters()):
        if new_p.requires_grad:
            print("Parameter requires grad:")
            print(new_name, old_name)
            # Relative change in the parameter's norm.
            change = torch.abs(torch.norm(new_p) - torch.norm(old_p))
            print("Ratio of change = {}".format(torch.true_divide(change, torch.norm(old_p))))
            # Elementwise check: any difference at all means it moved.
            if (old_p != new_p).any():
                print("--Check--")
            else:
                print("ALERT - WEIGHTS DID NOT CHANGE WITH TRAINING.")
        else:
            print("Parameter does not require grad:")
            print(new_name)
            print(new_p)
    print("Val epoch")
    eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=True)
def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it
    on val set.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # NOTE(review): this file binds the name `train_des` more than once;
    # at import time only the last definition is visible.

    # Seed both RNGs for reproducibility.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Logging setup and full config dump.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Model, optimizer, and possibly-resumed epoch counter.
    model = build_clevrer_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Only the descriptive-question dataset is supported here.
    if cfg.TRAIN.DATASET != 'Clevrer_des':
        print("This train script does not support your dataset: -{}-. Only Clevrer_des".format(cfg.TRAIN.DATASET))
        exit()

    # Data loaders for both splits.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Stat trackers.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # One pass over the training set.
        train_epoch(train_loader, model, optimizer, train_meter, epoch, cfg)

        # Checkpoint saving is disabled in this variant; the epoch check
        # is still invoked so behavior matches the original exactly.
        is_checkp_epoch = cu.is_checkpoint_epoch(cfg, epoch, None)
        is_eval_epoch = misc.is_eval_epoch(cfg, epoch, None)

        # Run validation on eval epochs.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, epoch, cfg)
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on
    val set, with optional multigrid scheduling and TensorBoard logging.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment (distributed training process group).
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid: when a long/short cycle is enabled the schedule may
    # rewrite cfg (batch sizes, epochs) before anything is built.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)
    # if du.is_master_proc() and cfg.LOG_MODEL_INFO:
    #     misc.log_model_info(model, cfg, use_train_input=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders; the precise-BN loader is
    # built only when precise BN stats are requested.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = (loader.construct_loader(
        cfg, "train", is_precise_bn=True)
        if cfg.BN.USE_PRECISE_STATS else None)

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Set up writer for logging to TensorBoard format (master proc only).
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
            cfg.NUM_GPUS * cfg.NUM_SHARDS):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                # Long-cycle shape changed: rebuild model, optimizer,
                # loaders and meters for the new configuration...
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # ...then restore the latest weights into the new model.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    # The last checkpoint must be from the previous epoch.
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(
                    last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer)

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg,
            writer)

        # Multigrid may override which epochs checkpoint/eval.
        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None if multigrid is None else multigrid.schedule,
        )
        is_eval_epoch = misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule)

        # Compute precise BN stats.
        # if (
        #     (is_checkp_epoch or is_eval_epoch)
        #     and cfg.BN.USE_PRECISE_STATS
        #     and len(get_bn_modules(model)) > 0
        # ):
        #     calculate_and_update_precise_bn(
        #         precise_bn_loader,
        #         model,
        #         min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
        #         cfg.NUM_GPUS > 0,
        #     )
        # _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(
                cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on
    val set.

    This variant trains a MONET_BERT model with a LAMB optimizer and a
    linear LR schedule with warmup.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format and dump the active config.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and move it to the current GPU, if any.
    model = MONET_BERT(cfg)
    if cfg.NUM_GPUS:
        # Determine the GPU used by the current process.
        cur_device = torch.cuda.current_device()
        # Transfer the model to the current GPU device.
        model = model.cuda(device=cur_device)

    # Construct the optimizer (LAMB, not its Adam fallback).
    optimizer = Lamb(model.parameters(), lr=cfg.SOLVER.BASE_LR,
                     betas=(0.9, 0.999), eps=1e-6,
                     weight_decay=cfg.SOLVER.WEIGHT_DECAY, adam=False)

    # Resume from a checkpoint when available. BUGFIX: the original code
    # constructed a second, fresh Lamb optimizer immediately after this
    # call, silently discarding the optimizer state that the checkpoint
    # had just restored (while still honoring start_epoch). The optimizer
    # is now constructed exactly once, before loading.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")
    total_steps = len(train_loader) * cfg.SOLVER.MAX_EPOCH

    # Linear decay schedule with 4000 warmup steps over the whole run.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=4000,
        num_training_steps=total_steps,
    )

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, scheduler, train_meter,
                    cur_epoch, cfg)
        is_checkp_epoch = cu.is_checkpoint_epoch(cfg, cur_epoch, None)
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch, None)
        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer,
                               cur_epoch, cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)