Example #1
0
def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.

    Uses AdamW with a linear learning-rate schedule (no warmup) and saves /
    evaluates checkpoints according to the epoch policy in ``cfg``.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs so runs are reproducible.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)

    # Construct the optimizer.
    optimizer = AdamW(model.parameters(), lr=cfg.SOLVER.BASE_LR, eps=1e-8)
    # Resume from the last checkpoint if one exists; returns the epoch to
    # restart from (0 when training from scratch).
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)
    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Assumes the scheduler is stepped once per batch inside train_epoch —
    # TODO confirm against train_epoch's implementation.
    total_steps = len(train_loader) * cfg.SOLVER.MAX_EPOCH

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    # Create meters (sized by the number of iterations per epoch).
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, scheduler, train_meter,
                    cur_epoch, cfg)

        # No multigrid schedule is used here, hence the None argument.
        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None,
        )
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch, None)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
Example #2
0
def test_train_iter2():
    """With LOG_PERIOD = 1, log_iter_stats at iteration 1 reports the values
    from the most recent update."""
    cfg = get_cfg()
    cfg.LOG_PERIOD = 1
    meter = ClevrerTrainMeter(5, cfg)
    shared = dict(lr=0.1, mb_size_des=10, mb_size_mc=10)
    meter.update_stats(top1_err=40.0, top5_err=20.0, mc_opt_err=55.0,
                       mc_q_err=78.0, loss_des=6.3, loss_mc=0.2, **shared)
    meter.update_stats(top1_err=41.0, top5_err=21.0, mc_opt_err=56.0,
                       mc_q_err=86.0, loss_des=6.7, loss_mc=1.2, **shared)
    stats = meter.log_iter_stats(cur_epoch=1, cur_iter=1)
    expected = {
        'top1_err': 41.0,
        'top5_err': 21.0,
        'mc_opt_err': 56.0,
        'mc_q_err': 86.0,
        'loss_des': 6.7,
        'loss_mc': 1.2,
    }
    for key, value in expected.items():
        assert stats[key] == value
Example #3
0
def test_train_iter4():
    """With LOG_PERIOD = 2, the stats call at cur_iter=2 yields None, and the
    call at cur_iter=3 reports 15.0 for every tracked metric."""
    cfg = get_cfg()
    cfg.LOG_PERIOD = 2
    meter = ClevrerTrainMeter(5, cfg)
    shared = dict(lr=0.1, mb_size_des=10, mb_size_mc=10)
    meter.update_stats(top1_err=40.0, top5_err=20.0, mc_opt_err=55.0,
                       mc_q_err=78.0, loss_des=6.3, loss_mc=0.2, **shared)
    meter.update_stats(top1_err=10.0, top5_err=10.0, mc_opt_err=10.0,
                       mc_q_err=10.0, loss_des=10.0, loss_mc=10.0, **shared)
    # cur_iter=2 is not a logging iteration here, so nothing is produced.
    stats = meter.log_iter_stats(cur_epoch=1, cur_iter=2)
    assert stats is None
    meter.update_stats(top1_err=20.0, top5_err=20.0, mc_opt_err=20.0,
                       mc_q_err=20.0, loss_des=20.0, loss_mc=20.0, **shared)
    stats = meter.log_iter_stats(cur_epoch=0, cur_iter=3)
    for key in ('top1_err', 'top5_err', 'mc_opt_err', 'mc_q_err',
                'loss_des', 'loss_mc'):
        assert stats[key] == 15.0
Example #4
0
def test_train_iter5():
    """Check ClevrerTrainMeter.log_iter_stats with LOG_PERIOD = 3.

    The call at cur_iter=1 is not a logging iteration and must return None;
    after a third update, the call at cur_iter=5 reports 20.0 (the running
    average of 30, 10, 20) for every tracked metric.
    """
    cfg = get_cfg()
    cfg.LOG_PERIOD = 3
    train_meter = ClevrerTrainMeter(5, cfg)
    shared = dict(lr=0.1, mb_size_des=10, mb_size_mc=10)
    train_meter.update_stats(top1_err=30.0, top5_err=30.0, mc_opt_err=30.0,
                             mc_q_err=30.0, loss_des=30.0, loss_mc=30.0,
                             **shared)
    train_meter.update_stats(top1_err=10.0, top5_err=10.0, mc_opt_err=10.0,
                             mc_q_err=10.0, loss_des=10.0, loss_mc=10.0,
                             **shared)
    stats = train_meter.log_iter_stats(cur_epoch=1, cur_iter=1)
    assert stats is None
    train_meter.update_stats(top1_err=20.0, top5_err=20.0, mc_opt_err=20.0,
                             mc_q_err=20.0, loss_des=20.0, loss_mc=20.0,
                             **shared)
    stats = train_meter.log_iter_stats(cur_epoch=0, cur_iter=5)
    # Fixed: the original used ``assert cond, print(...)`` — print() returns
    # None, so the failure message was always None (and printed eagerly on
    # failure). Use a real assertion message instead.
    for key in ('top1_err', 'top5_err', 'mc_opt_err', 'mc_q_err',
                'loss_des', 'loss_mc'):
        assert stats[key] == 20.0, f"unexpected {key}: {stats[key]}"
Example #5
0
def test_train_epoch_only_des():
    """Check log_epoch_stats when every minibatch was descriptive-only.

    With mb_size_mc=0 throughout the epoch, the MC metrics must be absent
    from the epoch stats, while the descriptive metrics average to 20.0
    (mean of 30, 10, 20).
    """
    cfg = get_cfg()
    cfg.LOG_PERIOD = 3
    train_meter = ClevrerTrainMeter(5, cfg)
    # Three updates with identical values per metric and no MC samples.
    for err in (30.0, 10.0, 20.0):
        train_meter.update_stats(top1_err=err,
                                 top5_err=err,
                                 mc_opt_err=err,
                                 mc_q_err=err,
                                 loss_des=err,
                                 loss_mc=err,
                                 lr=0.1,
                                 mb_size_des=10,
                                 mb_size_mc=0)
    stats = train_meter.log_epoch_stats(cur_epoch=1)
    # Fixed: ``assert cond, print(...)`` always produced a None assertion
    # message; use a real message instead.
    assert stats['top1_err'] == 20.0, f"unexpected top1_err: {stats['top1_err']}"
    assert stats['top5_err'] == 20.0
    # MC metrics must not be reported when mb_size_mc was always 0.
    assert 'mc_opt_err' not in stats
    assert 'mc_q_err' not in stats
    assert stats['loss_des'] == 20.0
    assert 'loss_mc' not in stats
Example #6
0
def build_trainer(cfg):
    """
    Build training model and its associated tools, including optimizer,
    dataloaders and meters.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    Returns:
        model (nn.Module): training model.
        optimizer (Optimizer): optimizer.
        train_loader (DataLoader): training data loader.
        val_loader (DataLoader): validation data loader.
        precise_bn_loader (DataLoader): training data loader for computing
            precise BN.
        train_meter (ClevrerTrainMeter): tool for measuring training stats.
        val_meter (ClevrerValMeter): tool for measuring validation stats.
    """
    # Model first, then its optimizer (the optimizer needs the parameters).
    model = build_clevrer_model(cfg)
    # if du.is_master_proc() and cfg.LOG_MODEL_INFO:
    #     misc.log_model_info(model, cfg, use_train_input=True)
    optimizer = optim.construct_optimizer(model, cfg)

    # One loader per split, plus a dedicated train loader for precise BN.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(
        cfg, "train", is_precise_bn=True
    )

    # Progress meters, sized by the number of iterations per epoch.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    return (
        model,
        optimizer,
        train_loader,
        val_loader,
        precise_bn_loader,
        train_meter,
        val_meter,
    )
def test_implementation_des(cfg):
    """
    Simulates a train and val epoch to check if the gradients are being updated,
    metrics are being calculated correctly.

    Prints, for every parameter, whether its values changed after one training
    epoch, then runs one validation epoch.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)
    # Print config.
    logger.info("Test implementation")

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Only the descriptive CLEVRER dataset is supported by this check.
    if cfg.TRAIN.DATASET != 'Clevrer_des':
        print("This train script does not support your dataset: -{}-. Only Clevrer_des".format(cfg.TRAIN.DATASET))
        exit()
    
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    # Train for one epoch.
    # Snapshot the weights so the post-epoch parameters can be compared
    # against the starting point.
    model_before = copy.deepcopy(model)
    cur_epoch = start_epoch
    train_epoch(
        train_loader, model, optimizer, train_meter, cur_epoch, cfg, test_imp=True
    )
    print("Check how much parameters changed")
    # Walk both models' parameters in lockstep; ordering matches because
    # model_before is a deep copy of model.
    for (p_b_name, p_b), (p_name, p) in zip(model_before.named_parameters(), model.named_parameters()):
        if p.requires_grad:
            print("Parameter requires grad:")
            print(p_name, p_b_name)
            # Calculate ratio of change: |‖p‖ - ‖p_before‖| / ‖p_before‖.
            change = torch.abs(torch.norm(p) - torch.norm(p_b))
            print("Ratio of change = {}".format(torch.true_divide(change, torch.norm(p_b))))
            # Any element-wise difference counts as "the weights moved".
            if (p_b != p).any():
                print("--Check--")
            else:
                print("ALERT - WEIGHTS DID NOT CHANGE WITH TRAINING.")
        else:
            print("Parameter does not require grad:")
            print(p_name)
            print(p)
    print("Val epoch")
    eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=True)
def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.

    Only the 'Clevrer_des' dataset is supported; the process exits otherwise.
    Checkpoint saving is currently disabled (commented out).

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)
    # Abort early: only the descriptive CLEVRER dataset is supported here.
    if cfg.TRAIN.DATASET != 'Clevrer_des':
        print("This train script does not support your dataset: -{}-. Only Clevrer_des".format(cfg.TRAIN.DATASET))
        exit()
    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        #loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg
        )

        # No multigrid schedule is used here, hence the None argument.
        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None,
        )
        is_eval_epoch = misc.is_eval_epoch(
            cfg, cur_epoch, None
        )

        # Save a checkpoint.
        # NOTE(review): checkpoint saving is disabled, so is_checkp_epoch is
        # computed but never used — confirm whether this is intentional.
        # if is_checkp_epoch:
        #     cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
Example #9
0
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.

    Supports distributed training, optional multigrid scheduling (which can
    rebuild the trainer mid-run), and optional TensorBoard logging on the
    master process.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid. Stays None when neither cycle is enabled.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)
    # if du.is_master_proc() and cfg.LOG_MODEL_INFO:
    #     misc.log_model_info(model, cfg, use_train_input=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders. The precise-BN loader is only
    # built when precise BN statistics are enabled in the config.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = (loader.construct_loader(
        cfg, "train", is_precise_bn=True)
                         if cfg.BN.USE_PRECISE_STATS else None)

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # set up writer for logging to Tensorboard format (master process only).
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
            cfg.NUM_GPUS * cfg.NUM_SHARDS):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            # The long cycle may change the training setup; when it does,
            # rebuild model/loaders/meters and reload weights from the most
            # recent checkpoint so training continues seamlessly.
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    # The last checkpoint must correspond to the epoch we are
                    # resuming from.
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1,
                                   optimizer)

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)

        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    cfg, writer)

        # The multigrid schedule (when present) influences checkpoint/eval
        # epoch selection.
        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None if multigrid is None else multigrid.schedule,
        )
        is_eval_epoch = misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule)

        # Compute precise BN stats.
        # NOTE(review): precise-BN recomputation is currently disabled.
        # if (
        #     (is_checkp_epoch or is_eval_epoch)
        #     and cfg.BN.USE_PRECISE_STATS
        #     and len(get_bn_modules(model)) > 0
        # ):
        #     calculate_and_update_precise_bn(
        #         precise_bn_loader,
        #         model,
        #         min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
        #         cfg.NUM_GPUS > 0,
        #     )
        # _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
Example #10
0
def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.

    Builds a MONET_BERT model, trains it with the Lamb optimizer and a linear
    schedule with 4000 warmup steps.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = MONET_BERT(cfg)
    if cfg.NUM_GPUS:
        # Determine the GPU used by the current process
        cur_device = torch.cuda.current_device()
        # Transfer the model to the current GPU device
        model = model.cuda(device=cur_device)

    # Construct the optimizer.
    # optimizer = AdamW(model.parameters(),
    #               lr = cfg.SOLVER.BASE_LR,
    #               eps = 1e-8
    #             )
    optimizer = Lamb(model.parameters(),
                     lr=cfg.SOLVER.BASE_LR,
                     betas=(0.9, 0.999),
                     eps=1e-6,
                     weight_decay=cfg.SOLVER.WEIGHT_DECAY,
                     adam=False)
    # optimizer = optim.Adam(model.parameters(), lr=cfg.SOLVER.BASE_LR)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)
    # NOTE(review): re-creating the Lamb optimizer here discards any optimizer
    # state that load_train_checkpoint just restored into the previous
    # instance. Confirm whether this reset is intentional or a copy-paste slip.
    optimizer = Lamb(model.parameters(),
                     lr=cfg.SOLVER.BASE_LR,
                     betas=(0.9, 0.999),
                     eps=1e-6,
                     weight_decay=cfg.SOLVER.WEIGHT_DECAY,
                     adam=False)
    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Assumes one scheduler step per training iteration inside train_epoch —
    # TODO confirm.
    total_steps = len(train_loader) * cfg.SOLVER.MAX_EPOCH

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=4000,  # Linear warmup over the first 4000 steps.
        num_training_steps=total_steps)

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, scheduler, train_meter,
                    cur_epoch, cfg)

        # No multigrid schedule is used here, hence the None argument.
        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None,
        )
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch, None)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)