Esempio n. 1
0
def main():
    """Load the config and run the operation selected by the parsed mode.

    The config is loaded from ``args.cfg``, overridden with ``args.opts``,
    checked, and frozen before any mode runs. Handled modes:

    * "info": print an instance of the model and its complexity.
    * "train" / "test" / "time": launch the matching ``trainer.<mode>_model``
      function through ``dist.multi_proc_run`` on ``cfg.NUM_GPUS`` processes.
    * "scale": scale the model, dump the resulting config, and print the
      complexity before and after scaling.

    Any other mode value does nothing.
    """
    args = parse_args()
    mode = args.mode
    # Config file first, then command-line overrides, then validation.
    config.load_cfg(args.cfg)
    cfg.merge_from_list(args.opts)
    config.assert_cfg()
    cfg.freeze()

    if mode == "info":
        model_cls = builders.get_model()
        print(model_cls())
        print("complexity:", net.complexity(builders.get_model()))
        return

    if mode in ("train", "test", "time"):
        # The trainer entry points are named train_model, test_model, time_model.
        entry_point = getattr(trainer, mode + "_model")
        dist.multi_proc_run(num_proc=cfg.NUM_GPUS, fun=entry_point)
        return

    if mode == "scale":
        # The config was frozen above; unfreeze it before scaling.
        cfg.defrost()
        complexity_before = net.complexity(builders.get_model())
        scaler.scale_model()
        complexity_after = net.complexity(builders.get_model())
        dumped_cfg = config.dump_cfg()
        print("Scaled config dumped to:", dumped_cfg)
        print("Original model complexity:", complexity_before)
        print("Scaled model complexity:", complexity_after)
Esempio n. 2
0
def main():
    """Scale the configured model and report its complexity before and after.

    Loads the config from the command line, measures the model complexity,
    calls ``scaler.scale_model()``, measures again, dumps the config, and
    prints the dump result together with both complexity measurements.
    """
    config.load_cfg_fom_args("Scale a model.")
    config.assert_and_infer_cfg()

    complexity_before = net.complexity(builders.get_model())
    scaler.scale_model()
    complexity_after = net.complexity(builders.get_model())
    dumped_cfg = config.dump_cfg()

    report = (
        ("Scaled config dumped to:", dumped_cfg),
        ("Original model complexity:", complexity_before),
        ("Scaled model complexity:", complexity_after),
    )
    for label, value in report:
        print(label, value)
Esempio n. 3
0
def get_model_data(name, timings, errors):
    """Collect the summary record for the model called ``name``.

    Args:
        name: Model name, used to look up the config file and model info in
            ``model_zoo`` and as the key into ``timings`` and ``errors``.
        timings: Mapping from model name to a dict that provides
            "test_fw_time" and "train_fw_bw_time".
        errors: Mapping from model name to a dict that provides "top1_err".

    Returns:
        A dict with the config URL, rounded complexity numbers (flops in
        units of 1e9, params and acts in units of 1e6), the training batch
        size, inference and training time, top-1 error, model id, and
        weight URL.
    """
    # Point the global config at this model before measuring anything
    reset_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(name))
    model_info = model_zoo.get_model_info(name)
    config_url, _, model_id, _, weight_url_full = model_info

    model_cx = net.complexity(builders.get_model())
    model_timings = timings[name]

    # Inference time in ms, normalized to a reference of batch size 64 on 1 GPU
    ref_batch_size, ref_num_gpus = 64, 1
    scale = ref_batch_size / cfg.TEST.BATCH_SIZE * cfg.NUM_GPUS / ref_num_gpus
    infer_time_ms = model_timings["test_fw_time"] * scale * 1000

    # Training time in hours for 100 epochs over the ImageNet train set
    num_iters = 1281167 / cfg.TRAIN.BATCH_SIZE * 100
    train_time_hours = model_timings["train_fw_bw_time"] * num_iters / 3600

    record = {}
    record["config_url"] = "configs/" + config_url
    record["flops"] = round(model_cx["flops"] / 1e9, 1)
    record["params"] = round(model_cx["params"] / 1e6, 1)
    record["acts"] = round(model_cx["acts"] / 1e6, 1)
    record["batch_size"] = cfg.TRAIN.BATCH_SIZE
    record["infer_time"] = round(infer_time_ms)
    record["train_time"] = round(train_time_hours, 1)
    record["error"] = round(errors[name]["top1_err"], 1)
    record["model_id"] = model_id
    record["weight_url"] = weight_url_full
    return record
Esempio n. 4
0
 def test_complexity(self, cfg_file, cx_expected):
     """Test complexity of a single model with the specified config.

     Merges ``cfg_file`` into the global ``cfg``, measures the model
     complexity, and compares it with ``cx_expected``. The global ``cfg`` is
     restored from a snapshot before the comparison, even when loading the
     config or measuring the complexity raises, so a failure here does not
     leak a modified config into other tests.

     Args:
         cfg_file: Path of the config file to merge into ``cfg``.
         cx_expected: Expected result of ``net.complexity`` for that config.
     """
     cfg_init = cfg.clone()
     try:
         cfg.merge_from_file(cfg_file)
         cx = net.complexity(builders.get_model())
     finally:
         # Always put the global config back to its pre-test state.
         cfg.merge_from_other_cfg(cfg_init)
     self.assertEqual(cx_expected, cx)
Esempio n. 5
0
def check_complexity_constraints(constraints):
    """Check the model complexity against the bounds in ``constraints.CX``.

    ``constraints.CX`` maps a complexity name to a sequence whose first two
    items are the lower and upper bound. The name is lower-cased to index
    the result of ``net.complexity``. A bound of 0 means "no bound on that
    side"; an entry with both bounds 0 is skipped entirely, and the
    complexity is only computed once some entry actually needs it.

    Returns:
        True if every constrained value lies within its bounds (also when
        nothing is constrained), False otherwise.
    """
    measured = None
    valid = True
    for name, bounds in constraints.CX.items():
        key = name.lower()
        lower, upper = bounds[0], bounds[1]
        if lower == 0 and upper == 0:
            # Unconstrained entry
            continue
        if not measured:
            measured = net.complexity(builders.get_model())
        # A zero bound is replaced by the measured value, so it always holds
        if lower == 0:
            lower = measured[key]
        if upper == 0:
            upper = measured[key]
        if valid:
            valid = lower <= measured[key] <= upper
    return valid
Esempio n. 6
0
def main():
    """Build the configured model, run one forward pass, and print its complexity.

    Loads and freezes the config from the command line, builds the model,
    switches it to eval mode, feeds it a single random input of shape
    (1, 3, 224, 224), and prints the output shape followed by the result of
    ``complexity(model)``.
    """
    config.load_cfg_fom_args("Train a classification model.")
    config.assert_and_infer_cfg()
    cfg.freeze()

    print("building model {}".format(cfg.MODEL.TYPE))
    net_model = build_model()
    net_model.eval()

    # One random input is enough to check that the forward pass runs
    dummy_input = torch.randn(1, 3, 224, 224)
    output = net_model(dummy_input)
    print(output.shape)

    print(complexity(net_model))
Esempio n. 7
0
def dump_complexity():
    """Measure the complexity of every model in the configs/ directory.

    Walks ``configs/`` recursively, merges each ``.yaml`` file into the
    global ``cfg``, records the result of ``net.complexity`` under the file's
    path, and writes everything (plus a "date-created" timestamp) as sorted,
    indented JSON to ``_COMPLEXITY_FILE``.

    The global ``cfg`` is restored from a snapshot after each config, even if
    loading or measuring that config raises.
    """
    complexity = {"date-created": str(datetime.datetime.now())}
    cfg_files = sorted(
        os.path.join(root, fname)
        for root, _, fnames in os.walk("configs/")
        for fname in fnames
        # Match the extension itself: a substring test on the path would also
        # pick up files such as "x.yaml.bak" or anything under a ".yaml*" dir.
        if fname.endswith(".yaml")
    )
    for cfg_file in cfg_files:
        cfg_init = cfg.clone()
        try:
            cfg.merge_from_file(cfg_file)
            complexity[cfg_file] = net.complexity(builders.get_model())
        finally:
            # Never leave the global config modified, not even on failure.
            cfg.merge_from_other_cfg(cfg_init)
    with open(_COMPLEXITY_FILE, "w") as file:
        json.dump(complexity, file, sort_keys=True, indent=4)
Esempio n. 8
0
def setup_model():
    """Build the model, log its complexity, and move it to the current GPU.

    The model itself is logged only when ``cfg.VERBOSE`` is set. When
    ``cfg.NUM_GPUS`` is greater than 1 the model is wrapped in
    ``DistributedDataParallel`` bound to the current device.

    Returns:
        The model on the current CUDA device, possibly DDP-wrapped.
    """
    net_model = builders.build_model()
    if cfg.VERBOSE:
        logger.info("Model:\n{}".format(net_model))

    complexity_data = net.complexity(net_model)
    logger.info(logging.dump_log_data(complexity_data, "complexity"))

    # The config must not request more GPUs than this machine exposes
    assert (
        cfg.NUM_GPUS <= torch.cuda.device_count()
    ), "Cannot use more GPU devices than available"
    device = torch.cuda.current_device()
    net_model = net_model.cuda(device=device)

    if cfg.NUM_GPUS > 1:
        # Multi-GPU setting: each process drives a replica on its own device
        return torch.nn.parallel.DistributedDataParallel(
            module=net_model, device_ids=[device], output_device=device
        )
    return net_model
Esempio n. 9
0
def test_model():
    """Evaluate the model using the weights at ``cfg.TEST.WEIGHTS``.

    Sets up logging, seeds the NumPy and torch RNGs from ``cfg.RNG_SEED``,
    builds the model and logs its complexity, optionally measures precise
    timings (``cfg.PREC_TIME.ENABLED``), loads the weights, and runs a
    single test epoch over the test loader.
    """
    # Setup logging and show the config
    logging.setup_logging()
    logger.info("Config:\n{}".format(cfg))

    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK

    # The model is built before the loaders to speed up debugging
    net_model = builders.build_model()
    logger.info("Model:\n{}".format(net_model))
    complexity_stats = net.complexity(net_model)
    logger.info(logging.dump_json_stats(complexity_stats))

    # Optional precise timing; BN stats are reset once it is done
    if cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        criterion = builders.build_loss_fun()
        timing_stats = net.compute_precise_time(net_model, criterion)
        logger.info(logging.dump_json_stats(timing_stats))
        net.reset_bn_stats(net_model)

    # Load the weights to evaluate
    weights_path = cfg.TEST.WEIGHTS
    checkpoint.load_checkpoint(weights_path, net_model)
    logger.info("Loaded model weights from: {}".format(cfg.TEST.WEIGHTS))

    # Data loader and the meter sized to it
    eval_loader = loader.construct_test_loader()
    eval_meter = meters.TestMeter(len(eval_loader))

    # Evaluate as epoch 0
    test_epoch(eval_loader, net_model, eval_meter, 0)
Esempio n. 10
0
def setup_model():
    """Build the model, log it with its complexity, and move it to the current GPU.

    When ``cfg.NUM_GPUS`` is greater than 1 the model is wrapped in
    ``DistributedDataParallel`` (with ``find_unused_parameters=True``) and
    the wrapper gets a ``complexity`` attribute that points at the wrapped
    module's ``complexity``.

    Returns:
        The model on the current CUDA device, possibly DDP-wrapped.
    """
    net_model = builders.build_model()
    logger.info("Model:\n{}".format(net_model))
    complexity_stats = net.complexity(net_model)
    logger.info(logging.dump_json_stats(complexity_stats))

    # The config must not request more GPUs than this machine exposes
    assert (
        cfg.NUM_GPUS <= torch.cuda.device_count()
    ), "Cannot use more GPU devices than available"
    device = torch.cuda.current_device()
    net_model = net_model.cuda(device=device)

    if not cfg.NUM_GPUS > 1:
        return net_model

    # Multi-GPU setting: each process drives a replica on its own device
    ddp_kwargs = {
        "module": net_model,
        "device_ids": [device],
        "output_device": device,
        "find_unused_parameters": True,
    }
    wrapped = torch.nn.parallel.DistributedDataParallel(**ddp_kwargs)
    # Expose the inner module's complexity function on the wrapper
    wrapped.complexity = wrapped.module.complexity
    return wrapped
Esempio n. 11
0
def setup_model():
    """Build the model and optimizer on the current NPU device.

    Builds the model (logged only when ``cfg.VERBOSE`` is set), logs its
    complexity, moves it to the current ``torch.npu`` device, constructs the
    optimizer, and passes both through ``amp.initialize`` with
    ``opt_level="O2"`` and ``loss_scale=128``. When ``cfg.NUM_GPUS`` is
    greater than 1 the model is additionally wrapped in
    ``DistributedDataParallel`` with ``broadcast_buffers=False``.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    net_model = builders.build_model()
    if cfg.VERBOSE:
        logger.info("Model:\n{}".format(net_model))

    complexity_data = net.complexity(net_model)
    logger.info(logging.dump_log_data(complexity_data, "complexity"))

    # The config must not request more devices than torch.npu reports
    assert (
        cfg.NUM_GPUS <= torch.npu.device_count()
    ), "Cannot use more GPU devices than available"
    device = torch.npu.current_device()
    net_model = net_model.to(device)

    # The optimizer is built after the move and handed to amp with the model
    opt = optim.construct_optimizer(net_model)
    net_model, opt = amp.initialize(
        net_model, opt, opt_level="O2", loss_scale=128
    )

    if cfg.NUM_GPUS > 1:
        # Multi-device setting: each process drives a replica on its own device
        net_model = torch.nn.parallel.DistributedDataParallel(
            net_model, device_ids=[device], broadcast_buffers=False
        )

    return net_model, opt
Esempio n. 12
0
def train_model():
    """Train the model for the epochs configured in ``cfg``.

    Sets up logging, seeds the NumPy and torch RNGs from ``cfg.RNG_SEED``,
    builds the model, loss function and optimizer, then either resumes from
    the last checkpoint (``cfg.TRAIN.AUTO_RESUME``) or loads the initial
    weights in ``cfg.TRAIN.WEIGHTS``. Precise timings are measured only on a
    fresh start with ``cfg.PREC_TIME.ENABLED`` set. Each epoch trains,
    optionally recomputes precise BN stats, and saves a checkpoint and
    evaluates on the epochs selected by ``checkpoint.is_checkpoint_epoch``
    and ``is_eval_epoch``.
    """
    # Setup logging and show the config
    logging.setup_logging()
    logger.info("Config:\n{}".format(cfg))

    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK

    # The model is built before the loaders to speed up debugging
    net_model = builders.build_model()
    logger.info("Model:\n{}".format(net_model))
    complexity_stats = net.complexity(net_model)
    logger.info(logging.dump_json_stats(complexity_stats))

    # Loss function and optimizer
    criterion = builders.build_loss_fun()
    opt = optim.construct_optimizer(net_model)

    # Resume from the last checkpoint, or start from the initial weights
    first_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        resume_path = checkpoint.get_last_checkpoint()
        resumed_epoch = checkpoint.load_checkpoint(resume_path, net_model, opt)
        logger.info("Loaded checkpoint from: {}".format(resume_path))
        first_epoch = resumed_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, net_model)
        logger.info("Loaded initial weights from: {}".format(cfg.TRAIN.WEIGHTS))

    # Precise timing runs only on a fresh start; BN stats are reset afterwards
    if first_epoch == 0 and cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        timing_stats = net.compute_precise_time(net_model, criterion)
        logger.info(logging.dump_json_stats(timing_stats))
        net.reset_bn_stats(net_model)

    # Data loaders
    train_loader = loader.construct_train_loader()
    eval_loader = loader.construct_test_loader()

    # Meters sized to their loaders
    train_meter = meters.TrainMeter(len(train_loader))
    eval_meter = meters.TestMeter(len(eval_loader))

    # Epochs are 0-based internally and reported 1-based
    logger.info("Start epoch: {}".format(first_epoch + 1))

    for epoch in range(first_epoch, cfg.OPTIM.MAX_EPOCH):
        # One pass over the training data
        train_epoch(train_loader, net_model, criterion, opt, train_meter, epoch)
        # Optionally recompute the BN statistics
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(net_model, train_loader)
        # Checkpoint on the configured epochs
        if checkpoint.is_checkpoint_epoch(epoch):
            saved_path = checkpoint.save_checkpoint(net_model, opt, epoch)
            logger.info("Wrote checkpoint to: {}".format(saved_path))
        # Evaluate on the configured epochs
        if is_eval_epoch(epoch):
            test_epoch(eval_loader, net_model, eval_meter, epoch)
Esempio n. 13
0
def test_complexity(key):
    """Return the complexity of the model configured by ``key``.

    Args:
        key: Config file path relative to ``_PYCLS_DIR``.

    Returns:
        The result of ``net.complexity`` for the model selected by that
        config, measured after resetting the config and merging the file.
    """
    # Start from a clean config so earlier calls cannot influence this one
    reset_cfg()
    merge_from_file(os.path.join(_PYCLS_DIR, key))
    model_cls = builders.get_model()
    return net.complexity(model_cls)