Beispiel #1
0
def test_model(test_loader, eval_checkpoint_file, ssl_checkpoint_file, cfg,
               cur_episode):
    """Evaluate a trained SSL-evaluator checkpoint on *test_loader*.

    Builds the evaluator head (linear probe or MLP depending on
    cfg.MODEL.TYPE), loads the evaluator and the SSL backbone from their
    checkpoint files, runs one test epoch, and returns the accuracy.

    Args:
        test_loader: iterable of test batches (project data loader).
        eval_checkpoint_file: path to the evaluator-head checkpoint.
        ssl_checkpoint_file: path to the SSL backbone checkpoint.
        cfg: project config (reads MODEL.TYPE / NUM_INPUT / NUM_OUTPUT /
            NUM_HIDDEN).
        cur_episode: episode index forwarded to test_epoch for logging.

    Returns:
        float: test accuracy in percent, i.e. 100 - test error.
    """
    # NOTE: the old `global plot_*` declarations were removed — those
    # names were never referenced in this function.
    test_meter = TestMeter(len(test_loader))

    # 'linear' means a single linear probe; anything else gets one hidden
    # layer of cfg.MODEL.NUM_HIDDEN units.
    if cfg.MODEL.TYPE == 'linear':
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=None)
    else:
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=cfg.MODEL.NUM_HIDDEN)
    model = cu.load_checkpoint(eval_checkpoint_file, model)

    ssl_model = model_builder.build_model(cfg)
    ssl_model = cu.load_checkpoint(ssl_checkpoint_file, ssl_model)

    test_err = test_epoch(test_loader, model, ssl_model, test_meter,
                          cur_episode)
    return 100. - test_err
Beispiel #2
0
def setup_model():
    """Build the model, log its size/complexity, and place it on GPU(s)."""
    net = builders.build_model()
    logger.info("Model:\n{}".format(net))
    # Profiling resolution: cityscapes segmentation uses a fixed odd-sized
    # crop, everything else profiles at the (square) training image size.
    seg_cityscapes = cfg.TASK == "seg" and cfg.TRAIN.DATASET == "cityscapes"
    if seg_cityscapes:
        height, width = 1025, 2049
    else:
        height, width = cfg.TRAIN.IM_SIZE, cfg.TRAIN.IM_SIZE
    # Jigsaw models consume a stack of JIGSAW_GRID**2 tiles per example.
    if cfg.TASK == "jig":
        dummy_shape = (1, cfg.JIGSAW_GRID ** 2, cfg.MODEL.INPUT_CHANNELS,
                       height, width)
    else:
        dummy_shape = (1, cfg.MODEL.INPUT_CHANNELS, height, width)
    dummy = torch.randn(*dummy_shape)
    flops, n_params = profile(net, inputs=(dummy, ), verbose=False)
    logger.info("Params: {:,}".format(n_params))
    logger.info("Flops: {:,}".format(flops))
    # Move to the current GPU; asking for more GPUs than the machine
    # exposes is a configuration error.
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    device = torch.cuda.current_device()
    net = net.cuda(device=device)
    if cfg.NUM_GPUS > 1:
        # Multi-GPU: wrap in DDP so each replica runs on its own device.
        net = torch.nn.parallel.DistributedDataParallel(
            module=net, device_ids=[device], output_device=device
        )
    return net
Beispiel #3
0
def test_model(test_loader, checkpoint_file, cfg, cur_episode):
    """Test a model checkpoint and update the episode-accuracy plot.

    Loads *checkpoint_file* into a freshly built model, runs one test
    epoch, appends (episode, accuracy) to the global episode plot series,
    writes the plot and its raw values under cfg.EXP_DIR, and returns the
    accuracy.

    Args:
        test_loader: iterable of test batches.
        checkpoint_file: path to the model checkpoint to evaluate.
        cfg: project config (reads DATASET.NAME and EXP_DIR).
        cur_episode: active-learning episode index (plot x-axis value).

    Returns:
        float: test accuracy in percent, i.e. 100 - test error.
    """
    # Only the episode-level series is read/written here; the old
    # `global plot_epoch_* / plot_it_*` declarations were dead and removed.
    global plot_episode_xvalues
    global plot_episode_yvalues

    test_meter = TestMeter(len(test_loader))

    model = model_builder.build_model(cfg)
    model = cu.load_checkpoint(checkpoint_file, model)

    test_err = test_epoch(test_loader, model, test_meter, cur_episode)
    test_acc = 100. - test_err

    plot_episode_xvalues.append(cur_episode)
    plot_episode_yvalues.append(test_acc)

    plot_arrays(x_vals=plot_episode_xvalues, y_vals=plot_episode_yvalues,
                x_name="Episodes", y_name="Test Accuracy",
                dataset_name=cfg.DATASET.NAME, out_dir=cfg.EXP_DIR)

    save_plot_values([plot_episode_xvalues, plot_episode_yvalues],
                     ["plot_episode_xvalues", "plot_episode_yvalues"],
                     out_dir=cfg.EXP_DIR)

    return test_acc
Beispiel #4
0
def ensemble_test_model(test_loader, checkpoint_file, cfg, cur_episode):
    """Evaluate one ensemble member's checkpoint; return accuracy in %."""
    meter = TestMeter(len(test_loader))

    # Build the member from cfg and restore its weights.
    net = model_builder.build_model(cfg)
    net = cu.load_checkpoint(checkpoint_file, net)

    error = test_epoch(test_loader, net, meter, cur_episode)
    return 100. - error
Beispiel #5
0
def main():
    """Load the config, build a model, and report output shape/complexity."""
    config.load_cfg_fom_args("Train a classification model.")
    config.assert_and_infer_cfg()
    cfg.freeze()
    print("building model {}".format(cfg.MODEL.TYPE))
    net = build_model()
    net.eval()
    # One 224x224 3-channel dummy input sanity-checks the forward pass.
    dummy = torch.randn(1, 3, 224, 224)
    out = net(dummy)
    print(out.shape)
    print(complexity(net))
def predict(args):
    """Classify segments of a sound file with a pretrained RegNet.

    Configures the global pycls cfg for the trained architecture, loads
    classifier weights from args.classify_model, converts the audio at
    args.sound_file into a log-mel spectrogram, and prints the top-5
    labels (with probabilities) for every SEGMENT_SIZE-wide segment.

    Args:
        args: namespace with `classify_model` (checkpoint path) and
            `sound_file` (audio path).
    """
    # Architecture hyper-parameters — must match the trained checkpoint.
    cfg.MODEL.TYPE = "regnet"
    cfg.REGNET.DEPTH = 25
    cfg.REGNET.SE_ON = False
    cfg.REGNET.W0 = 112
    cfg.REGNET.WA = 33.22
    cfg.REGNET.WM = 2.27
    cfg.REGNET.GROUP_W = 72
    cfg.BN.NUM_GROUPS = 4
    cfg.ANYNET.STEM_CHANNELS = 1
    cfg.MODEL.NUM_CLASSES = 10958
    net = builders.build_model()
    net.load_state_dict(torch.load(args.classify_model, map_location="cpu"))
    net.eval()

    softmax = nn.Softmax(dim=1)
    label_map = load_label_file()

    # Load audio file to np.array
    audio, sr = librosa.load(args.sound_file,
                             mono=True,
                             offset=1.1,
                             sr=CFG.sample_rate)
    logmel = librosa.feature.melspectrogram(audio,
                                            sr,
                                            n_mels=CFG.n_mels,
                                            fmax=CFG.fmax)
    S_dB = librosa.power_to_db(logmel, ref=np.max)

    aug = augment.Augment(training=False)
    segs = S_dB.shape[1] // SEGMENT_SIZE
    for index in range(segs):
        begin = index * SEGMENT_SIZE
        end = begin + SEGMENT_SIZE
        if end > S_dB.shape[1]:
            # NOTE(review): unreachable given segs = width // SEGMENT_SIZE;
            # kept as a defensive guard.
            print(f"{end} is out of range {S_dB.shape[1]} [{args.sound_file}]")
            continue
        sample = S_dB[:, begin:end].copy()
        sample = torch.from_numpy(sample)
        # (mels, time) -> (1, mels, time, 1) -> NCHW float for the net.
        sample = sample.unsqueeze(0).unsqueeze(3)
        sample = aug(sample)
        sample = sample.permute(0, 3, 1, 2).float()
        # Inference only: disable autograd to avoid building a graph
        # (fix — the original ran the forward pass with grad enabled).
        with torch.no_grad():
            result = net(sample)
            result = softmax(result)
        values, indices = torch.topk(result, 5)
        print("-----------------------------------------------")
        for ind, val in zip(indices[0], values[0]):
            ind = ind.item()
            print(ind, label_map[ind], f"({val.item()*100:.2f}%)")
 def __init__(self, num_classes=1, ckpt=None):
     """Wrap a pycls RegNet backbone with a fresh classification head.

     Args:
         num_classes: output dimension of the replacement head (default 1).
         ckpt: optional checkpoint path; its 'model_state' dict is loaded
             into the backbone before the head is swapped.
     """
     super(Regnet, self).__init__()
     # Local imports keep pycls an optional dependency of this module.
     from pycls.core.config import cfg
     import pycls.core.config as model_config
     from pycls.core.builders import build_model
     # NOTE(review): load_cfg_fom_args parses CLI arguments at construction
     # time — confirm this is intended when instantiating from library code.
     model_config.load_cfg_fom_args("Train a cls model")
     cfg.freeze()
     model = build_model()
     if ckpt:
         model.load_state_dict(torch.load(ckpt)['model_state'])
     # Replace the final fully-connected layer with one sized for our task.
     in_features = model.head.fc.in_features
     fc = nn.Linear(in_features, num_classes)
     self.model = model
     self.model.head.fc = fc
Beispiel #8
0
def build_model(name, pretrained=False, cfg_list=()):
    """Constructs a predefined model (note: loads global config as well)."""
    # Start from a clean config, then layer on the named preset and any
    # caller-supplied key/value overrides.
    reset_cfg()
    cfg.merge_from_file(get_config_file(name))
    cfg.merge_from_list(cfg_list)
    model = builders.build_model()
    if pretrained:
        # Apply the matching pretrained weights in place.
        cp.load_checkpoint(get_weights_file(name), model)
    return model
Beispiel #9
0
def test_model(test_loader, checkpoint_file, cfg, cur_episode):
    """Evaluate a model checkpoint on *test_loader*.

    Builds the model from cfg, loads *checkpoint_file*, runs one test
    epoch, and returns the accuracy.

    Args:
        test_loader: iterable of test batches.
        checkpoint_file: path to the checkpoint to evaluate.
        cfg: project config used to build the model.
        cur_episode: episode index forwarded to test_epoch for logging.

    Returns:
        float: test accuracy in percent, i.e. 100 - test error.
    """
    # NOTE: the old `global plot_*` declarations were removed — none of
    # those names were referenced in this function.
    test_meter = TestMeter(len(test_loader))

    model = model_builder.build_model(cfg)
    model = cu.load_checkpoint(checkpoint_file, model)

    test_err = test_epoch(test_loader, model, test_meter, cur_episode)
    return 100. - test_err
Beispiel #10
0
def setup_model():
    """Sets up a model for training or testing and log the results.

    Builds the model, logs it when cfg.VERBOSE is set, logs its
    complexity, moves it to the current GPU, and wraps it in
    DistributedDataParallel when more than one GPU is configured.

    Returns:
        The model on its GPU device (DDP-wrapped in the multi-GPU case).
    """
    # Build the model
    model = builders.build_model()
    # Idiom fix: was `logger.info(...) if cfg.VERBOSE else ()` — a
    # conditional expression used as a statement.
    if cfg.VERBOSE:
        logger.info("Model:\n{}".format(model))
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    # Transfer the model to the current GPU device
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    cur_device = torch.cuda.current_device()
    model = model.cuda(device=cur_device)
    # Use multi-process data parallel model in the multi-gpu setting
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(module=model, device_ids=[cur_device], output_device=cur_device)
    return model
 def __init__(self, num_clusters, num_tiles, num_classes, ckpt):
     """RegNet encoder + NetVLAD pooling over tiles + linear classifier.

     Args:
         num_clusters: number of NetVLAD clusters.
         num_tiles: number of tiles (frames) pooled per example.
         num_classes: output dimension of the final linear layer.
         ckpt: optional checkpoint path; its 'model_state' dict is loaded
             into the backbone when truthy.
     """
     super().__init__()
     # Local imports keep pycls an optional dependency of this module.
     from pycls.core.config import cfg
     import pycls.core.config as model_config
     from pycls.core.builders import build_model
     # NOTE(review): load_cfg_fom_args parses CLI arguments at construction
     # time — confirm this is intended when instantiating from library code.
     model_config.load_cfg_fom_args("Train a cls model")
     cfg.freeze()
     model = build_model()
     if ckpt:
         model.load_state_dict(torch.load(ckpt)['model_state'])
     # Backbone through stage 4, global-average-pooled and flattened to a
     # per-tile feature vector (with dropout).
     self.enc = nn.Sequential(model.stem, model.s1, model.s2, model.s3,
                              model.s4,
                              nn.AdaptiveAvgPool2d(output_size=(1, 1)),
                              nn.Flatten(), nn.Dropout(p=0.3))
     # Feature size is read from the (discarded) classification head.
     self.nc = model.head.fc.in_features
     self.netvlad = NetVLAD(cluster_size=num_clusters,
                            max_frames=num_tiles,
                            feature_size=self.nc,
                            truncate=False)
     self.fc = nn.Linear(num_clusters * self.nc, num_classes)
Beispiel #12
0
def test_model():
    """Evaluates the model."""
    # Logging first so everything below is captured.
    logging.setup_logging()
    logger.info("Config:\n{}".format(cfg))

    # Fix RNG seeds for reproducibility (see RNG comment in core/config.py)
    # and configure the CUDNN backend.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK

    # Build the model before the loaders to speed up debugging.
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    logger.info(logging.dump_json_stats(net.complexity(model)))

    # Optional precise-timing pass; it perturbs BN statistics, so they are
    # reset afterwards.
    if cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        prec_time = net.compute_precise_time(model, builders.build_loss_fun())
        logger.info(logging.dump_json_stats(prec_time))
        net.reset_bn_stats(model)

    # Load the weights under test.
    checkpoint.load_checkpoint(cfg.TEST.WEIGHTS, model)
    logger.info("Loaded model weights from: {}".format(cfg.TEST.WEIGHTS))

    # Run one full evaluation epoch over the test set.
    eval_loader = loader.construct_test_loader()
    eval_meter = meters.TestMeter(len(eval_loader))
    test_epoch(eval_loader, model, eval_meter, 0)
Beispiel #13
0
def setup_model():
    """Sets up a model for training or testing and log the results."""
    # Build and describe the model.
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    logger.info(logging.dump_json_stats(net.complexity(model)))

    # Move onto the current GPU; requesting more GPUs than available is a
    # configuration error.
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    device = torch.cuda.current_device()
    model = model.cuda(device=device)

    if cfg.NUM_GPUS > 1:
        # Multi-GPU: each DDP replica operates on its own device;
        # find_unused_parameters tolerates parameters without gradients.
        model = torch.nn.parallel.DistributedDataParallel(
            module=model,
            device_ids=[device],
            output_device=device,
            find_unused_parameters=True)
        # Re-expose the wrapped module's complexity function on the wrapper.
        model.complexity = model.module.complexity
    return model
Beispiel #14
0
def setup_model():
    """Sets up a model for training or testing and log the results."""
    model = builders.build_model()
    if cfg.VERBOSE:
        logger.info("Model:\n{}".format(model))
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))

    # NPU variant: count and target Ascend NPU devices, not CUDA ones.
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.npu.device_count(), err_str
    device = torch.npu.current_device()
    model = model.to(device)

    # Mixed precision (apex O2, fixed loss scale); the optimizer must exist
    # before amp wraps model and optimizer together.
    optimizer = optim.construct_optimizer(model)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O2",
                                      loss_scale=128)

    if cfg.NUM_GPUS > 1:
        # Each DDP replica operates on its own device; buffers are not
        # broadcast between replicas.
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(model, device_ids=[device], broadcast_buffers=False)

    return model, optimizer
Beispiel #15
0
def train_model():
    """Trains the model.

    Full pipeline: seeds the RNGs, builds model/loss/optimizer, resumes
    from the last checkpoint (or loads initial weights), then per epoch:
    trains, optionally recomputes precise BN stats, saves checkpoints, and
    evaluates on eval epochs.
    """

    # Setup logging
    logging.setup_logging()
    # Show the config
    logger.info("Config:\n{}".format(cfg))

    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK

    # Build the model (before the loaders to speed up debugging)
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    logger.info(logging.dump_json_stats(net.complexity(model)))

    # Define the loss function
    loss_fun = builders.build_loss_fun()
    # Construct the optimizer
    optimizer = optim.construct_optimizer(model)

    # Load checkpoint or initial weights; AUTO_RESUME takes precedence
    # over TRAIN.WEIGHTS when a checkpoint exists.
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model,
                                                      optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        # Resume from the epoch after the checkpointed one.
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))

    # Compute precise time only on a fresh run; the timing pass perturbs
    # BN statistics, so they are reset afterwards.
    if start_epoch == 0 and cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        prec_time = net.compute_precise_time(model, loss_fun)
        logger.info(logging.dump_json_stats(prec_time))
        net.reset_bn_stats(model)

    # Create data loaders
    train_loader = loader.construct_train_loader()
    test_loader = loader.construct_test_loader()

    # Create meters
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))

    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if checkpoint.is_checkpoint_epoch(cur_epoch):
            checkpoint_file = checkpoint.save_checkpoint(
                model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        if is_eval_epoch(cur_epoch):
            test_epoch(test_loader, model, test_meter, cur_epoch)
Beispiel #16
0
def main(cfg):
    """Train and test an SSL rotation-prediction evaluator end to end.

    Sets up wandb logging, output directories, and data, builds the
    evaluator head and the SSL backbone (from cfg.TEST.MODEL_PATH), trains
    the evaluator via train_model, then tests the best checkpoint.
    """

    # Login to wandb
    wandb.login()

    # Initialize a new wandb run
    wandb.init(project="rotation-pred", name=cfg.EXP_NAME)

    # Setting up GPU args
    # NOTE(review): use_cuda/device/kwargs are computed here but never used
    # below — confirm whether the loaders were meant to receive kwargs.
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    # NOTE(review): the requested GPU is selected via NVIDIA_VISIBLE_DEVICES
    # and then remapped to logical device '0' via CUDA_VISIBLE_DEVICES.
    os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.makedirs(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stroed here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        # 'auto' names the experiment directory by the current timestamp.
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND SSL EVALUATION MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    # The whole train set is used (indices 0..train_size-1).
    trainSet = [i for i in range(train_size)]

    print("\n Rotation Dataset {} Loaded Sucessfully.\nTotal Train Size: {}\n".
          format(cfg.DATASET.NAME, train_size))
    logger.info(
        "Rotation Dataset {} Loaded Sucessfully. Total Train Size: {}\n".
        format(cfg.DATASET.NAME, train_size))

    # Persist the index set, then reload it through the standard path.
    trainSet_path = data_obj.saveSet(setArray=trainSet,
                                     setName='trainSet',
                                     save_dir=cfg.EXP_DIR)
    trainSet = data_obj.loadPartition(setPath=trainSet_path)

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getSequentialDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

    # Initialize the evaluation model: a linear probe, or an MLP with
    # cfg.MODEL.NUM_HIDDEN hidden units for any other MODEL.TYPE.
    if cfg.MODEL.TYPE == 'linear':
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=None)
    else:
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=cfg.MODEL.NUM_HIDDEN)
    print("Evaluation model: {}\n".format(cfg.MODEL.EVAL))
    logger.info("Evalution model: {}\n".format(cfg.MODEL.EVAL))

    # Initialize the SSL model from the configured checkpoint.
    ssl_model = model_builder.build_model(cfg)
    ssl_checkpoint_file = os.path.join(os.path.abspath('..'),
                                       cfg.TEST.MODEL_PATH)
    ssl_model = cu.load_checkpoint(ssl_checkpoint_file, ssl_model)

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train model
    print("======== EVALUATOR TRAINING ========")
    logger.info("======== EVALUATOR TRAINING ========")

    _, _, eval_checkpoint_file = train_model(trainSet_loader, None, model,
                                             ssl_model, optimizer, cfg)

    # Test best model checkpoint
    # NOTE(review): testing runs on the training loader — confirm this is
    # intended (no separate test split is constructed here).
    print("======== EVALUATOR TESTING ========\n")
    logger.info("======== EVALUATOR TESTING ========\n")

    test_acc = test_model(trainSet_loader,
                          eval_checkpoint_file,
                          ssl_checkpoint_file,
                          cfg,
                          cur_episode=1)
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")
Beispiel #17
0
def train(args, train_loader, eval_loader):
    """Train the RegNet sound classifier.

    Configures the global pycls cfg for the architecture, optionally
    resumes from / finetunes a checkpoint, then runs the iteration loop:
    label-smoothed cross-entropy, gradient clipping, LR warmup, periodic
    evaluation with ReduceLROnPlateau stepping, and periodic checkpoints.

    Args:
        args: CLI namespace (resume, finetune, lr, momentum, fp16,
            batch_size, max_epoch).
        train_loader: training data loader.
        eval_loader: evaluation data loader.
    """
    # Architecture hyper-parameters — must match any resumed checkpoint.
    cfg.MODEL.TYPE = "regnet"
    cfg.REGNET.DEPTH = 25
    cfg.REGNET.SE_ON = False
    cfg.REGNET.W0 = 112
    cfg.REGNET.WA = 33.22
    cfg.REGNET.WM = 2.27
    cfg.REGNET.GROUP_W = 72
    cfg.BN.NUM_GROUPS = 4
    cfg.ANYNET.STEM_CHANNELS = 1
    cfg.MODEL.NUM_CLASSES = config["num_classes"]
    net = builders.build_model()
    net = net.cuda(device=torch.cuda.current_device())
    print("net", net)

    if args.resume:
        print("Resuming training, loading {}...".format(args.resume))
        ckpt_file = (
            config["save_folder"]
            + config["ckpt_name"]
            + "_"
            + str(args.resume)
            + ".pth"
        )
        net.load_state_dict(torch.load(ckpt_file))

    if args.finetune:
        print("Finetuning......")
        # Freeze all layers
        for param in net.parameters():
            param.requires_grad = False
        # Unfreeze some layers
        for layer in [net.s4.b1, net.s4.b2]:
            for param in layer.parameters():
                # BUG FIX: was `param.requies_grad = True` (typo) — it set a
                # dead attribute and left these blocks frozen.
                param.requires_grad = True
        # NOTE(review): only the head's weight is unfrozen; the bias stays
        # frozen — confirm this is intended.
        net.head.fc.weight.requires_grad = True
        optimizer = optim.SGD(
            filter(lambda param: param.requires_grad, net.parameters()),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )
    else:
        optimizer = optim.SGD(
            net.parameters(),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )

    # Halve the LR when eval accuracy plateaus (absolute 1e-3 threshold).
    scheduler = ReduceLROnPlateau(
        optimizer,
        "max",
        factor=0.5,
        patience=1,
        verbose=True,
        threshold=1e-3,
        threshold_mode="abs",
    )

    aug = augment.Augment().cuda()

    if args.fp16:
        import apex.amp as amp

        net, optimizer = amp.initialize(net, optimizer, opt_level="O2")

    batch_iterator = iter(train_loader)
    sum_accuracy = 0
    step = 0
    # One "eval period" is one pass over the training set.
    config["eval_period"] = len(train_loader.dataset) // args.batch_size
    config["verbose_period"] = config["eval_period"] // 5

    train_start_time = time.time()
    for iteration in range(
        args.resume + 1,
        args.max_epoch * len(train_loader.dataset) // args.batch_size,
    ):
        t0 = time.time()
        try:
            sounds, type_ids = next(batch_iterator)
        except StopIteration:
            # Epoch boundary: restart the loader.
            batch_iterator = iter(train_loader)
            sounds, type_ids = next(batch_iterator)
        except Exception as ex:
            # NOTE(review): best-effort — on a loader error the previous
            # batch (if any) is reused below; kept as-is.
            print("Loading data exception:", ex)

        if torch.cuda.is_available():
            sounds = Variable(sounds.cuda())
            type_ids = Variable(type_ids.cuda())
        else:
            sounds = Variable(sounds)
            type_ids = Variable(type_ids)

        # (batch, mels, time) -> NCHW float input.
        sounds = sounds.unsqueeze(3)
        sounds = sounds.permute(0, 3, 1, 2).float()

        # Label smoothing: 0.5 on the true class, the rest spread evenly.
        if torch.cuda.is_available():
            one_hot = torch.cuda.FloatTensor(
                type_ids.shape[0], config["num_classes"]
            )
        else:
            one_hot = torch.FloatTensor(
                type_ids.shape[0], config["num_classes"]
            )
        one_hot.fill_(0.5 / (config["num_classes"] - 1))
        one_hot.scatter_(1, type_ids.unsqueeze(1), 0.5)

        # augmentation
        sounds = aug(sounds)
        # forward
        out = net(sounds)

        # backprop: smoothed cross-entropy
        optimizer.zero_grad()
        loss = torch.sum(-one_hot * F.log_softmax(out, -1), -1).mean()

        if args.fp16:
            import apex.amp as amp

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        nn.utils.clip_grad_norm_(net.parameters(), max_norm=20, norm_type=2)
        optimizer.step()
        t1 = time.time()

        if iteration % config["verbose_period"] == 0:
            # Running training accuracy on this batch.
            _, predict = torch.max(out, 1)
            correct = (predict == type_ids)
            accuracy = correct.sum().item() / correct.size()[0]
            print(
                "iter: %d loss: %.4f | acc: %.4f | time: %.4f sec."
                % (iteration, loss.item(), accuracy, (t1 - t0)),
                flush=True,
            )
            sum_accuracy += accuracy
            step += 1

        # Linear LR warmup over the first verbose period.
        warmup_steps = config["verbose_period"]
        if iteration < warmup_steps:
            warmup_learning_rate(optimizer, iteration, warmup_steps)

        if (
            iteration % config["eval_period"] == 0
            and iteration != 0
            and step != 0
        ):
            # Periodic evaluation; scheduler steps on eval accuracy.
            with torch.no_grad():
                loss, accuracy = evaluate(net, eval_loader)
            hours = int(time.time() - train_start_time) // 3600
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
            print(
                "[{}] [{}] Eval accuracy:{:6f} | Train accuracy:{:6f}".format(
                    now, hours, accuracy, sum_accuracy / step
                ),
                flush=True,
            )
            scheduler.step(accuracy)
            sum_accuracy = 0
            step = 0

        if iteration % config["eval_period"] == 0 and iteration != 0:
            # save checkpoint
            print("Saving state, iter:", iteration, flush=True)
            save_ckpt(net, iteration)

    # final checkpoint
    save_ckpt(net, iteration)
Beispiel #18
0
def main(cfg):
    """Train and test an ensemble of models for active learning.

    Sets up output directories and data splits, builds
    cfg.ENSEMBLE.NUM_MODELS models, trains each with its own optimizer and
    episode directory, tests each, and reports the mean test accuracy.
    """

    # Setting up GPU args
    # NOTE(review): use_cuda/device/kwargs are computed here but never used
    # below — confirm whether the loaders were meant to receive kwargs.
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME)
    if not os.path.exists(dataset_out_dir):
        os.mkdir(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stroed here
    # E.g., output/CIFAR10/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        # 'auto' names the experiment directory by the current timestamp.
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                               isTrain=False,
                                               isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    # Split train data into an initial labeled set and a validation set.
    trainSet_path, valSet_path = data_obj.makeTVSets(train_split_ratio=cfg.ACTIVE_LEARNING.INIT_RATIO, \
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data, seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    trainSet, valSet = data_obj.loadTVPartitions(trainSetPath=trainSet_path,
                                                 valSetPath=valSet_path)

    print("Data Partitioning Complete. \nTrain Set: {},  Validation Set: {}\n".
          format(len(trainSet), len(valSet)))
    logger.info("\nTrain Set: {},  Validation Set: {}\n".format(
        len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getIndexesDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data,
                                         test_batch_size=cfg.TRAIN.BATCH_SIZE,
                                         seed_id=cfg.RNG_SEED)

    # Initialize the models
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = []
    for i in range(num_ensembles):
        models.append(model_builder.build_model(cfg))

    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS,
                                                    cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(
        cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train models
    print("======== ENSEMBLE TRAINING ========")
    logger.info("======== ENSEMBLE TRAINING ========")

    best_model_paths = []
    test_accs = []
    for i in range(num_ensembles):
        print("=== Training ensemble [{}/{}] ===".format(i + 1, num_ensembles))

        # Construct the optimizer
        optimizer = optim.construct_optimizer(cfg, models[i])
        print("optimizer: {}\n".format(optimizer))
        logger.info("optimizer: {}\n".format(optimizer))

        # Each ensemble gets its own output directory
        # NOTE(review): 'model_{}   ' has trailing spaces in the directory
        # name — looks unintentional; confirm before changing (existing
        # runs may already use these paths).
        cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR,
                                       'model_{}   '.format(i + 1))

        # Train the model
        best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(
            trainSet_loader, valSet_loader, models[i], optimizer, cfg)
        best_model_paths.append(checkpoint_file)

        print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".
              format(i + 1, round(best_val_acc, 4), best_val_epoch))
        logger.info(
            "Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n".
            format(i + 1, round(best_val_acc, 4), best_val_epoch))

        # Test the model
        print("=== Testing ensemble [{}/{}] ===".format(i + 1, num_ensembles))
        test_acc = ensemble_test_model(test_loader,
                                       checkpoint_file,
                                       cfg,
                                       cur_episode=0)
        test_accs.append(test_acc)

        print("Test Accuracy by Model {}: {}.\n".format(
            i + 1, round(test_acc, 4)))
        logger.info("Test Accuracy by Model {}: {}.\n".format(i + 1, test_acc))

        # Reset EPISODE_DIR
        cfg.EPISODE_DIR = cfg.EXP_DIR

    # Test each best model checkpoint and report the average
    print("======== ENSEMBLE TESTING ========\n")
    logger.info("======== ENSEMBLE TESTING ========\n")

    mean_test_acc = np.mean(test_accs)
    print("Average Ensemble Test Accuracy: {}.\n".format(
        round(mean_test_acc, 4)))
    logger.info("Average Ensemble Test Accuracy: {}.\n".format(mean_test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")
Beispiel #19
0
def main(cfg):
    """Run ensemble-based active learning end to end.

    Per episode: train each ensemble member on the labeled set, test each
    member, report the mean test accuracy, then actively sample new points
    from the unlabeled pool using the ensemble as supporting models.

    Args:
        cfg: global config node; mutated in place to record derived paths
            (OUT_DIR, EXP_DIR, EPISODE_DIR, LSET/USET/VALSET paths).
    """

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # NOTE(review): `device` and `kwargs` are prepared here but not used
    # anywhere below in this function.
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Pin the process to the configured GPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output");
    # resolved relative to the parent of the current working directory.
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                               isTrain=False,
                                               isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    # Build the initial labeled pool: either a uniform random L/U/val split,
    # or a task-specific InitialPool sampler followed by a U/val split.
    print("\nSampling Initial Pool using {}.".format(
        str.upper(cfg.INIT_POOL.SAMPLING_FN)))
    logger.info("\nSampling Initial Pool using {}.".format(
        str.upper(cfg.INIT_POOL.SAMPLING_FN)))
    if cfg.INIT_POOL.SAMPLING_FN == 'random':
        lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(train_split_ratio=cfg.INIT_POOL.INIT_RATIO, \
     val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data, seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)
    else:
        lSet, uSet = InitialPool(cfg).sample_from_uSet(train_data)
        lSet_path = f'{cfg.EXP_DIR}/lSet.npy'
        np.save(lSet_path, lSet)
        # Keep an immutable copy of the initial labeled set for later analysis.
        np.save(f'{cfg.EXP_DIR}/lSet_initial.npy', lSet)
        uSet_path, valSet_path = data_obj.makeUVSets(
            val_split_ratio=cfg.DATASET.VAL_RATIO,
            data=uSet,
            seed_id=cfg.RNG_SEED,
            save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH, \
            uSetPath=cfg.ACTIVE_LEARNING.USET_PATH, valSetPath = cfg.ACTIVE_LEARNING.VALSET_PATH)

    print(
        "Data Partitioning Complete. \nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n"
        .format(len(lSet), len(uSet), len(valSet)))
    logger.info(
        "Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(
            len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(
        indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    uSet_loader = data_obj.getIndexesDataLoader(
        indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data,
                                         test_batch_size=cfg.TRAIN.BATCH_SIZE,
                                         seed_id=cfg.RNG_SEED)

    # Initialize one model per ensemble member.
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = []
    for i in range(num_ensembles):
        models.append(model_builder.build_model(cfg))
    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS,
                                                    cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(
        cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    print("Max AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("Max AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.MAX_ITER))

    # MAX_ITER + 1 episodes: the final episode only trains/tests (no sampling).
    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):

        wandb.log({"Episode": cur_episode})

        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info(
            "======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        if not os.path.exists(episode_dir):
            os.mkdir(episode_dir)
        cfg.EPISODE_DIR = episode_dir

        # Train models
        print("======== ENSEMBLE TRAINING ========")
        logger.info("======== ENSEMBLE TRAINING ========")

        best_model_paths = []
        test_accs = []
        for i in range(num_ensembles):
            print("=== Training ensemble [{}/{}] ===".format(
                i + 1, num_ensembles))

            # Construct the optimizer (fresh per member, per episode).
            optimizer = optim.construct_optimizer(cfg, models[i])
            print("optimizer: {}\n".format(optimizer))
            logger.info("optimizer: {}\n".format(optimizer))

            # Each ensemble gets its own output directory
            cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR,
                                           'model_{}'.format(i + 1))

            # Train the model
            best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(
                lSet_loader, valSet_loader, models[i], optimizer, cfg)
            best_model_paths.append(checkpoint_file)

            print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".
                  format(i + 1, round(best_val_acc, 4), best_val_epoch))
            logger.info(
                "EPISODE {} Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n"
                .format(cur_episode, i + 1, round(best_val_acc, 4),
                        best_val_epoch))

            # Test the model
            print("=== Testing ensemble [{}/{}] ===".format(
                i + 1, num_ensembles))
            test_acc = ensemble_test_model(test_loader, checkpoint_file, cfg,
                                           cur_episode)
            test_accs.append(test_acc)

            print("Test Accuracy by Model {}: {}.\n".format(
                i + 1, round(test_acc, 4)))
            logger.info("EPISODE {} Test Accuracy by Model {}: {}.\n".format(
                cur_episode, i + 1, test_acc))

            # Reset EPISODE_DIR (undo the per-member 'model_i' suffix).
            cfg.EPISODE_DIR = episode_dir

        # Report the average test accuracy over all ensemble members.
        print("======== ENSEMBLE TESTING ========\n")
        logger.info("======== ENSEMBLE TESTING ========\n")
        mean_test_acc = np.mean(test_accs)
        print("Average Ensemble Test Accuracy: {}.\n".format(
            round(mean_test_acc, 4)))
        logger.info("EPISODE {} Average Ensemble Test Accuracy: {}.\n".format(
            cur_episode, mean_test_acc))
        wandb.log({"Test Accuracy": mean_test_acc})

        # Module-level plot accumulators shared with the plotting helpers.
        global plot_episode_xvalues
        global plot_episode_yvalues

        global plot_epoch_xvalues
        global plot_epoch_yvalues

        global plot_it_x_values
        global plot_it_y_values

        plot_episode_xvalues.append(cur_episode)
        plot_episode_yvalues.append(mean_test_acc)

        plot_arrays(x_vals=plot_episode_xvalues, y_vals=plot_episode_yvalues, \
            x_name="Episodes", y_name="Test Accuracy", dataset_name=cfg.DATASET.NAME, out_dir=cfg.EXP_DIR)

        save_plot_values([plot_episode_xvalues, plot_episode_yvalues], \
            ["plot_episode_xvalues", "plot_episode_yvalues"], out_dir=cfg.EXP_DIR, saveInTextFormat=True)

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            break

        # Active Sample: reload each member's best checkpoint and let the
        # sampler use the whole ensemble as supporting models.
        print("======== ENSEMBLE ACTIVE SAMPLING ========\n")
        logger.info("======== ENSEMBLE ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_models = []
        for i in range(num_ensembles):
            temp = model_builder.build_model(cfg)
            clf_models.append(cu.load_checkpoint(best_model_paths[i], temp))

        activeSet, new_uSet = al_obj.sample_from_uSet(
            None, lSet, uSet, train_data, supportingModels=clf_models)

        # Save current lSet, new_uSet and activeSet in the episode directory
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloader for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(
            indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(
            indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        uSet_loader = data_obj.getSequentialDataLoader(
            indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print(
            "Ensemble Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info(
            "Ensemble Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")
Beispiel #20
0
def train(args, train_loader, eval_loader):
    """Train a RegNet classifier with label smoothing, optional finetuning,
    optional mixed precision (apex amp), LR warmup, and plateau LR decay.

    Args:
        args: namespace with resume (int checkpoint id or falsy), finetune,
            lr, momentum, fp16, batch_size, max_epoch, iter_size.
        train_loader: yields (images, type_ids) batches; images are HWC and
            permuted to CHW here.
        eval_loader: loader handed to evaluate() every ``eval_period`` iters.

    Side effects: saves checkpoints via save_ckpt() and prints progress.
    """
    # Configure the architecture through the global cfg node, then build it.
    cfg.MODEL.TYPE = "regnet"
    cfg.REGNET.DEPTH = 20
    cfg.REGNET.SE_ON = False
    cfg.REGNET.W0 = 512
    cfg.MODEL.NUM_CLASSES = config["num_classes"]
    net = model_builder.build_model()
    net = net.cuda(device=torch.cuda.current_device())
    print("net", net)
    if args.resume:
        print("Resuming training, loading {}...".format(args.resume))
        ckpt_file = (config["save_folder"] + config["ckpt_name"] + "_" +
                     str(args.resume) + ".pth")
        net.load_state_dict(torch.load(ckpt_file))

    if args.finetune:
        print("Finetuning......")
        # Freeze all layers
        for param in net.parameters():
            param.requires_grad = False
        # Unfreeze the last few blocks.
        # BUG FIX: was `param.requies_grad = True` (typo) — it only set a
        # dead attribute, so these layers silently stayed frozen.
        for layer in [net.s1.b18, net.s1.b19, net.s1.b20]:
            for param in layer.parameters():
                param.requires_grad = True
        # NOTE(review): only the classifier weight is unfrozen, not
        # net.head.fc.bias — confirm whether the bias should train too.
        net.head.fc.weight.requires_grad = True
        # Optimize only the parameters left trainable above.
        optimizer = optim.SGD(
            filter(lambda param: param.requires_grad, net.parameters()),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )
    else:
        optimizer = optim.SGD(
            net.parameters(),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )

    # Halve the LR once eval accuracy stops improving by >= 1e-3 (absolute)
    # for `patience` evaluation rounds.
    scheduler = ReduceLROnPlateau(
        optimizer,
        "max",
        factor=0.5,
        patience=2,
        verbose=True,
        threshold=1e-3,
        threshold_mode="abs",
    )

    if args.fp16:
        net, optimizer = amp.initialize(net, optimizer, opt_level="O2")

    aug = augmentations.Augmentations().cuda()
    batch_iterator = iter(train_loader)
    sum_accuracy = 0
    step = 0
    # Iteration count approximates max_epoch passes over the dataset.
    for iteration in range(
            args.resume + 1,
            args.max_epoch * len(train_loader.dataset) // args.batch_size,
    ):
        t0 = time.time()
        try:
            images, type_ids = next(batch_iterator)
        except StopIteration:
            # Restart the loader for a new epoch.
            batch_iterator = iter(train_loader)
            images, type_ids = next(batch_iterator)
        except Exception as e:
            print("Loading data exception:", e)
            # BUG FIX: skip this iteration instead of falling through with a
            # stale batch (or an undefined one on the very first iteration).
            continue

        # HWC uint8 -> CHW float on GPU.
        images = Variable(images.cuda()).permute(0, 3, 1, 2).float()
        type_ids = Variable(type_ids.cuda())

        # Label smoothing: true class gets 0.5, the rest share the remainder.
        one_hot = torch.cuda.FloatTensor(type_ids.shape[0],
                                         config["num_classes"])
        one_hot.fill_((1 - 0.5) / config["num_classes"])
        one_hot.scatter_(1, type_ids.unsqueeze(1), 0.5)

        # augmentation (training from scratch only)
        if not args.finetune:
            images = aug(images)
        # forward
        out = net(images)

        # Smoothed cross-entropy, scaled for gradient accumulation.
        loss = (torch.sum(-one_hot * F.log_softmax(out, -1), -1).mean() /
                args.iter_size)

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        nn.utils.clip_grad_norm_(net.parameters(), max_norm=20, norm_type=2)

        # Accumulate gradients over iter_size iterations before stepping.
        if iteration != 0 and iteration % args.iter_size == 0:
            # backprop
            optimizer.step()
            optimizer.zero_grad()

        t1 = time.time()

        if iteration % config["verbose_period"] == 0:
            # Running train accuracy on this batch.
            _, predict = torch.max(out, 1)
            correct = predict == type_ids
            accuracy = correct.sum().item() / correct.size()[0]
            print(
                "iter: %d loss: %.4f | acc: %.4f | time: %.4f sec." %
                (iteration, loss.item(), accuracy, (t1 - t0)),
                flush=True,
            )
            sum_accuracy += accuracy
            step += 1

        # Linear LR warmup over the first few verbose periods.
        warmup_steps = config["verbose_period"] * 8 * args.iter_size
        if iteration < warmup_steps:
            warmup_learning_rate(optimizer, iteration, warmup_steps)

        if (iteration % config["eval_period"] == 0 and iteration != 0
                and step != 0):
            loss, accuracy = evaluate(net, eval_loader)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(
                f"[{now}] Eval accuracy: {accuracy:.4f} | Train accuracy: {sum_accuracy/step:.4f}",
                flush=True,
            )
            scheduler.step(accuracy)
            sum_accuracy = 0
            step = 0

        if iteration % config["save_period"] == 0 and iteration != 0:
            # save checkpoint
            print("Saving state, iter:", iteration, flush=True)
            save_ckpt(net, iteration)

    # final checkpoint
    save_ckpt(net, iteration)
Beispiel #21
0
def main(cfg):
    """Run single-model active learning: per episode, train one model on the
    labeled set, test the best checkpoint, then actively sample from the
    unlabeled pool with the configured query method.

    Args:
        cfg: global config node; mutated in place to record derived paths
            (OUT_DIR, EXP_DIR, EPISODE_DIR, LSET/USET/VALSET paths) and an
            auto-assigned RNG_SEED when none is supplied.
    """

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # NOTE(review): `device` and `kwargs` are prepared here but not used
    # anywhere below in this function.
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Auto assign a RNG_SEED when not supplied a value
    if cfg.RNG_SEED is None:
        cfg.RNG_SEED = np.random.randint(100)

    # Using specific GPU
    # os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output");
    # resolved relative to the parent of the current working directory.
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET/MODEL TYPE" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Save the config file in EXP_DIR
    dump_cfg(cfg)

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                               isTrain=False,
                                               isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    # Random initial labeled/unlabeled/validation split, persisted to disk.
    lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(train_split_ratio=cfg.ACTIVE_LEARNING.INIT_L_RATIO, \
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data, seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH, \
            uSetPath=cfg.ACTIVE_LEARNING.USET_PATH, valSetPath = cfg.ACTIVE_LEARNING.VALSET_PATH)

    print(
        "Data Partitioning Complete. \nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n"
        .format(len(lSet), len(uSet), len(valSet)))
    logger.info(
        "Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(
            len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(
        indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data,
                                         test_batch_size=cfg.TRAIN.BATCH_SIZE,
                                         seed_id=cfg.RNG_SEED)

    # Initialize the model.
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer (reused across episodes; the model is trained
    # incrementally rather than reinitialized each episode).
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    print("AL Query Method: {}\nMax AL Episodes: {}\n".format(
        cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("AL Query Method: {}\nMax AL Episodes: {}\n".format(
        cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))

    # MAX_ITER + 1 episodes: the final episode only trains/tests (no sampling).
    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):

        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info(
            "======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        if not os.path.exists(episode_dir):
            os.mkdir(episode_dir)
        cfg.EPISODE_DIR = episode_dir

        # Train model
        print("======== TRAINING ========")
        logger.info("======== TRAINING ========")

        best_val_acc, best_val_epoch, checkpoint_file = train_model(
            lSet_loader, valSet_loader, model, optimizer, cfg)

        print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(
            round(best_val_acc, 4), best_val_epoch))
        logger.info(
            "EPISODE {} Best Validation Accuracy: {}\tBest Epoch: {}\n".format(
                cur_episode, round(best_val_acc, 4), best_val_epoch))

        # Test best model checkpoint
        print("======== TESTING ========\n")
        logger.info("======== TESTING ========\n")
        test_acc = test_model(test_loader, checkpoint_file, cfg, cur_episode)
        print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
        logger.info("EPISODE {} Test Accuracy {}.\n".format(
            cur_episode, test_acc))

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            # Save current lSet, uSet in the final episode directory
            data_obj.saveSet(lSet, 'lSet', cfg.EPISODE_DIR)
            data_obj.saveSet(uSet, 'uSet', cfg.EPISODE_DIR)
            break

        # Active Sample: query the unlabeled pool with the best checkpoint.
        print("======== ACTIVE SAMPLING ========\n")
        logger.info("======== ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_model = model_builder.build_model(cfg)
        clf_model = cu.load_checkpoint(checkpoint_file, clf_model)
        activeSet, new_uSet = al_obj.sample_from_uSet(clf_model, lSet, uSet,
                                                      train_data)

        # Save current lSet, new_uSet and activeSet in the episode directory
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloader for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(
            indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(
            indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        uSet_loader = data_obj.getSequentialDataLoader(
            indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print(
            "Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info(
            "Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")
Beispiel #22
0
def main(cfg):
    """Train a self-supervised rotation-prediction (RotNet-style) model.

    Wraps the dataset so each image yields rotated copies with rotation-id
    labels, splits it into train/validation, trains a model to predict the
    rotation, and evaluates the best checkpoint.

    Args:
        cfg: global config node; mutated in place to record derived paths
            (OUT_DIR, EXP_DIR, EPISODE_DIR, TRAINSET/VALSET paths).
    """
    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # NOTE(review): `device` and `kwargs` are prepared here but not used
    # anywhere below in this function.
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Pin the process to the configured GPU.
    os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output");
    # resolved relative to the parent of the current working directory.
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.makedirs(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps: wrap the train split so each sample carries a
    # rotation label for the pretext task.
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    train_data = RotNetDataset(cfg.DATASET.NAME, train_data)
    train_size = len(train_data)

    print("\n Rotation Dataset {} Loaded Sucessfully.\nTotal Train Size: {}\n".
          format(cfg.DATASET.NAME, train_size))
    logger.info(
        "Rotation Dataset {} Loaded Sucessfully. Total Train Size: {}\n".
        format(cfg.DATASET.NAME, train_size))

    # Train/validation split persisted to disk.
    trainSet_path, valSet_path = data_obj.makeTVSets(val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,\
                                 seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    cfg.INIT_POOL.TRAINSET_PATH = trainSet_path
    cfg.INIT_POOL.VALSET_PATH = valSet_path

    trainSet, valSet = data_obj.loadTVPartitions(
        trainSetPath=cfg.INIT_POOL.TRAINSET_PATH,
        valSetPath=cfg.INIT_POOL.VALSET_PATH)

    print("Data Partitioning Complete. \nTrain Set: {}, Validation Set: {}\n".
          format(len(trainSet), len(valSet)))
    logger.info("Train Set: {}, Validation Set: {}\n".format(
        len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getSequentialDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getSequentialDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

    # Initialize the model.
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train model
    print("======== ROTATION TRAINING ========")
    logger.info("======== ROTATION TRAINING ========")

    best_val_acc, best_val_epoch, checkpoint_file = train_model(
        trainSet_loader, valSet_loader, model, optimizer, cfg)

    print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(
        round(best_val_acc, 4), best_val_epoch))
    logger.info("Best Validation Accuracy: {}\tBest Epoch: {}\n".format(
        round(best_val_acc, 4), best_val_epoch))

    # Test best model checkpoint
    print("======== ROTATION TESTING ========\n")
    logger.info("======== ROTATION TESTING ========\n")

    # NOTE(review): evaluation runs on trainSet_loader (the training split),
    # not a held-out set — confirm this is intentional for the pretext task.
    test_acc = test_model(trainSet_loader, checkpoint_file, cfg, cur_episode=1)
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")