def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment / load data
    logger = initialize_exp(params)

    # Seed
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed_all(params.seed)

    # initialize SLURM signal handler for time limit / pre-emption
    if params.is_slurm_job:
        init_signal_handler()

    # data loaders / samplers
    populate_dataset(params)
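    # when params.debug_train is set, train on the (smaller) validation split
    # so that debugging iterations are fast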
    train_data_loader, train_sampler, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=True,
        batch_size=params.batch_size,
        num_classes=params.num_classes,
        nb_workers=params.nb_workers,
        distributed_sampler=params.multi_gpu,
        dataset=params.dataset,
        data_path=params.train_path,
        transform=params.train_transform,
        split='valid' if params.debug_train else 'train',
        seed=params.seed)

    valid_data_loader, _, _ = get_data_loader(img_size=params.img_size,
                                              crop_size=params.crop_size,
                                              shuffle=False,
                                              batch_size=params.batch_size,
                                              num_classes=params.num_classes,
                                              nb_workers=params.nb_workers,
                                              distributed_sampler=False,
                                              dataset=params.dataset,
                                              transform='center',
                                              split='valid',
                                              seed=params.seed)

    # build model / cuda
    logger.info("Building %s model ..." % params.architecture)
    ftmodel = build_model(params)
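    # replace the classification head with an identity: the frozen backbone
    # then acts as a pure feature extractor, and only the linear probe below
    # is trained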
    ftmodel.fc = nn.Sequential()
    ftmodel.eval().cuda()

    linearmodel = nn.Linear(EMBEDDING_SIZE[params.architecture],
                            params.num_classes).cuda()

    if params.from_ckpt != "":
        ckpt = torch.load(params.from_ckpt)
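        # checkpoints saved from a DistributedDataParallel model prefix every
        # parameter name with "module."; strip it so the keys match the
        # unwrapped backbone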
        state_dict = {
            k.replace("module.", ""): v
            for k, v in ckpt['model'].items()
        }

        # the checkpoint's classification head does not match the new task,
        # so drop its weights and load the rest non-strictly
        del state_dict["fc.weight"]
        if "fc.bias" in state_dict:
            del state_dict["fc.bias"]
        missing_keys, unexpected_keys = ftmodel.load_state_dict(state_dict,
                                                                strict=False)
        print("Missing keys: ", missing_keys)
        print("Unexpected keys: ", unexpected_keys)

    # distributed  # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        linearmodel = nn.parallel.DistributedDataParallel(
            linearmodel,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
            broadcast_buffers=True)

    # build trainer / reload potential checkpoints / build evaluator
    trainer = Trainer(model=linearmodel, params=params, ftmodel=ftmodel)
    trainer.reload_checkpoint()
    evaluator = Evaluator(trainer, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)

        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for epoch in range(trainer.epoch, params.epochs):

        # update epoch / sampler / learning rate
        trainer.epoch = epoch
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        if params.multi_gpu:
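            # re-seed the DistributedSampler so each epoch uses a different
            # shuffling (otherwise every epoch would replay the same order)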
            train_sampler.set_epoch(epoch)

        # update learning rate
        trainer.update_learning_rate()

        # train
        for i, (images, targets) in enumerate(train_data_loader):
            trainer.classif_step(images, targets)
            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)

        for name, val in trainer.get_scores().items():
            scores[name] = val

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Example 2
def main(params):
    init_distributed_mode(params)

    logger = initialize_exp(params)

    # seed the CUDA RNGs for reproducibility (note: the CPU RNG is left
    # unseeded here, unlike in the first example)
    torch.cuda.manual_seed_all(params.seed)

    # getTransform(0) presumably returns the default CIFAR-10 preprocessing
    transform = getTransform(0)

    root_data = '/private/home/asablayrolles/data/cifar-dejalight2'
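    # this CIFAR10 class is presumably a custom wrapper whose `name` argument
    # selects one of several pre-generated splits of the data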
    trainset = CIFAR10(root=root_data, name=params.name, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=params.batch_size,
                                              shuffle=True,
                                              num_workers=2)

    valid_set = CIFAR10(root=root_data, name='public_0', transform=transform)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_set, batch_size=params.batch_size, shuffle=False, num_workers=2)

    model = build_model(params)
    if params.gpu:
        model = model.cuda()

    # no explicit criterion / optimizer here: they are presumably constructed
    # inside Trainer (see trainer.update_learning_rate below)
    # criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=params.lr, momentum=params.momentum)

    trainer = Trainer(model=model, params=params)
    evaluator = Evaluator(trainer, params)

    for epoch in range(params.epochs):
        trainer.update_learning_rate()
        for images, targets in trainloader:
            trainer.classif_step(images, targets)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)

        for name, val in trainer.get_scores().items():
            scores[name] = val

        # mast_topline presumably measures a membership-inference baseline:
        # how well one can guess, from the model, whether a sample belongs to
        # the training set or the held-out set
        accuracy, precision_train, recall_train = mast_topline(
            model, trainloader, valid_data_loader)
        print(f"Guessing accuracy: {accuracy}")

        scores["mast_accuracy"] = accuracy
        scores["mast_precision_train"] = precision_train
        scores["mast_recall_train"] = recall_train

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))

        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    print('Finished Training')
Example 3
def main(params):

    # initialize the multi-GPU / multi-node training
    # initialize experiment / SLURM signal handler for time limit / pre-emption
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # CPU / CUDA
    if params.cpu:
        assert not params.multi_gpu
    else:
        assert torch.cuda.is_available()
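    # src.utils.CUDA is presumably a module-level switch consulted elsewhere
    # in the codebase to decide whether tensors are moved to GPU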
    src.utils.CUDA = not params.cpu

    # build environment / modules / trainer / evaluator
    env = build_env(params)
    modules = build_modules(env, params)
    trainer = Trainer(modules, env, params)
    evaluator = Evaluator(trainer)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals()
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        # an epoch is measured in processed equations rather than in batches
        trainer.n_equations = 0

        while trainer.n_equations < trainer.epoch_size:

            # training steps
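            # visit the tasks in a random order each pass; in export mode the
            # generated samples are written to disk instead of being trained on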
            for task_id in np.random.permutation(len(params.tasks)):
                task = params.tasks[task_id]
                if params.export_data:
                    trainer.export_data(task)
                else:
                    trainer.enc_dec_step(task)
                trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals()

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Example 4
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment / load data
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    if params.is_slurm_job:
        init_signal_handler()

    # per-dataset settings: number of classes and image / crop sizes
    if params.dataset == "imagenet":
        params.num_classes = 1000
        params.img_size = 256
        params.crop_size = 224
    else:
        if params.dataset == "cifar10":
            params.num_classes = 10
        elif params.dataset == "cifar100":
            params.num_classes = 100
        else:
            assert False, "Unknown dataset: %s" % params.dataset

        params.img_size = 40
        params.crop_size = 32

    # data loaders / samplers
    train_data_loader, train_sampler = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=True,
        batch_size=params.batch_size,
        nb_workers=params.nb_workers,
        distributed_sampler=params.multi_gpu,
        dataset=params.dataset,
        transform=params.transform,
        split='valid' if params.debug_train else params.split_train,
    )

    valid_data_loader, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=False,
        batch_size=params.batch_size,
        nb_workers=params.nb_workers,
        distributed_sampler=False,
        dataset=params.dataset,
        transform='center',
        split='valid',
    )

    # build model / cuda
    logger.info("Building %s model ..." % params.architecture)
    model = build_model(params)
    model.cuda()

    # distributed  # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
            broadcast_buffers=True)

    # build trainer / reload potential checkpoints / build evaluator
    trainer = Trainer(model=model, params=params)
    trainer.reload_checkpoint()
    evaluator = Evaluator(trainer, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif', 'recognition'],
                                         data_loader=valid_data_loader)

        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for epoch in range(trainer.epoch, params.epochs):

        # update epoch / sampler / learning rate
        trainer.epoch = epoch
        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)
        if params.multi_gpu:
            train_sampler.set_epoch(epoch)

        # update learning rate
        trainer.update_learning_rate()

        # train
        for i, (images, targets) in enumerate(train_data_loader):
            trainer.classif_step(images, targets)
            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer,
                                         evals=['classif'],
                                         data_loader=valid_data_loader)

        for name, val in trainer.get_scores().items():
            scores[name] = val

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
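
# All four examples assume a fully populated `params` namespace. As a minimal,
# hypothetical sketch (the flags below are assumptions inferred from the
# attributes used above, not the scripts' actual option lists), an entry point
# could look like this:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='cifar10')
    parser.add_argument('--architecture', type=str, default='resnet18')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--epochs', type=int, default=90)
    parser.add_argument('--nb_workers', type=int, default=4)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--eval_only', action='store_true')
    main(parser.parse_args())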