def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment / load data
    logger = initialize_exp(params)

    # seed
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed_all(params.seed)

    # initialize SLURM signal handler for time limit / pre-emption
    if params.is_slurm_job:
        init_signal_handler()

    # data loaders / samplers
    populate_dataset(params)
    train_data_loader, train_sampler, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=True,
        batch_size=params.batch_size,
        num_classes=params.num_classes,
        nb_workers=params.nb_workers,
        distributed_sampler=params.multi_gpu,
        dataset=params.dataset,
        data_path=params.train_path,
        transform=params.train_transform,
        split='valid' if params.debug_train else 'train',
        seed=params.seed,
    )
    valid_data_loader, _, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=False,
        batch_size=params.batch_size,
        num_classes=params.num_classes,
        nb_workers=params.nb_workers,
        distributed_sampler=False,
        dataset=params.dataset,
        transform='center',
        split='valid',
        seed=params.seed,
    )

    # build model / cuda
    logger.info("Building %s model ..." % params.architecture)
    ftmodel = build_model(params)
    ftmodel.fc = nn.Sequential()  # strip the classification head, keep features only
    ftmodel.eval().cuda()
    linearmodel = nn.Linear(EMBEDDING_SIZE[params.architecture], params.num_classes).cuda()

    if params.from_ckpt != "":
        ckpt = torch.load(params.from_ckpt)
        state_dict = {k.replace("module.", ""): v for k, v in ckpt['model'].items()}
        del state_dict["fc.weight"]
        if "fc.bias" in state_dict:
            del state_dict["fc.bias"]
        missing_keys, unexpected_keys = ftmodel.load_state_dict(state_dict, strict=False)
        print("Missing keys: ", missing_keys)
        print("Unexpected keys: ", unexpected_keys)

    # distributed
    # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        linearmodel = nn.parallel.DistributedDataParallel(
            linearmodel,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
            broadcast_buffers=True,
        )

    # build trainer / reload potential checkpoints / build evaluator
    trainer = Trainer(model=linearmodel, params=params, ftmodel=ftmodel)
    trainer.reload_checkpoint()
    evaluator = Evaluator(trainer, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer, evals=['classif'], data_loader=valid_data_loader)
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for epoch in range(trainer.epoch, params.epochs):

        # update epoch / sampler / learning rate
        trainer.epoch = epoch
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        if params.multi_gpu:
            train_sampler.set_epoch(epoch)

        # update learning rate
        trainer.update_learning_rate()

        # train
        for i, (images, targets) in enumerate(train_data_loader):
            trainer.classif_step(images, targets)
            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer, evals=['classif'], data_loader=valid_data_loader)
        for name, val in trainer.get_scores().items():
            scores[name] = val

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
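# For reference, a minimal sketch of what the linear-probe step over the frozen
# backbone can look like. This is an illustration, not the repository's actual
# Trainer.classif_step: the helper name below is hypothetical, and the
# EMBEDDING_SIZE dict used in main() is assumed to map architectures to feature
# dimensions (e.g. 512 for a ResNet-18, 2048 for a ResNet-50).
import torch

def linear_probe_step(ftmodel, linearmodel, optimizer, criterion, images, targets):
    with torch.no_grad():                  # backbone is frozen, no gradients
        features = ftmodel(images.cuda())  # fc was replaced by nn.Sequential(),
                                           # so the forward pass returns embeddings
    logits = linearmodel(features)         # only the linear head is trained
    loss = criterion(logits, targets.cuda())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()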
def main(params):
    init_distributed_mode(params)
    logger = initialize_exp(params)
    torch.cuda.manual_seed_all(params.seed)

    transform = getTransform(0)
    root_data = '/private/home/asablayrolles/data/cifar-dejalight2'
    trainset = CIFAR10(root=root_data, name=params.name, transform=transform)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=params.batch_size, shuffle=True, num_workers=2)

    valid_set = CIFAR10(root=root_data, name='public_0', transform=transform)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_set, batch_size=params.batch_size, shuffle=False, num_workers=2)

    model = build_model(params)
    if params.gpu:
        model = model.cuda()

    # criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=params.lr, momentum=params.momentum)
    trainer = Trainer(model=model, params=params)
    evaluator = Evaluator(trainer, params)

    for epoch in range(params.epochs):
        trainer.update_learning_rate()
        for images, targets in trainloader:
            trainer.classif_step(images, targets)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer, evals=['classif'], data_loader=valid_data_loader)
        for name, val in trainer.get_scores().items():
            scores[name] = val

        accuracy, precision_train, recall_train = mast_topline(model, trainloader, valid_data_loader)
        print(f"Guessing accuracy: {accuracy}")
        scores["mast_accuracy"] = accuracy
        scores["mast_precision_train"] = precision_train
        scores["mast_recall_train"] = recall_train

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    print('Finished Training')
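# mast_topline is defined elsewhere in the repository. Judging by the metrics
# logged in main() above (membership "guessing" accuracy, plus precision and
# recall on the train set), a plausible baseline with this interface is a
# per-sample loss threshold: samples whose loss falls under tau are guessed to
# be training members. The sketch below is an assumption, not the actual code.
import torch
import torch.nn as nn

def loss_threshold_topline(model, trainloader, validloader, tau=1.0):
    criterion = nn.CrossEntropyLoss(reduction='none')
    model.eval()
    guesses, labels = [], []
    with torch.no_grad():
        for is_member, loader in ((1, trainloader), (0, validloader)):
            for images, targets in loader:
                losses = criterion(model(images.cuda()), targets.cuda())
                guesses.append(losses < tau)  # low loss -> guess "member"
                labels.append(torch.full_like(losses, is_member, dtype=torch.bool))
    guesses, labels = torch.cat(guesses), torch.cat(labels)
    accuracy = (guesses == labels).float().mean().item()
    true_pos = (guesses & labels).sum().item()
    precision_train = true_pos / max(guesses.sum().item(), 1)
    recall_train = true_pos / max(labels.sum().item(), 1)
    return accuracy, precision_train, recall_train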
def main(params):

    # initialize the multi-GPU / multi-node training
    # initialize experiment / SLURM signal handler for time limit / pre-emption
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # CPU / CUDA
    if params.cpu:
        assert not params.multi_gpu
    else:
        assert torch.cuda.is_available()
    src.utils.CUDA = not params.cpu

    # build environment / modules / trainer / evaluator
    env = build_env(params)
    modules = build_modules(env, params)
    trainer = Trainer(modules, env, params)
    evaluator = Evaluator(trainer)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals()
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_equations = 0
        while trainer.n_equations < trainer.epoch_size:
            # training steps
            for task_id in np.random.permutation(len(params.tasks)):
                task = params.tasks[task_id]
                if params.export_data:
                    trainer.export_data(task)
                else:
                    trainer.enc_dec_step(task)
                trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals()

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
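# init_signal_handler is imported from the repository's utilities. A common
# pattern for handling SLURM time limits / pre-emption, sketched here as an
# assumption about what that helper does, is to catch SIGUSR1 (which SLURM can
# send shortly before the time limit) and requeue the job from the main process:
import os
import signal
import sys

def example_init_signal_handler():
    def sig_handler(signum, frame):
        # only the first process requeues the job before it gets killed
        if int(os.environ.get('SLURM_PROCID', 0)) == 0:
            os.system('scontrol requeue ' + os.environ['SLURM_JOB_ID'])
        sys.exit(-1)
    signal.signal(signal.SIGUSR1, sig_handler)
    signal.signal(signal.SIGTERM, signal.SIG_IGN)  # the batch step sends SIGTERM first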
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment / load data
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    if params.is_slurm_job:
        init_signal_handler()

    # dataset-dependent number of classes / image size / crop size
    if params.dataset == "imagenet":
        params.num_classes = 1000
        params.img_size = 256
        params.crop_size = 224
    else:
        if params.dataset == "cifar10":
            params.num_classes = 10
        elif params.dataset == "cifar100":
            params.num_classes = 100
        else:
            assert False, "Unknown dataset"
        params.img_size = 40
        params.crop_size = 32

    # data loaders / samplers
    train_data_loader, train_sampler = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=True,
        batch_size=params.batch_size,
        nb_workers=params.nb_workers,
        distributed_sampler=params.multi_gpu,
        dataset=params.dataset,
        transform=params.transform,
        split='valid' if params.debug_train else params.split_train,
    )
    valid_data_loader, _ = get_data_loader(
        img_size=params.img_size,
        crop_size=params.crop_size,
        shuffle=False,
        batch_size=params.batch_size,
        nb_workers=params.nb_workers,
        distributed_sampler=False,
        dataset=params.dataset,
        transform='center',
        split='valid',
    )

    # build model / cuda
    logger.info("Building %s model ..." % params.architecture)
    model = build_model(params)
    model.cuda()

    # distributed
    # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
            broadcast_buffers=True,
        )

    # build trainer / reload potential checkpoints / build evaluator
    trainer = Trainer(model=model, params=params)
    trainer.reload_checkpoint()
    evaluator = Evaluator(trainer, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer, evals=['classif', 'recognition'], data_loader=valid_data_loader)
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # training
    for epoch in range(trainer.epoch, params.epochs):

        # update epoch / sampler / learning rate
        trainer.epoch = epoch
        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)
        if params.multi_gpu:
            train_sampler.set_epoch(epoch)

        # update learning rate
        trainer.update_learning_rate()

        # train
        for i, (images, targets) in enumerate(train_data_loader):
            trainer.classif_step(images, targets)
            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate classification accuracy
        scores = evaluator.run_all_evals(trainer, evals=['classif'], data_loader=valid_data_loader)
        for name, val in trainer.get_scores().items():
            scores[name] = val

        # print / JSON log
        for k, v in scores.items():
            logger.info('%s -> %.6f' % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
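# init_distributed_mode is defined in the repository's utilities. The standard
# torch.distributed setup it presumably wraps (an assumption, not its actual
# code) pins each process to its local GPU and joins an NCCL process group,
# which is what makes the DistributedDataParallel wrapper above work. The
# attribute names global_rank and world_size below are assumed.
import torch

def example_init_distributed_mode(params):
    params.is_master = params.global_rank == 0
    if params.multi_gpu:
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(
            backend='nccl',        # standard backend for multi-GPU training
            init_method='env://',  # reads MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE
            world_size=params.world_size,
            rank=params.global_rank,
        )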