def main():
    torch.autograd.set_detect_anomaly(True)
    logger.info('Start to declare training variable')
    if torch.cuda.is_available():
        cfg.device = torch.device("cuda")
        torch.cuda.set_device(cfg.local_rank)
    else:
        cfg.device = torch.device("cpu")
    logger.info('Session will be run on device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    logger.info('Start to prepare data')
    # get transformers
    # train_transform is for data perturbation
    train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
                                              cfg.tfm_blur, cfg.tfm_means,
                                              cfg.tfm_stds,
                                              cfg.tfm_adaptive_thresholding)

    # get datasets
    # each head should have its own trainset
    train_splits = dict(cifar100=[['train', 'test']],
                        image_folder_wrapper=[['train']],
                        stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'],
                       image_folder_wrapper=['test'],
                       stl10=['train', 'test'])

    # instantiate datasets for each head
    # otrainset: original trainset
    otrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=reduced_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # ptrainset: perturbed trainset
    ptrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=train_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # testset
    testset = ConcatDataset([
        datasets.get(split=split, transform=test_transform)
        for split in test_splits[cfg.dataset]
    ])
    # declare data loader for testset only
    test_loader = DataLoader(testset,
                             batch_size=cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = PUILoss(cfg.pica_lamda, cfg.pica_target, cfg.pica_iic)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        net.load_state_dict(ckpt['net'])
        best_acc = ckpt['acc']
        start_epoch = ckpt['epoch']

    # initialise the process group before moving modules to the target device
    if int(os.environ["WORLD_SIZE"]) > 1:
        dist.init_process_group(backend="nccl", init_method="env://")
        print("world size: {}".format(os.environ["WORLD_SIZE"]))
        print("rank: {}".format(cfg.local_rank))
        synchronize()
    criterion = criterion.to(cfg.device)
    net = net.to(cfg.device)
    if int(os.environ["WORLD_SIZE"]) > 1:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
        net = torch.nn.parallel.DistributedDataParallel(
            net,
            device_ids=[cfg.local_rank],
            find_unused_parameters=True,
            output_device=cfg.local_rank).cuda()

    # only rank 0 needs a SummaryWriter
    if cfg.local_rank == 0:
        # tensorboard writer
        writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)
    else:
        writer = None

    # start training
    lr = cfg.base_lr
    epoch = start_epoch
    logger.info('Start to evaluate after %d epochs of training' % epoch)
    acc = evaluate(net, test_loader, writer, epoch)
    if not cfg.debug and cfg.local_rank == 0:
        # save checkpoint
        is_best = acc > best_acc
        best_acc = max(best_acc, acc)
        save_checkpoint({'net': net.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'acc': acc,
                         'epoch': epoch}, is_best=is_best)

    while lr > 0 and epoch < cfg.max_epochs:
        lr = lr_handler.update(epoch, optimizer)
        logger.info('Start to train at epoch %d with learning rate %.5f' %
                    (epoch, lr))
        train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer)
        epoch += 1

        logger.info('Start to evaluate after %d epochs of training' % epoch)
        acc = evaluate(net, test_loader, writer, epoch)
        if not cfg.debug and cfg.local_rank == 0:
            writer.add_scalar('Train/Learning_Rate', lr, epoch)
            # save checkpoint
            is_best = acc > best_acc
            best_acc = max(best_acc, acc)
            save_checkpoint({'net': net.state_dict(),
                             'optimizer': optimizer.state_dict(),
                             'acc': acc,
                             'epoch': epoch}, is_best=is_best)

    logger.info('Done')
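# The distributed branch above relies on the environment variables set by the
# PyTorch launcher (WORLD_SIZE, RANK, MASTER_ADDR, MASTER_PORT). A minimal
# launch sketch, assuming this script is saved as main.py and that the
# `--local_rank` argument injected by the launcher is parsed into
# cfg.local_rank (the filename and config plumbing are assumptions):
#
#   python -m torch.distributed.launch --nproc_per_node=4 main.py
#
# On recent PyTorch, torchrun can be used instead; it passes LOCAL_RANK via
# the environment rather than as a CLI flag, so cfg.local_rank would need to
# be read from os.environ["LOCAL_RANK"]:
#
#   torchrun --nproc_per_node=4 main.py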
def main():
    logger.info('Start to declare training variable')
    cfg.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info('Session will be run on device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    logger.info('Start to prepare data')
    # get transformers
    # train_transform is for data perturbation
    train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
                                              cfg.tfm_means, cfg.tfm_stds)

    # get datasets
    # each head should have its own trainset
    train_splits = dict(cifar100=[['train', 'test']],
                        stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'], stl10=['train', 'test'])

    # instantiate datasets for each head
    # otrainset: original trainset
    otrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=reduced_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # ptrainset: perturbed trainset
    ptrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=train_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # testset
    testset = ConcatDataset([
        datasets.get(split=split, transform=test_transform)
        for split in test_splits[cfg.dataset]
    ])
    # declare data loader for testset only
    test_loader = DataLoader(testset,
                             batch_size=cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = PUILoss(cfg.pica_lamda)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        net.load_state_dict(ckpt['net'])
        best_acc = ckpt['acc']
        start_epoch = ckpt['epoch']

    # move modules to target device
    net, criterion = net.to(cfg.device), criterion.to(cfg.device)

    # tensorboard writer
    writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)

    # start training
    lr = cfg.base_lr
    epoch = start_epoch
    while lr > 0 and epoch < cfg.max_epochs:
        lr = lr_handler.update(epoch, optimizer)
        writer.add_scalar('Train/Learning_Rate', lr, epoch)

        logger.info('Start to train at epoch %d with learning rate %.5f' %
                    (epoch, lr))
        train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer)

        logger.info('Start to evaluate after %d epochs of training' % epoch)
        acc, nmi, ari = evaluate(net, test_loader)
        logger.info('Evaluation results at epoch %d are: '
                    'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)

        epoch += 1

        if cfg.debug:
            continue

        # save checkpoint
        is_best = acc > best_acc
        best_acc = max(best_acc, acc)
        save_checkpoint({'net': net.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'acc': acc,
                         'epoch': epoch}, is_best=is_best)

    logger.info('Done')
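# `save_checkpoint` is called above but not defined in this file. A minimal
# sketch of the conventional implementation, assuming checkpoints are written
# to a local `checkpoints/` directory (the directory name and file names are
# assumptions, not confirmed by this repo):

import os
import shutil
import torch

def save_checkpoint(state, is_best, ckpt_dir='checkpoints'):
    """Persist the latest session state and keep a copy of the best one."""
    os.makedirs(ckpt_dir, exist_ok=True)
    path = os.path.join(ckpt_dir, 'checkpoint.pth.tar')
    # always overwrite the latest checkpoint
    torch.save(state, path)
    if is_best:
        # keep the best-accuracy snapshot under a separate name
        shutil.copyfile(path, os.path.join(ckpt_dir, 'model_best.pth.tar'))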
def main():
    logger.info('Start to declare training variable')
    cfg.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info('Session will be run on device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    if cfg.pica:
        logger.info('Running in PICA mode')

    logger.info('Start to prepare data')
    # get transformers
    # train_transform is for data perturbation
    # train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    # reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
    #                                           cfg.tfm_means, cfg.tfm_stds)

    # get datasets
    # each head should have its own trainset
    # train_splits = dict(cifar100=[['train', 'test']],
    #                     cifar10=[['train', 'test']],
    #                     stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'],
                       cifar10=['train', 'test'],
                       stl10=['train', 'test'])

    # instantiate datasets for each head
    if cfg.dataset.startswith('stl') or cfg.dataset.startswith('cifar'):
        # otrainset: original trainset
        # otrainset = [ConcatDataset([datasets.get(split=split, transform=reduced_transform)
        #                             for split in train_splits[cfg.dataset][hidx]])
        #              for hidx in range(len(train_splits[cfg.dataset]))]
        # ptrainset: perturbed trainset
        # ptrainset = [ConcatDataset([datasets.get(split=split, transform=train_transform)
        #                             for split in train_splits[cfg.dataset][hidx]])
        #              for hidx in range(len(train_splits[cfg.dataset]))]
        # testset
        testset = ConcatDataset([
            datasets.get(split=split, transform=test_transform)
            for split in test_splits[cfg.dataset]
        ])
    else:
        # otrainset = [ImageFolder(root=cfg.data_root, transform=reduced_transform)
        #              for hidx in range(2)]
        # ptrainset = [ImageFolder(root=cfg.data_root, transform=train_transform)
        #              for hidx in range(2)]
        testset = ImageFolder(root=cfg.data_root, transform=test_transform)
        logger.debug('Dataset [%s] from directory [%s] is declared and %d samples '
                     'are loaded' % (cfg.dataset, cfg.data_root, len(testset)))

    # declare data loader for testset only
    test_loader = DataLoader(testset,
                             batch_size=cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = DCLoss(cfg.dc_lamda)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        if not cfg.pica:
            net.load_state_dict(ckpt['net'])
            best_acc = ckpt['acc']
            start_epoch = ckpt['epoch']
        else:
            # PICA checkpoints store the raw state dict only
            net.load_state_dict(ckpt)
            best_acc = 0
            start_epoch = 0

    # data parallel
    if cfg.device == 'cuda' and len(cfg.gpus.split(',')) > 1:
        logger.info('Data parallel will be used for acceleration purpose')
        device_ids = list(range(len(cfg.gpus.split(','))))
        if not (hasattr(net, 'data_parallel') and net.data_parallel(device_ids)):
            net = nn.DataParallel(net, device_ids=device_ids)
        cudnn.benchmark = True
    else:
        logger.info('Data parallel will not be used for acceleration')

    # move modules to target device
    net, criterion = net.to(cfg.device), criterion.to(cfg.device)

    # tensorboard writer
    writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)

    # evaluation only: no training loop in this variant
    lr = cfg.base_lr
    epoch = start_epoch
    logger.info('Start to evaluate after %d epochs of training' % epoch)
    acc, nmi, ari = evaluate(net, test_loader)
    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
    writer.add_scalar('Evaluate/ACC', acc, epoch)
    writer.add_scalar('Evaluate/NMI', nmi, epoch)
    writer.add_scalar('Evaluate/ARI', ari, epoch)

    logger.info('Done')
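# `evaluate` returns (acc, nmi, ari) in the two variants above. A minimal
# sketch of how those clustering metrics are conventionally computed from
# ground-truth labels and predicted cluster indices; the helper names below
# are illustrative assumptions, and how this repo extracts predictions from
# the network heads is not shown here:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

def clustering_accuracy(labels, preds):
    """Best one-to-one cluster-to-class mapping via the Hungarian algorithm."""
    k = max(labels.max(), preds.max()) + 1
    # contingency matrix: rows are predicted clusters, columns are true classes
    cost = np.zeros((k, k), dtype=np.int64)
    for y, p in zip(labels, preds):
        cost[p, y] += 1
    # linear_sum_assignment minimises, so negate to maximise matched counts
    row, col = linear_sum_assignment(cost.max() - cost)
    return cost[row, col].sum() / labels.size

def clustering_metrics(labels, preds):
    """Return (ACC, NMI, ARI) for a predicted partition."""
    return (clustering_accuracy(labels, preds),
            normalized_mutual_info_score(labels, preds),
            adjusted_rand_score(labels, preds))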