def load_tgt_loaders(exp_dict):
    train_loader = datasets.get_loader(exp_dict["tgt_dataset"], "train",
                                       batch_size=exp_dict["tgt_batch_size"])
    val_loader = datasets.get_loader(exp_dict["tgt_dataset"], "val",
                                     batch_size=exp_dict["tgt_batch_size"])
    name = type(train_loader.dataset).__name__
    n_train = len(train_loader.dataset)
    n_val = len(val_loader.dataset)
    print("Target ({}): train set: {} - val set: {}".format(name, n_train, n_val))
    return train_loader, val_loader
def load_loaders(name, batch):
    train_loader = datasets.get_loader(name, "train", batch_size=batch)
    if train_loader is None:
        print("no such dataset named", name)
        return None
    val_loader = datasets.get_loader(name, "val", batch_size=batch)
    n_train = len(train_loader.dataset)
    n_val = len(val_loader.dataset)
    dataset_name = type(train_loader.dataset).__name__  # avoid shadowing the `name` argument
    print("dataset ({}): train set: {} - val set: {}".format(dataset_name, n_train, n_val))
    return train_loader, val_loader
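# A minimal usage sketch for load_loaders above; the dataset name "mnist" and
# the batch size are assumptions, not taken from the original sources:
loaders = load_loaders("mnist", batch=64)
if loaders is not None:
    train_loader, val_loader = loaders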
def test_model(model_class, run_func, args, split_idx=0, quiet=False):
    output_dir = args.output_dir  # save the output_dir; load_model overwrites args
    if not quiet:
        print('Model loaded from: %s' % args.pretrain_model)
    model, args = load_model(args.pretrain_model,
                             model_class=model_class,
                             device=args.device)
    args.output_dir = output_dir
    test_data_loader = get_loader(args.data_dir,
                                  data_type='test',
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  split=split_idx,
                                  n_labels=args.n_labels)
    test_stats = run_func(model=model,
                          optim=None,
                          data_loader=test_data_loader,
                          data_type='test',
                          args=args,
                          write_path='%s/test_output.jsonl' % args.output_dir,
                          quiet=quiet)
    if not quiet:
        test_stats.print_stats('Test: ')
def load_src_loaders(exp_dict):
    train_loader = datasets.get_loader(exp_dict["src_dataset"], "train",
                                       batch_size=exp_dict["src_batch_size"],
                                       exp_dict=exp_dict)
    val_loader = datasets.get_loader(exp_dict["src_dataset"], "val",
                                     batch_size=exp_dict["src_batch_size"],
                                     exp_dict=exp_dict)
    n_train = len(train_loader.dataset)
    n_val = len(val_loader.dataset)
    name = type(train_loader.dataset).__name__
    print("Source ({}): train set: {} - val set: {}".format(name, n_train, n_val))
    return train_loader, val_loader
def get_loader(config):
    dataset_cfg = config["dataset"]
    trainloader, valloader = datasets.get_loader(
        setname=dataset_cfg["name"],
        batch_size=dataset_cfg["batch_size"],
        shuffle=True,
        num_workers=dataset_cfg["workers"])
    return trainloader, valloader
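# The config schema implied by get_loader above; a sketch that assumes the
# datasets module from the snippet, with placeholder dataset name and values:
example_config = {
    "dataset": {
        "name": "cifar10",
        "batch_size": 128,
        "workers": 4,
    }
}
trainloader, valloader = get_loader(example_config)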
def get_tgt_loader_supervised(exp_dict):
    train_loader = datasets.get_loader(exp_dict["tgt_dataset"], "train_supervised",
                                       batch_size=exp_dict["tgt_batch_size_supervised"],
                                       exp_dict=exp_dict)
    test_loader = datasets.get_loader(exp_dict["tgt_dataset"], "test_supervised",
                                      batch_size=exp_dict["tgt_batch_size_supervised"],
                                      exp_dict=exp_dict)
    name = type(train_loader.dataset).__name__
    n_train = len(train_loader.dataset)
    n_test = len(test_loader.dataset)
    print("Target Supervised ({}): train set: {} ---------- test set: {}".format(
        name, n_train, n_test))
    return train_loader, test_loader
def main(config):
    # For fast training.
    # cudnn.benchmark = True

    # Create directories if they do not exist.
    for d in (config.log_dir, config.model_save_dir,
              config.sample_dir, config.result_dir):
        os.makedirs(d, exist_ok=True)

    # Data loaders.
    celeba_loader = None
    rafd_loader = None
    if config.dataset in ['CelebA', 'Both']:
        celeba_loader = get_loader(config.celeba_image_dir, config.attr_path,
                                   config.selected_attrs, config.celeba_crop_size,
                                   config.image_size, config.batch_size,
                                   'CelebA', config.mode, config.num_workers)
    if config.dataset in ['RaFD', 'Both']:
        rafd_loader = get_loader(config.rafd_image_dir, None, None,
                                 config.rafd_crop_size, config.image_size,
                                 config.batch_size, 'RaFD', config.mode,
                                 config.num_workers)

    # Trainer for training and testing StarGAN.
    trainer = Trainer(celeba_loader, rafd_loader, config)

    if config.mode == 'train':
        if config.dataset in ['CelebA', 'RaFD']:
            trainer.train()
        elif config.dataset == 'Both':
            trainer.train_multi()
    elif config.mode == 'test':
        if config.dataset in ['CelebA', 'RaFD']:
            trainer.test()
        elif config.dataset == 'Both':
            trainer.test_multi()
def get_data_examples(args, data_type, n_examples, shuffle_data=False):
    """Returns a single batch of n_examples examples from the data."""
    data_loader = get_loader(args.data_dir,
                             data_type=data_type,
                             batch_size=n_examples,
                             shuffle=shuffle_data,
                             split=0,
                             n_labels=args.n_labels)
    # `loader.__iter__().next()` is Python 2 style; use the Python 3 idiom.
    examples = next(iter(data_loader))
    return examples
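# Sketch: pulling a fixed batch of validation examples with the helper above;
# `args` is assumed to carry data_dir and n_labels, as in the surrounding code.
examples = get_data_examples(args, data_type='val', n_examples=32,
                             shuffle_data=True)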
def __init__(self, config):
    self.config = config
    self.ckpt_dir = config.ckpt_dir
    if not os.path.exists(self.ckpt_dir):
        os.makedirs(self.ckpt_dir)
    self.save_config(config)
    self.timer = Timer()
    self.writer = SummaryWriter(log_dir=config.ckpt_dir)
    self.lr = config.lr
    self.datasets, self.loaders = get_loader(config)
    self.max_iters = config.max_iters
    if self.max_iters is not None:
        self.epochs = self.max_iters // len(self.loaders['train'])
    else:
        self.epochs = config.epochs
    self.start_epoch = 0
    self.num_classes = self.datasets['train'].n_classes
    self.scores = ScoreMeter(self.num_classes)
    self.model = ModelSelector[config.model](
        in_channels=config.in_channels,
        num_classes=self.num_classes,
        **config.model_params[config.model])
    if config.distributed:
        self.model = nn.DataParallel(self.model)
        patch_replication_callback(self.model)
    self.model = self.model.cuda()
    self.criterion = LossSelector[config.loss](**config.loss_params[config.loss])
    self.optimizer = optim.SGD(self.model.parameters(),
                               lr=self.lr,
                               momentum=0.9,
                               weight_decay=4e-5)
    self.lr_decay = optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                         self.max_iters)
    self.best_miou = float('-inf')
    if config.resume:
        logger.info('***Resume from checkpoint***')
        state = torch.load(os.path.join(self.ckpt_dir, 'ckpt.pt'))
        self.model.load_state_dict(state['model'])
        self.start_epoch = state['epoch']
        self.best_miou = state['best_miou']
        self.optimizer.load_state_dict(state['optim'])
        self.lr_decay.load_state_dict(state['lr_decay'])
        self.lr_decay.last_epoch = self.start_epoch
def main():
    print('*' * 40)
    print(args.checkpoint)
    print(args.load_model)
    print(args.val_set)
    print('-' * 40)

    # dynamically adjust hyper-parameters for ResNets according to base_width
    if args.base_width != 64 and 'sat' in args.loss:
        factor = 64. / args.base_width
        args.sat_alpha = args.sat_alpha**(1. / factor)
        args.sat_es = int(args.sat_es * factor)
        print("Adaptive parameters adjustment: alpha = {:.3f}, Es = {:d}".format(
            args.sat_alpha, args.sat_es))

    global best_prec1
    global best_auc_1, best_auc_2, best_auc_3, best_model_1, best_model_2, \
        best_model_3, best_epoch_1, best_epoch_2, best_epoch_3

    # Check whether the save_dir exists
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
        os.makedirs(os.path.join(args.save_dir, 'train'))
        os.makedirs(os.path.join(args.save_dir, 'val'))
        os.makedirs(os.path.join(args.save_dir, 'test'))

    # prepare dataset
    val_loader, num_classes, val_targets, pass_idx = get_loader(args)

    model = get_model(args, num_classes, base_width=args.base_width)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.cuda()

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint[args.load_model])

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True

    criterion = get_loss(args, labels=val_targets, num_classes=num_classes,
                         train_len=0, val_len=len(val_targets), test_len=0,
                         pass_idx=pass_idx)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    train_timeline = Timeline()
    val_timeline = Timeline()
    test_timeline = Timeline()

    validate(val_loader, model, 0, val_timeline, args.dataset,
             criterion=criterion, crop=args.crop, last=True)
    return
def build_dataloader(config, num_workers, distributed):
    # Build the dataset first, then wrap it in a loader. The local variable is
    # named `dataset` to avoid shadowing the `datasets` module.
    dataset = build_dataset(config)
    loader = get_loader(dataset=dataset,
                        dataset_config=config,
                        num_dataloader_workers=num_workers,
                        pin_memory=False)  # questionable; revisit if host memory allows
    return loader
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-model_path', required=True)
    parser.add_argument('-model_type', required=True, choices=['gcn', 'proto'])
    args = parser.parse_args()

    model_type = args.model_type
    if model_type == 'gcn':
        model_class = GCN
    elif model_type == 'proto':
        model_class = ProtoNet
    else:
        # unreachable given the argparse choices, but fail loudly just in case
        raise ValueError('unknown model type: %s' % model_type)

    print('loading model from: %s' % args.model_path)
    model, args = load_model(args.model_path, model_class, None)
    test_data_loader = get_loader(args.data_dir,
                                  data_type='test',
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  split=0)
    s_rho, p_rho = compute_r(model_type, model, args, test_data_loader)
def main(args=None, quiet=False, splits=None, abs_output_dir=False):
    if args is None:
        args = get_args()

    if args.pretrain_model is not None:
        test_model(model_class=ProtoNet, run_func=run_func, args=args,
                   quiet=quiet)
        exit()

    # If the point clouds should be initialized from data points, get random
    # mols from the validation set
    if args.init_method == 'data':
        train_data_loader = get_loader(args.data_dir,
                                       data_type='val',
                                       batch_size=args.n_pc,
                                       shuffle=True,
                                       split=0,
                                       n_labels=args.n_labels)
        pc_data = next(iter(train_data_loader))  # Python 3 iterator idiom
        args.pc_data = pc_data

    if args.plot_pc:
        init_plot_tracker(args)

    train_results = train_model(model_class=ProtoNet,
                                run_func=run_func,
                                args=args,
                                quiet=quiet,
                                splits=splits,
                                abs_output_dir=abs_output_dir)

    if args.plot_pc:
        args.plot_tracker.plot_and_save(args)

    return train_results
opts, _ = parser.parse_options()
opts_str = parser.make_opts_string(opts, verbose=True)

if opts.no_viz:
    viz = None
else:
    viz = Visualizer(port=opts.vizport, hostname=opts.vizaddr,
                     is_remote=opts.viz_is_remote)

model = create_model(opts, viz)
t_dataset, v_dataset = create_dataset(opts)
t_loader = get_loader(data=t_dataset,
                      batch_size=opts.batch_size,
                      shuffle=not opts.no_shuffle,
                      num_workers=opts.num_workers)
model.init_viz(opts_str)

for n in range(model.epoch + 1, opts.n_epochs + 1):
    print('Epoch {}'.format(n))
    iters_p_epoch = len(t_loader)
    curr_iter = 0
    for example in t_loader:
        model.set_input(example)
        model.optimize_parameters()
        model.iter += 1
        curr_iter += 1
        if curr_iter % opts.print_freq == 0 or curr_iter == iters_p_epoch:
            # body truncated in the original; a progress print is a plausible guess
            print('Iter {}/{}'.format(curr_iter, iters_p_epoch))
def train_gan(cfg, logger, vis):
    # Setup seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup Dataloader
    data_loader = get_loader(cfg["data"]["dataset"])
    data_path = cfg["data"]["path"]
    t_loader = data_loader(data_path,
                           split=cfg["data"]["train_split"],
                           patch_size=cfg['data']['patch_size'],
                           augmentation=cfg['data']['aug_data'])
    train_loader = DataLoader(t_loader,
                              batch_size=cfg["batch_size"],
                              num_workers=cfg["n_workers"],
                              shuffle=True)

    # custom weights initialization called on netG and netD
    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.02)
            m.bias.data.fill_(0)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)

    ndf = cfg['ndf']
    ngf = cfg['ngf']
    nc = 3

    netD_cls = get_model(cfg['netd'])
    netG_cls = get_model(cfg['netg'])
    netD = netD_cls(nc, cfg['output_nc'], ndf).to(device)
    netG = netG_cls(cfg['input_nc'], cfg['output_nc'], ngf).to(device)
    netG.apply(weights_init)
    netD.apply(weights_init)
    logger.info(netD)
    logger.info(netG)

    ########### LOSS & OPTIMIZER ##########
    criterion = torch.nn.BCELoss()
    criterionL1 = torch.nn.L1Loss()
    optimizerD = torch.optim.Adam(netD.parameters(), lr=cfg['optimizer']['lr'],
                                  betas=(cfg['optimizer']['beta1'], 0.999))
    optimizerG = torch.optim.Adam(netG.parameters(), lr=cfg['optimizer']['lr'],
                                  betas=(cfg['optimizer']['beta1'], 0.999))

    ########### GLOBAL VARIABLES ###########
    input_nc = cfg['input_nc']
    output_nc = cfg['output_nc']
    fineSize = cfg['data']['patch_size']

    real_A = Variable(torch.FloatTensor(cfg['batch_size'], input_nc, fineSize, fineSize),
                      requires_grad=False).to(device)
    real_B = Variable(torch.FloatTensor(cfg['batch_size'], output_nc, fineSize, fineSize),
                      requires_grad=False).to(device)
    label = Variable(torch.FloatTensor(cfg['batch_size']), requires_grad=False).to(device)
    real_label = 1
    fake_label = 0

    ########### Training ###########
    netD.train()
    netG.train()
    for epoch in range(1, cfg['max_iters'] + 1):
        for i, image in enumerate(train_loader):
            ########### fDx ###########
            netD.zero_grad()
            if cfg['direction'] == 'OtoB':
                imgA = image[1]
                imgB = image[0]
            else:
                imgA = image[0]
                imgB = image[1]

            # train with real data
            real_A.data.resize_(imgA.size()).copy_(imgA)
            real_B.data.resize_(imgB.size()).copy_(imgB)
            real_AB = torch.cat((real_A, real_B), 1)
            output = netD(real_AB)
            label.data.resize_(output.size())
            label.data.fill_(real_label)
            errD_real = criterion(output, label)
            errD_real.backward()

            # train with fake data
            fake_B = netG(real_A)
            label.data.fill_(fake_label)
            fake_AB = torch.cat((real_A, fake_B), 1)
            output = netD(fake_AB.detach())
            errD_fake = criterion(output, label)
            errD_fake.backward()
            errD = (errD_fake + errD_real) / 2
            optimizerD.step()

            ########### fGx ###########
            netG.zero_grad()
            label.data.fill_(real_label)
            output = netD(fake_AB)
            errGAN = criterion(output, label)
            errL1 = criterionL1(fake_B, real_B)
            errG = errGAN + cfg['lamb'] * errL1
            errG.backward()
            optimizerG.step()

            ########### Logging ##########
            if i % 50 == 0:
                logger.info('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f Loss_L1: %.4f'
                            % (epoch, cfg['max_iters'], i, len(train_loader),
                               errD.item(), errGAN.item(), errL1.item()))
            if cfg['vis']['use'] and (i % 50 == 0):
                fake_B = netG(real_A)
                vis.images(real_A.data.cpu().numpy(), win='real_A')
                vis.images(fake_B.detach().cpu().numpy(), win='fake_B')
                vis.images(real_B.data.cpu().numpy(), win='real_B')
                vis.plot('error_d', errD.item())
                vis.plot('error_g', errGAN.item())
                vis.plot('error_L1', errL1.item())

        if epoch % 20 == 0:
            save_image(name='train',
                       img_lists=[real_A.data.cpu(), fake_B.data.cpu(), real_B.data.cpu()],
                       path='%s/fake_samples_epoch_%03d.png' % (cfg['checkpoint_dir'], epoch),
                       step=epoch,
                       batch_size=cfg['batch_size'])
            save_checkpoints(model=netG, step=epoch, optim=optimizerG,
                             model_dir=cfg['checkpoint_dir'],
                             name='{}_step_{}'.format(cfg['netg'] + cfg['data']['dataset'], epoch))
            save_checkpoints(model=netD, step=epoch, optim=optimizerD,
                             model_dir=cfg['checkpoint_dir'],
                             name='{}_step_{}'.format(cfg['netd'] + cfg['data']['dataset'], epoch))
def train(cfg):
    # Setup seeds
    torch.manual_seed(cfg.get('seed', 1337))
    torch.cuda.manual_seed(cfg.get('seed', 1337))
    np.random.seed(cfg.get('seed', 1337))
    random.seed(cfg.get('seed', 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup Augmentations
    augmentations = cfg['training'].get('augmentations', None)
    data_aug = get_composed_augmentations(augmentations)

    # Setup Dataloader
    data_loader = get_loader(cfg['data']['dataset'])
    data_path = cfg['data']['path']
    t_loader = data_loader(data_path,
                           is_transform=True,
                           split=cfg['data']['train_split'],
                           # img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']),
                           augmentations=data_aug)
    v_loader = data_loader(data_path,
                           is_transform=True,
                           split=cfg['data']['val_split'],
                           # img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']),
                           )
    n_classes = t_loader.n_classes
    trainloader = data.DataLoader(t_loader,
                                  batch_size=cfg['training']['batch_size'],
                                  num_workers=cfg['training']['n_workers'],
                                  shuffle=True)
    valloader = data.DataLoader(v_loader,
                                batch_size=cfg['training']['batch_size'],
                                num_workers=cfg['training']['n_workers'])

    # Setup Metrics
    running_metrics_val = runningScore(n_classes)

    # Setup Model
    model = get_model(cfg['model'], n_classes).to(device)
    model = torch.nn.DataParallel(model,
                                  device_ids=range(torch.cuda.device_count()))

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in cfg['training']['optimizer'].items()
                        if k != 'name'}
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    scheduler = get_scheduler(optimizer, cfg['training']['lr_schedule'])
    loss_fn = get_loss_function(cfg)

    start_iter = 0
    if cfg['training']['resume'] is not None:
        if os.path.isfile(cfg['training']['resume']):
            checkpoint = torch.load(cfg['training']['resume'])
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
            print("=====>", "Loaded checkpoint '{}' (iter {})".format(
                cfg['training']['resume'], checkpoint["epoch"]))
        else:
            print("=====>", "No checkpoint found at '{}'".format(
                cfg['training']['resume']))

    val_loss_meter = averageMeter()
    time_meter = averageMeter()

    best_iou = -100.0
    i = start_iter
    flag = True
    while i <= cfg['training']['train_iters'] and flag:
        for (images, labels) in trainloader:
            i += 1
            start_ts = time.time()
            scheduler.step()
            model.train()
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_fn(input=outputs, target=labels)
            loss.backward()
            optimizer.step()

            time_meter.update(time.time() - start_ts)

            if (i + 1) % cfg['training']['print_interval'] == 0:
                fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f}"
                print_str = fmt_str.format(
                    i + 1,
                    cfg['training']['train_iters'],
                    loss.item(),
                    time_meter.avg / cfg['training']['batch_size'])
                print(print_str)
                time_meter.reset()

            if (i + 1) % cfg['training']['val_interval'] == 0 or \
                    (i + 1) == cfg['training']['train_iters']:
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        labels_val = labels_val.to(device)

                        outputs = model(images_val)
                        val_loss = loss_fn(input=outputs, target=labels_val)

                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()

                        running_metrics_val.update(gt, pred)
                        val_loss_meter.update(val_loss.item())

                print("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg))

                score, class_iou = running_metrics_val.get_scores()
                for k, v in score.items():
                    print(k, ':', v)
                for k, v in class_iou.items():
                    print('{}: {}'.format(k, v))

                val_loss_meter.reset()
                running_metrics_val.reset()

                if score["Mean IoU : \t"] >= best_iou:
                    best_iou = score["Mean IoU : \t"]
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join('./checkpoint',
                                             "{}_{}_best_model.pkl".format(
                                                 cfg['model']['arch'],
                                                 cfg['data']['dataset']))
                    print("saving...")
                    torch.save(state, save_path)

            if (i + 1) == cfg['training']['train_iters']:
                flag = False
                break
def main():
    # dynamically adjust hyper-parameters for ResNets according to base_width
    if args.base_width != 64 and 'sat' in args.loss:
        factor = 64. / args.base_width
        args.sat_alpha = args.sat_alpha**(1. / factor)
        args.sat_es = int(args.sat_es * factor)
        print("Adaptive parameters adjustment: alpha = {:.3f}, Es = {:d}".format(
            args.sat_alpha, args.sat_es))
    print(args)

    global best_prec1, best_auc

    # Check whether the save_dir exists
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
        os.makedirs(os.path.join(args.save_dir, 'train'))
        os.makedirs(os.path.join(args.save_dir, 'val'))
        os.makedirs(os.path.join(args.save_dir, 'test'))

    # prepare dataset
    if args.dataset == 'nexperia':
        train_loader, num_classes, targets = get_loader(args)
    else:
        train_loader, val_loaders, test_loader, num_classes, targets = get_loader(args)

    model = get_model(args, num_classes, base_width=args.base_width)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.dataset == 'nexperia_split':
                best_auc = checkpoint['best_auc']
            else:
                best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True

    criterion = get_loss(args, labels=targets, num_classes=num_classes)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    train_timeline = Timeline()
    val_timeline = Timeline()
    test_timeline = Timeline()

    if args.evaluate:
        validate(test_loader, model, args.crop)
        return

    print("*" * 40)
    start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, train_timeline,
              args.sat_es, args.dataset, args.mod, args.crop)
        print("*" * 40)

        if args.dataset != 'nexperia':
            # evaluate on the validation sets
            prec1 = 0
            if args.dataset == 'nexperia_split':
                print('val:')
                val_auc = validate(val_loaders, model, epoch, val_timeline,
                                   args.dataset, state='val',
                                   criterion=criterion, crop=args.crop)
                print("*" * 40)
                print('test:')
                test_auc = validate(test_loader, model, epoch, test_timeline,
                                    args.dataset, state='test',
                                    criterion=criterion, crop=args.crop)
            else:
                for name, val_loader in zip(args.val_sets, val_loaders):
                    print(name + ":", end="\t")
                    prec1 = validate(val_loader, model, args.crop)
            print("*" * 40)

            if args.dataset == 'nexperia_split':
                # remember best auc and save checkpoint
                is_best = val_auc > best_auc
                best_auc = max(val_auc, best_auc)
                if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
                    filename = 'checkpoint_{}.tar'.format(epoch + 1)
                else:
                    filename = None
                save_checkpoint(args.save_dir, {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_auc': best_auc,
                }, is_best, filename=filename)
            else:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
                    filename = 'checkpoint_{}.tar'.format(epoch + 1)
                else:
                    filename = None
                save_checkpoint(args.save_dir, {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, filename=filename)

        if hasattr(criterion, 'outputs'):
            criterion.weights[epoch] = criterion.outputs[criterion.true_labels.index]
            criterion.clean_weights[epoch] = criterion.outputs[criterion.clean_labels.index]
        else:
            criterion.weights[epoch] = criterion.soft_labels[criterion.true_labels.index]
            criterion.clean_weights[epoch] = criterion.soft_labels[criterion.clean_labels.index]

    if args.dataset != 'nexperia':
        # evaluate the latest checkpoint
        print("Test acc of latest checkpoint:", end='\t')
        validate(test_loader, model, epoch, test_timeline, args.dataset,
                 last=True, crop=args.crop)
        print("*" * 40)

        # evaluate the best checkpoint
        if args.dataset == 'nexperia_split':
            checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
            print("Best validation auc ({}th epoch): {:.2f}%".format(
                checkpoint['epoch'], best_auc * 100.))
            model.load_state_dict(checkpoint['state_dict'])
            print("Test acc of best checkpoint:", end='\t')
            validate(test_loader, model, checkpoint['epoch'], test_timeline,
                     args.dataset, last=True, crop=args.crop)
            print("*" * 40)
        else:
            if len(val_loaders) > 0:
                checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
                print("Best validation acc ({}th epoch): {:.2f}%".format(
                    checkpoint['epoch'], best_prec1))
                model.load_state_dict(checkpoint['state_dict'])
                print("Test acc of best checkpoint:", end='\t')
                validate(test_loader, model, last=True, crop=args.crop)
                print("*" * 40)

    time_elapsed = time.time() - start
    print('It takes {:.0f}m {:.0f}s to train.'.format(time_elapsed // 60,
                                                      time_elapsed % 60))

    # save the best result
    filename = 'train_results.tar'
    save_checkpoint(args.save_dir, {
        'num_epochs': args.epochs,
        'state_dict': model.state_dict(),
    }, is_best=True, filename=filename)

    # save soft labels
    if hasattr(criterion, 'soft_labels'):
        out_fname = os.path.join(args.save_dir, 'updated_soft_labels.npy')
        np.save(out_fname, criterion.soft_labels.cpu().numpy())
        print("Updated soft labels are saved to {}".format(out_fname))

    # save the weight changes of the 106 tracked images
    if hasattr(criterion, 'weights'):
        out_fname = os.path.join(args.save_dir, 'weights_change.npy')
        np.save(out_fname, criterion.weights.cpu().numpy())
        print("Weights change is saved to {}".format(out_fname))
    if hasattr(criterion, 'clean_weights'):
        out_fname = os.path.join(args.save_dir, 'clean_weights_change.npy')
        np.save(out_fname, criterion.clean_weights.cpu().numpy())
        print("Clean weights change is saved to {}".format(out_fname))

    # save the per-split timelines; note: margin_error.npy now stores
    # margin_error for every split (the original wrote margin_error_bi to that
    # file for val and test, most likely a copy-paste slip)
    split_labels = {'train': 'training', 'val': 'validating', 'test': 'testing'}
    for split, timeline in (('train', train_timeline),
                            ('val', val_timeline),
                            ('test', test_timeline)):
        out_dir = os.path.join(args.save_dir, split)
        arrays = {
            'loss.npy': timeline.loss,
            'acc.npy': timeline.acc,
            'loss_bi.npy': timeline.loss_bi,
            'acc_bi.npy': timeline.acc_bi,
            'loss_class.npy': torch.cat(timeline.loss_class, dim=0),
            'acc_class.npy': torch.cat(timeline.acc_class, dim=0),
            'loss_bi_class.npy': torch.cat(timeline.loss_bi_class, dim=0),
            'acc_bi_class.npy': torch.cat(timeline.acc_bi_class, dim=0),
            'margin_error.npy': timeline.margin_error,
            'margin_error_bi.npy': timeline.margin_error_bi,
            'margin_error_class.npy': torch.cat(timeline.me_class, dim=0),
            'margin_error_bi_class.npy': torch.cat(timeline.me_bi_class, dim=0),
            'auc.npy': timeline.auc,
            'fpr_991.npy': timeline.fpr_991,
            'fpr_993.npy': timeline.fpr_993,
            'fpr_995.npy': timeline.fpr_995,
            'fpr_997.npy': timeline.fpr_997,
            'fpr_999.npy': timeline.fpr_999,
            'fpr_1.npy': timeline.fpr_1,
        }
        for fname, arr in arrays.items():
            np.save(os.path.join(out_dir, fname), arr)
        print("other {} details are saved to {}".format(split_labels[split], out_dir))
def main():
    # dynamically adjust hyper-parameters for ResNets according to base_width
    if args.base_width != 64 and 'sat' in args.loss:
        factor = 64. / args.base_width
        args.sat_alpha = args.sat_alpha**(1. / factor)
        args.sat_es = int(args.sat_es * factor)
        print("Adaptive parameters adjustment: alpha = {:.3f}, Es = {:d}".format(
            args.sat_alpha, args.sat_es))
    print(args)

    global best_prec1

    # Check whether the save_dir exists
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # prepare dataset
    train_loader, val_loaders, test_loader, num_classes, targets, clean_targets = \
        get_loader(args)

    if args.is_tpu:
        device = xm.xla_device()
    else:
        device = torch.device('cuda')  # the original left device undefined on GPU runs

    model = get_model(args, num_classes, base_width=args.base_width)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    if args.is_tpu:
        model = model.to(device)
    else:
        model = model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    torch.manual_seed(args.seed)
    cudnn.benchmark = True

    criterion = get_loss(args, device=device, labels=targets,
                         num_classes=num_classes)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    if args.evaluate:
        validate(test_loader, model, device)
        return

    print("*" * 40)
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, device, args)
        print("*" * 40)

        # evaluate on the validation sets
        prec1 = 0
        for name, val_loader in zip(args.val_sets, val_loaders):
            print(name + ":", end="\t")
            prec1 = validate(val_loader, model, device, epoch)
        print("*" * 40)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
            filename = 'checkpoint_{}.tar'.format(epoch + 1)
        else:
            filename = None
        save_checkpoint(args.save_dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, filename=filename)

    # evaluate the latest checkpoint
    print("Test acc of latest checkpoint:", end='\t')
    validate(test_loader, model, device)
    print("*" * 40)

    # evaluate the best checkpoint
    if len(val_loaders) > 0:
        checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
        print("Best validation acc ({}th epoch): {}".format(
            checkpoint['epoch'], best_prec1))
        model.load_state_dict(checkpoint['state_dict'])
        print("Test acc of best checkpoint:", end='\t')
        validate(test_loader, model, device)
        print("*" * 40)

    # save soft labels
    if hasattr(criterion, 'soft_labels'):
        out_fname = os.path.join(args.save_dir, 'updated_soft_labels.npy')
        np.save(out_fname, criterion.soft_labels.cpu().numpy())
        print("Updated soft labels are saved to {}".format(out_fname))

    # save noisy targets
    out_fname = os.path.join(args.save_dir, 'noisy_labels.npy')
    np.save(out_fname, targets)
    print("Noisy labels saved to {}".format(out_fname))

    # save clean targets
    out_fname = os.path.join(args.save_dir, 'clean_labels.npy')
    np.save(out_fname, clean_targets)
    print("Clean labels saved to {}".format(out_fname))
def trainval(exp_dict, savedir_base, reset=False):
    # Bookkeeping
    # -----------
    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # Dataset
    # -------
    # train loader
    train_loader = datasets.get_loader(dataset_name=exp_dict['dataset'],
                                       datadir=savedir_base,
                                       split='train')
    # val loader
    val_loader = datasets.get_loader(dataset_name=exp_dict['dataset'],
                                     datadir=savedir_base,
                                     split='val')

    # Model
    # -----
    model = models.get_model(model_name=exp_dict['model'])

    # Checkpoint
    # ----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # -----------
    print('Starting experiment at epoch %d' % s_epoch)
    for e in range(s_epoch, 10):
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)

        # Get metrics
        score_dict['train_loss'] = train_dict['train_loss']
        score_dict['val_acc'] = val_dict['val_acc']
        score_dict['epoch'] = e

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print(score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print('Checkpoint Saved: %s' % savedir)

    print('experiment completed')
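# A sketch of the exp_dict expected by trainval above (it reads the 'dataset'
# and 'model' keys); the names and savedir are placeholders, not taken from
# the original sources:
exp_dict = {'dataset': 'mnist', 'model': 'mlp'}
trainval(exp_dict, savedir_base='/tmp/experiments', reset=False)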
def train_weighted_descent(D, dataQ0, dataP, wP, opt):
    n_samples, n_features = dataQ0.shape
    device = dataQ0.device

    # Lagrange multiplier for the Augmented Lagrangian
    lambda_aug = torch.tensor([opt.lambda_aug_init], requires_grad=True,
                              device=device)

    # MMD distance
    mmd = MMD_RFF(num_features=n_features, num_outputs=300).to(device)

    # Train
    print('Start training')
    if opt.plot_online:
        fig, ax = plt.subplots()
        ax.set_xlim((-1.1, 1.1))
        ax.set_ylim((-1.1, 1.1))
        scat = ax.scatter([], [], facecolor='r')

    # Save stuff
    wQ = torch.ones((len(dataQ0), 1), device=device)
    collQ, collW, coll_mmd = [], [], []
    dataQ = dataQ0.clone()

    for t in range(opt.T + 1):
        tic = time.time()

        # Snapshot of the current state and weights
        with torch.no_grad():
            mmd_PQ = mmd(dataP, dataQ,
                         weights_X=wP if wP is not None else None,
                         weights_Y=wQ)
        coll_mmd.append(mmd_PQ)
        collQ.append(dataQ.detach().cpu().numpy())
        collW.append(wQ.view(-1).detach().cpu().numpy())

        # (1) Update D network
        optimizerD = torch.optim.Adam(D.parameters(), lr=opt.lrD,
                                      weight_decay=opt.wdecay, amsgrad=True)
        D.train()
        for i in range(opt.n_c_startup if t == 0 else opt.n_c):
            optimizerD.zero_grad()
            x_p, w_p = minibatch((dataP, wP), opt.batchSizeD)
            x_q, w_q = minibatch((dataQ, wQ), opt.batchSizeD)
            loss, Ep_f, Eq_f, normgrad_f2_q = D_forward_weights(
                D, x_p, w_p, x_q, w_q, lambda_aug, opt.alpha, opt.rho)
            loss.backward()
            optimizerD.step()
            manual_sgd_(lambda_aug, opt.rho)
        tocD = time.time() - tic

        # (2) Update the Q distribution (with birth/death)
        D.eval()
        with torch.no_grad():
            x_q, w_q = minibatch((dataQ, wQ))
            f_q = D(x_q)
            m_f = (w_q * f_q).mean()

        new_x_q, log_wQ = [], []
        for x_q, w_q in get_loader((dataQ, wQ), batch_size=opt.batchSizeQ):
            x_q = x_q.detach().requires_grad_(True)
            sum_f_q = D(x_q).sum()
            grad_x_q = grad(outputs=sum_f_q, inputs=x_q, create_graph=True)[0]

            # Update particles
            with torch.no_grad():
                # Move particles
                x_q.data += opt.lrQ * grad_x_q
                f_q = D(x_q)
                dw_q = f_q - m_f
                log_wQ.append((w_q / n_samples).log() + opt.tau * dw_q)
                new_x_q.append(x_q)

        # Update weights and dataQ
        wQ = F.softmax(torch.cat(log_wQ), dim=0) * n_samples
        dataQ = torch.cat(new_x_q)

        # (3) Logging
        if t % opt.log_every == 0:
            x_p, w_p = minibatch((dataP, wP))
            x_q, w_q = minibatch((dataQ, wQ))
            loss, Ep_f, Eq_f, normgrad_f2_q = D_forward_weights(
                D, x_p, w_p, x_q, w_q, lambda_aug, opt.alpha, opt.rho)
            with torch.no_grad():
                SobDist_lasti = Ep_f.item() - Eq_f.item()
                mmd_dist = mmd(dataP, dataQ,
                               weights_X=wP if wP is not None else None,
                               weights_Y=wQ)
            print('[{:5d}/{}] SobolevDist={:.4f}\t mmd={:.5f} '
                  'Eq_normgrad_f2[stepQ]={:.3f} Ep_f={:.2f} Eq_f={:.2f} '
                  'lambda_aug={:.4f}'.format(
                      t, opt.T, SobDist_lasti, mmd_dist,
                      normgrad_f2_q.mean().item(), Ep_f.item(), Eq_f.item(),
                      lambda_aug.item()))
            if opt.plot_online:
                scat.set_offsets(dataQ.detach().cpu().numpy())
                rgba_colors = np.zeros((wQ.shape[0], 4))
                rgba_colors[:, 0] = 1.0
                rgba_colors[:, 3] = wQ.view(-1).detach().cpu().numpy() / wQ.max().item()
                scat.set_color(rgba_colors)
                plt.pause(0.01)

    return dataQ, wQ, collQ, collW, coll_mmd
def train_model(model_class, run_func, args, quiet=False, splits=None,
                abs_output_dir=False):
    output_dir = args.output_dir
    val_stat = args.val_stat

    # Keeps track of certain stats for all the data splits
    all_stats = {
        'val_%s' % val_stat: [],
        'test_%s' % val_stat: [],
        'best_epoch': [],
        'train_last': [],
        'train_best': [],
        'nce': [],
    }

    # Iterate over splits
    splits_iter = splits if splits is not None else range(args.n_splits)

    # Iterates through each split of the data
    for split_idx in splits_iter:
        # print('Training split idx: %d' % split_idx)

        # Creates the output directory for the run of the current split
        if not abs_output_dir:
            args.output_dir = output_dir + '/run_%d' % split_idx
        args.model_dir = args.output_dir + '/models'
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)
        write_args(args)

        # Create model and optimizer
        model = model_class(args)
        model.to(args.device)
        if args.separate_lr:
            optim = model.get_model_optim()
        else:
            optim = torch.optim.Adam(model.parameters(), lr=args.lr)

        if split_idx == 0:
            # Print the number of parameters
            num_params = get_num_params(model)
            if not quiet:
                print('Initialized model with %d params' % num_params)

        # Load the train, val, test data
        dataset_loaders = {}
        for data_type in ['train', 'val', 'test']:
            dataset_loaders[data_type] = get_loader(
                args.data_dir,
                data_type=data_type,
                batch_size=args.batch_size,
                shuffle=data_type == 'train',
                split=split_idx,
                n_labels=args.n_labels)

        # Keeps track of stats across all the epochs
        train_m, val_m = StatsManager(), StatsManager()

        # Tensorboard logging, only for the first run split
        if args.log_tb and split_idx == 0:
            log_dir = output_dir + '/logs'
            tb_writer = SummaryWriter(log_dir, max_queue=1, flush_secs=60)
            log_tensorboard(tb_writer, {'params': num_params}, '', 0)
        else:
            args.log_tb = False

        # Training loop
        args.latest_train_stat = 0
        args.latest_val_stat = 0  # Keeps track of the latest relevant stat
        patience_idx = 0
        for epoch_idx in range(args.n_epochs):
            args.epoch = epoch_idx
            train_stats = run_func(model=model,
                                   optim=optim,
                                   data_loader=dataset_loaders['train'],
                                   data_type='train',
                                   args=args,
                                   write_path=None,
                                   quiet=quiet)
            should_write = epoch_idx % args.write_every == 0
            val_stats = run_func(
                model=model,
                optim=None,
                data_loader=dataset_loaders['val'],
                data_type='val',
                args=args,
                write_path='%s/val_output_%d.jsonl' % (args.output_dir, epoch_idx)
                if should_write else None,
                quiet=quiet)
            if not quiet:
                train_stats.print_stats('Train %d: ' % epoch_idx)
                val_stats.print_stats('Val %d: ' % epoch_idx)

            if args.log_tb:
                log_tensorboard(tb_writer, train_stats.get_stats(), 'train',
                                epoch_idx)
                log_tensorboard(tb_writer, val_stats.get_stats(), 'val',
                                epoch_idx)

            train_stats.add_stat('epoch', epoch_idx)
            val_stats.add_stat('epoch', epoch_idx)
            train_m.add_stats(train_stats.get_stats())
            val_m.add_stats(val_stats.get_stats())

            if val_stats.get_stats()[val_stat] == min(val_m.stats[val_stat]):
                save_model(model, args, args.model_dir, epoch_idx,
                           should_print=not quiet)
                patience_idx = 0
            else:
                patience_idx += 1
                if args.patience != -1 and patience_idx >= args.patience:
                    print('Validation error has not improved in %d epochs, '
                          'stopping at epoch: %d' % (args.patience, args.epoch))
                    break

            # Keep track of the latest epoch stats
            args.latest_train_stat = train_stats.get_stats()[val_stat]
            args.latest_val_stat = val_stats.get_stats()[val_stat]

        # Load and save the best model
        best_epoch = val_m.get_best_epoch_for_stat(args.val_stat)
        best_model_path = '%s/model_%d' % (args.model_dir, best_epoch)
        model, _ = load_model(best_model_path, model_class=model_class,
                              device=args.device)
        if not quiet:
            print('Loading model from %s' % best_model_path)
        save_model(model, args, args.model_dir, 'best', should_print=not quiet)

        # Test model
        test_stats = run_func(model=model,
                              optim=None,
                              data_loader=dataset_loaders['test'],
                              data_type='test',
                              args=args,
                              write_path='%s/test_output.jsonl' % args.output_dir,
                              quiet=quiet)
        if not quiet:
            test_stats.print_stats('Test: ')

        if args.log_tb:
            log_tensorboard(tb_writer, test_stats.get_stats(), 'test', 0)
            tb_writer.close()

        # Write test output to a summary file
        with open('%s/summary.txt' % args.output_dir, 'w+') as summary_file:
            for k, v in test_stats.get_stats().items():
                summary_file.write('%s: %.3f\n' % (k, v))

        # Aggregate relevant stats
        all_stats['val_%s' % val_stat].append(min(val_m.stats[val_stat]))
        all_stats['test_%s' % val_stat].append(test_stats.get_stats()[val_stat])
        all_stats['best_epoch'].append(best_epoch)
        all_stats['train_last'].append(train_m.stats[val_stat][-1])
        all_stats['train_best'].append(train_m.stats[val_stat][best_epoch])
        if args.nce_coef > 0:
            all_stats['nce'].append(train_m.stats['nce_reg'][best_epoch])

    # Write the stats aggregated across all splits
    with open('%s/summary.txt' % output_dir, 'w+') as summary_file:
        summary_file.write('Num epochs trained: %d\n' % args.epoch)
        for name, stats_arr in all_stats.items():
            if stats_arr == []:
                continue
            stats_arr = np.array(stats_arr)
            stats_mean = np.mean(stats_arr)
            stats_std = np.std(stats_arr)
            summary_file.write('%s: %s, mean: %.3f, std: %.3f\n' %
                               (name, str(stats_arr), stats_mean, stats_std))

    all_val_stats = np.array(all_stats['val_%s' % val_stat])
    all_test_stats = np.array(all_stats['test_%s' % val_stat])
    val_mean, val_std = np.mean(all_val_stats), np.std(all_val_stats)
    test_mean, test_std = np.mean(all_test_stats), np.std(all_test_stats)
    train_last = np.mean(np.array(all_stats['train_last']))
    train_best = np.mean(np.array(all_stats['train_best']))
    if args.nce_coef > 0:
        nce_loss = np.mean(np.array(all_stats['nce']))
    else:
        nce_loss = 0

    # Return stats
    return (val_mean, val_std), (test_mean, test_std), \
        (train_last, train_best), nce_loss
def train(cfg, logger, vis):
    # Setup seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup Dataloader
    data_loader = get_loader(cfg["data"]["dataset"])
    data_path = cfg["data"]["path"]
    t_loader = data_loader(data_path,
                           split=cfg["data"]["train_split"],
                           patch_size=cfg['data']['patch_size'],
                           augmentation=cfg['data']['aug_data'])
    v_loader = data_loader(data_path, split=cfg["data"]["val_split"])
    trainloader = DataLoader(t_loader,
                             batch_size=cfg["batch_size"],
                             num_workers=cfg["n_workers"],
                             shuffle=True)
    valloader = DataLoader(v_loader,
                           batch_size=cfg["batch_size"],
                           num_workers=cfg["n_workers"])

    # Setup model, optimizer and loss function
    model_cls = get_model(cfg['model'])
    model = model_cls(cfg).to(device)

    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in cfg["optimizer"].items()
                        if k != "name"}
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    scheduler = MultiStepLR(optimizer, milestones=[15000, 17500], gamma=0.1)

    crit = get_critical(cfg['critical'])().to(device)
    ssim = SSIM().to(device)

    step = 0
    if cfg['resume'] is not None:
        pass  # resuming is not implemented in the original

    while step < cfg['max_iters']:
        scheduler.step()
        model.train()

        if cfg['model'] == 'rescan':
            O, B, prediction = inference_rescan(model=model,
                                                optimizer=optimizer,
                                                dataloader=trainloader,
                                                critical=crit,
                                                ssim=ssim,
                                                step=step,
                                                vis=vis)
        if cfg['model'] == 'did_mdn':
            O, B, prediction, label = inference_didmdn(model=model,
                                                       optimizer=optimizer,
                                                       dataloader=trainloader,
                                                       critical=crit,
                                                       ssim=ssim,
                                                       step=step,
                                                       vis=vis)

        if step % 10 == 0:
            model.eval()
            if cfg['model'] == 'rescan':
                O, B, prediction_v = inference_rescan(model=model,
                                                      optimizer=optimizer,
                                                      dataloader=valloader,
                                                      critical=crit,
                                                      ssim=ssim,
                                                      step=step,
                                                      vis=vis)
            if cfg['model'] == 'did_mdn':
                O, B, prediction, label = inference_didmdn(model=model,
                                                           optimizer=optimizer,
                                                           dataloader=valloader,
                                                           critical=crit,
                                                           ssim=ssim,
                                                           step=step,
                                                           vis=vis)

        if step % int(cfg['save_steps'] / 16) == 0:
            save_checkpoints(model, step, optimizer, cfg['checkpoint_dir'],
                             'latest')
        if step % int(cfg['save_steps'] / 2) == 0:
            save_image('train', [O.cpu(), prediction.cpu(), B.cpu()],
                       cfg['checkpoint_dir'], step, cfg['batch_size'])
        if step % 10 == 0:
            save_image('val', [O.cpu(), prediction.cpu(), B.cpu()],
                       cfg['checkpoint_dir'], step, cfg['batch_size'])
            logger.info('save image as step_%d' % step)
        if step % cfg['save_steps'] == 0:
            save_checkpoints(model=model,
                             step=step,
                             optim=optimizer,
                             model_dir=cfg['checkpoint_dir'],
                             name='{}_step_{}'.format(
                                 cfg['model'] + cfg['data']['dataset'], step))
            logger.info('save model as step_%d' % step)

        step += 1
def train(args):
    # Setup Dataloader
    data_loader = get_loader('doc3dwc')
    data_path = args.data_path
    t_loader = data_loader(data_path,
                           is_transform=True,
                           img_size=(args.img_rows, args.img_cols),
                           augmentations=True)
    v_loader = data_loader(data_path,
                           is_transform=True,
                           split='val',
                           img_size=(args.img_rows, args.img_cols))

    n_classes = t_loader.n_classes
    trainloader = data.DataLoader(t_loader,
                                  batch_size=args.batch_size,
                                  num_workers=8,
                                  shuffle=True)
    valloader = data.DataLoader(v_loader,
                                batch_size=args.batch_size,
                                num_workers=8)

    # Setup Model
    model = get_model(args.arch, n_classes, in_channels=3)
    model = torch.nn.DataParallel(model,
                                  device_ids=range(torch.cuda.device_count()))
    model.cuda()

    # Activation
    htan = nn.Hardtanh(0, 1.0)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.l_rate,
                                 weight_decay=5e-4,
                                 amsgrad=True)

    # LR Scheduler
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
                                                       factor=0.5,
                                                       patience=5,
                                                       verbose=True)

    # Losses
    MSE = nn.MSELoss()
    loss_fn = nn.L1Loss()
    gloss = grad_loss.Gradloss(window_size=5, padding=2)

    epoch_start = 0
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("Loading model and optimizer from checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['model_state'])
            optimizer.load_state_dict(checkpoint['optimizer_state'])
            print("Loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            epoch_start = checkpoint['epoch']
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    # Log file:
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)
    # activation_dataset_lossparams_augmentations_trainstart
    experiment_name = 'htan_doc3d_l1grad_bghsaugk_scratch'
    log_file_name = os.path.join(args.logdir, experiment_name + '.txt')
    if os.path.isfile(log_file_name):
        log_file = open(log_file_name, 'a')
    else:
        log_file = open(log_file_name, 'w+')
    log_file.write('\n--------------- ' + experiment_name + ' ---------------\n')
    log_file.close()

    # Setup tensorboard for visualization
    if args.tboard:
        # save logs in runs/<experiment_name>
        writer = SummaryWriter(comment=experiment_name)

    best_val_mse = 99999.0
    global_step = 0

    for epoch in range(epoch_start, args.n_epoch):
        avg_loss = 0.0
        avg_l1loss = 0.0
        avg_gloss = 0.0
        train_mse = 0.0
        model.train()

        for i, (images, labels) in enumerate(trainloader):
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            pred = htan(outputs)
            g_loss = gloss(pred, labels)
            l1loss = loss_fn(pred, labels)
            loss = l1loss  # + (0.2 * g_loss)
            avg_l1loss += float(l1loss)
            avg_gloss += float(g_loss)
            avg_loss += float(loss)
            train_mse += float(MSE(pred, labels).item())

            loss.backward()
            optimizer.step()
            global_step += 1

            if (i + 1) % 50 == 0:
                print("Epoch[%d/%d] Batch [%d/%d] Loss: %.4f" %
                      (epoch + 1, args.n_epoch, i + 1, len(trainloader),
                       avg_loss / 50.0))
                avg_loss = 0.0

            if args.tboard and (i + 1) % 20 == 0:
                show_wc_tnsboard(global_step, writer, images, labels, pred, 8,
                                 'Train Inputs', 'Train WCs', 'Train Pred. WCs')
                writer.add_scalar('WC: L1 Loss/train', avg_l1loss / (i + 1),
                                  global_step)
                writer.add_scalar('WC: Grad Loss/train', avg_gloss / (i + 1),
                                  global_step)

        train_mse = train_mse / len(trainloader)
        avg_l1loss = avg_l1loss / len(trainloader)
        avg_gloss = avg_gloss / len(trainloader)
        print("Training L1:%4f" % avg_l1loss)
        print("Training MSE:'{}'".format(train_mse))
        train_losses = [avg_l1loss, train_mse, avg_gloss]

        lrate = get_lr(optimizer)
        write_log_file(experiment_name, train_losses, epoch + 1, lrate, 'Train')

        model.eval()
        val_loss = 0.0
        val_mse = 0.0
        # val_bg = 0.0
        # val_fg = 0.0
        val_gloss = 0.0
        # val_dloss = 0.0
        for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
            with torch.no_grad():
                images_val = images_val.to(device)
                labels_val = labels_val.to(device)
                outputs = model(images_val)
                pred_val = htan(outputs)
                g_loss = gloss(pred_val, labels_val).cpu()
                pred_val = pred_val.cpu()
                labels_val = labels_val.cpu()
                loss = loss_fn(pred_val, labels_val)
                val_loss += float(loss)
                val_mse += float(MSE(pred_val, labels_val))
                val_gloss += float(g_loss)

        if args.tboard:
            show_wc_tnsboard(epoch + 1, writer, images_val, labels_val,
                             pred_val, 8, 'Val Inputs', 'Val WCs',
                             'Val Pred. WCs')
            writer.add_scalar('WC: L1 Loss/val', val_loss, epoch + 1)
            writer.add_scalar('WC: Grad Loss/val', val_gloss, epoch + 1)

        val_loss = val_loss / len(valloader)
        val_mse = val_mse / len(valloader)
        val_gloss = val_gloss / len(valloader)
        print("val loss at epoch {}:: {}".format(epoch + 1, val_loss))
        print("val MSE: {}".format(val_mse))

        val_losses = [val_loss, val_mse, val_gloss]
        write_log_file(experiment_name, val_losses, epoch + 1, lrate, 'Val')

        # reduce learning rate
        sched.step(val_mse)

        if val_mse < best_val_mse:
            best_val_mse = val_mse
            state = {
                'epoch': epoch + 1,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
            }
            torch.save(state,
                       args.logdir + "{}_{}_{}_{}_{}_best_model.pkl".format(
                           args.arch, epoch + 1, val_mse, train_mse,
                           experiment_name))

        if (epoch + 1) % 10 == 0:
            state = {
                'epoch': epoch + 1,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
            }
            torch.save(state,
                       args.logdir + "{}_{}_{}_{}_{}_model.pkl".format(
                           args.arch, epoch + 1, val_mse, train_mse,
                           experiment_name))
def train(args):
    # Setup Dataloader
    data_loader = get_loader('doc3dbmnic')
    data_path = args.data_path
    t_loader = data_loader(data_path,
                           is_transform=True,
                           img_size=(args.img_rows, args.img_cols))
    v_loader = data_loader(data_path,
                           is_transform=True,
                           split='val',
                           img_size=(args.img_rows, args.img_cols))

    n_classes = t_loader.n_classes
    trainloader = data.DataLoader(t_loader,
                                  batch_size=args.batch_size,
                                  num_workers=8,
                                  shuffle=True)
    valloader = data.DataLoader(v_loader,
                                batch_size=args.batch_size,
                                num_workers=8)

    # Setup Model
    model = get_model(args.arch, n_classes, in_channels=3)
    model = torch.nn.DataParallel(model,
                                  device_ids=range(torch.cuda.device_count()))
    model.cuda()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.l_rate,
                                 weight_decay=5e-4,
                                 amsgrad=True)

    # LR Scheduler
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
                                                       factor=0.5,
                                                       patience=3,
                                                       verbose=True)

    # Losses
    MSE = nn.MSELoss()
    loss_fn = nn.L1Loss()
    reconst_loss = recon_lossc.Unwarploss()

    epoch_start = 0
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("Loading model and optimizer from checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['model_state'])
            # optimizer.load_state_dict(checkpoint['optimizer_state'])
            print("Loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            epoch_start = checkpoint['epoch']
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    # Log file:
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)
    # network_activation(t=[-1,1])_dataset_lossparams_augmentations_trainstart
    experiment_name = 'dnetccnl_htan_swat3dmini1kbm_l1_noaug_scratch'
    log_file_name = os.path.join(args.logdir, experiment_name + '.txt')
    if os.path.isfile(log_file_name):
        log_file = open(log_file_name, 'a')
    else:
        log_file = open(log_file_name, 'w+')
    log_file.write('\n--------------- ' + experiment_name + ' ---------------\n')
    log_file.close()

    # Setup tensorboard for visualization
    if args.tboard:
        # save logs in runs/<experiment_name>
        writer = SummaryWriter(comment=experiment_name)

    # best_val_uwarpssim = 99999.0
    best_val_mse = 99999.0
    global_step = 0

    for epoch in range(epoch_start, args.n_epoch):
        avg_loss = 0.0
        avgl1loss = 0.0
        avgrloss = 0.0
        avgssimloss = 0.0
        train_mse = 0.0
        model.train()

        for i, (images, labels) in enumerate(trainloader):
            images = Variable(images.cuda())
            labels = Variable(labels.cuda())
            optimizer.zero_grad()
            target = model(images[:, 3:, :, :])
            target_nhwc = target.transpose(1, 2).transpose(2, 3)
            l1loss = loss_fn(target_nhwc, labels)
            rloss, ssim, uworg, uwpred = reconst_loss(images[:, :-1, :, :],
                                                      target_nhwc, labels)
            loss = (10.0 * l1loss) + (0.5 * rloss)  # + (0.3 * ssim)
            # loss = l1loss
            avgl1loss += float(l1loss)
            avg_loss += float(loss)
            avgrloss += float(rloss)
            avgssimloss += float(ssim)
            train_mse += MSE(target_nhwc, labels).item()

            loss.backward()
            optimizer.step()
            global_step += 1

            if (i + 1) % 50 == 0:
                avg_loss = avg_loss / 50
                print("Epoch[%d/%d] Batch [%d/%d] Loss: %.4f" %
                      (epoch + 1, args.n_epoch, i + 1, len(trainloader),
                       avg_loss))
                avg_loss = 0.0

            if args.tboard and (i + 1) % 20 == 0:
                show_unwarp_tnsboard(global_step, writer, uwpred, uworg, 8,
                                     'Train GT unwarp', 'Train Pred Unwarp')
                writer.add_scalar('BM: L1 Loss/train', avgl1loss / (i + 1),
                                  global_step)
                writer.add_scalar('CB: Recon Loss/train', avgrloss / (i + 1),
                                  global_step)
                writer.add_scalar('CB: SSIM Loss/train', avgssimloss / (i + 1),
                                  global_step)

        avgssimloss = avgssimloss / len(trainloader)
        avgrloss = avgrloss / len(trainloader)
        avgl1loss = avgl1loss / len(trainloader)
        train_mse = train_mse / len(trainloader)
        print("Training L1:%4f" % avgl1loss)
        print("Training MSE:'{}'".format(train_mse))
        train_losses = [avgl1loss, train_mse, avgrloss, avgssimloss]
        lrate = get_lr(optimizer)
        write_log_file(log_file_name, train_losses, epoch + 1, lrate, 'Train')

        model.eval()
        # val_loss = 0.0
        val_l1loss = 0.0
        val_mse = 0.0
        val_rloss = 0.0
        val_ssimloss = 0.0

        for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
            with torch.no_grad():
                images_val = Variable(images_val.cuda())
                labels_val = Variable(labels_val.cuda())
                target = model(images_val[:, 3:, :, :])
                target_nhwc = target.transpose(1, 2).transpose(2, 3)
                pred = target_nhwc.data.cpu()
                gt = labels_val.cpu()
                l1loss = loss_fn(target_nhwc, labels_val)
                rloss, ssim, uworg, uwpred = reconst_loss(
                    images_val[:, :-1, :, :], target_nhwc, labels_val)
                val_l1loss += float(l1loss.cpu())
                val_rloss += float(rloss.cpu())
                val_ssimloss += float(ssim.cpu())
                val_mse += float(MSE(pred, gt))

        if args.tboard:
            show_unwarp_tnsboard(epoch + 1, writer, uwpred, uworg, 8,
                                 'Val GT unwarp', 'Val Pred Unwarp')

        val_l1loss = val_l1loss / len(valloader)
        val_mse = val_mse / len(valloader)
        val_ssimloss = val_ssimloss / len(valloader)
        val_rloss = val_rloss / len(valloader)
        print("val loss at epoch {}:: {}".format(epoch + 1, val_l1loss))
        print("val mse: {}".format(val_mse))

        val_losses = [val_l1loss, val_mse, val_rloss, val_ssimloss]
        write_log_file(log_file_name, val_losses, epoch + 1, lrate, 'Val')

        if args.tboard:
            # log the val losses
            writer.add_scalar('BM: L1 Loss/val', val_l1loss, epoch + 1)
            writer.add_scalar('CB: Recon Loss/val', val_rloss, epoch + 1)
            writer.add_scalar('CB: SSIM Loss/val', val_ssimloss, epoch + 1)

        # reduce learning rate
        sched.step(val_mse)

        if val_mse < best_val_mse:
            best_val_mse = val_mse
            state = {
                'epoch': epoch + 1,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
            }
            torch.save(state,
                       args.logdir + "{}_{}_{}_{}_{}_best_model.pkl".format(
                           args.arch, epoch + 1, val_mse, train_mse,
                           experiment_name))

        if (epoch + 1) % 10 == 0:
            state = {
                'epoch': epoch + 1,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
            }
            torch.save(state,
                       args.logdir + "{}_{}_{}_{}_{}_model.pkl".format(
                           args.arch, epoch + 1, val_mse, train_mse,
                           experiment_name))
def test(cfg, logger, vis):
    torch.cuda.manual_seed_all(66)
    torch.manual_seed(66)

    # Setup model, optimizer and loss function
    model_cls = get_model(cfg['model'])
    model = model_cls(cfg).to(device)
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in cfg["optimizer"].items()
                        if k != "name"}
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    crit = get_critical(cfg['critical'])().to(device)
    ssim = SSIM().to(device)

    model.eval()
    _, step = load_checkpoints(model, optimizer, cfg['checkpoint_dir'],
                               name='latest')

    # Setup Dataloader
    data_loader = get_loader(cfg["data"]["dataset"])
    data_path = cfg["data"]["path"]
    test_loader = data_loader(data_path,
                              split=cfg["data"]["test_split"],
                              patch_size=cfg['data']['patch_size'],
                              augmentation=cfg['data']['aug_data'])
    testloader = DataLoader(test_loader,
                            batch_size=cfg["batch_size"],
                            num_workers=cfg["n_workers"],
                            shuffle=True)

    all_num = 0
    all_losses = {}
    for i, batch in enumerate(testloader):
        O, B = batch
        O = Variable(O.to(device), requires_grad=False)
        B = Variable(B.to(device), requires_grad=False)
        R = O - B

        with torch.no_grad():
            O_Rs = model(O)
            loss_list = [crit(O_R, R) for O_R in O_Rs]
            ssim_list = [ssim(O - O_R, O - R) for O_R in O_Rs]

        # index the per-head losses with j to avoid confusion with the
        # batch index i and the ssim module above
        losses = {'loss%d' % j: loss.item()
                  for j, loss in enumerate(loss_list)}
        ssimes = {'ssim%d' % j: s.item()
                  for j, s in enumerate(ssim_list)}
        losses.update(ssimes)
        prediction = O - O_Rs[-1]

        batch_size = O.size(0)
        all_num += batch_size
        for key, val in losses.items():
            if i == 0:
                all_losses[key] = 0.
            all_losses[key] += val * batch_size
            logger.info('batch %d loss %s: %f' % (i, key, val))

        if vis is not None:
            for k, v in losses.items():
                vis.plot(k, v)
            vis.images(np.clip((prediction.detach().data * 255).cpu().numpy(),
                               0, 255), win='pred')
            vis.images(O.data.cpu().numpy(), win='input')
            vis.images(B.data.cpu().numpy(), win='groundtruth')

        if i % 20 == 0:
            save_image(name='test',
                       img_lists=[O.cpu(), prediction.cpu(), B.cpu()],
                       path=cfg['show_dir'], step=i,
                       batch_size=cfg['batch_size'])

    for key, val in all_losses.items():
        logger.info('total loss %s: %f' % (key, val / all_num))
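# A hypothetical invocation sketch for test(). Every key below is actually
# read by the function, but all values are placeholders (assumptions, not the
# project's defaults):
import logging

cfg = {
    'model': 'some_model',
    'optimizer': {'name': 'Adam', 'lr': 5e-3},
    'critical': 'MSE',
    'checkpoint_dir': 'checkpoints/',
    'data': {'dataset': 'some_dataset', 'path': 'data/',
             'test_split': 'test', 'patch_size': 64, 'aug_data': False},
    'batch_size': 16,
    'n_workers': 4,
    'show_dir': 'show/',
}
# test(cfg, logger=logging.getLogger('test'), vis=None)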
def __init__(self, config):
    self.config = config
    self.ckpt_dir = config.ckpt_dir
    if not os.path.exists(self.ckpt_dir):
        os.makedirs(self.ckpt_dir)
    self.save_config(config)
    self.timer = Timer()
    self.writer = SummaryWriter(log_dir=config.ckpt_dir)
    self.lr = config.lr
    self.lr_decay_start = config.lr_decay_start
    self.datasets, self.loaders = get_loader(config)
    self.max_iters = config.max_iters
    if self.max_iters is not None:
        self.epochs = self.max_iters // len(self.loaders['train'])
    else:
        self.epochs = config.epochs
    self.start_epoch = 0

    ### Network ###
    self.netG = GModelSelector[config.G_model].ResnetGenerator(
        input_nc=config.in_channels,
        output_nc=config.out_channels,
        use_dropout=config.use_dropout,
        **config.model_params[config.G_model])
    self.netD = DModelSelector[config.D_model].NLayerDiscriminator(
        input_nc=6, n_layers=3)
    self.criterion_GAN = nn.MSELoss()
    self.criterion_L1 = nn.L1Loss()

    if config.distributed:
        self.netG = nn.DataParallel(self.netG)
        self.netD = nn.DataParallel(self.netD)
        patch_replication_callback(self.netG)
        patch_replication_callback(self.netD)
    if self.config.cuda:
        self.netG = self.netG.cuda()
        self.netD = self.netD.cuda()
        self.criterion_GAN = self.criterion_GAN.cuda()
        self.criterion_L1 = self.criterion_L1.cuda()

    # self.criterion = LossSelector[config.loss](**config.loss_params[config.loss])
    self.optimizer_G = optim.Adam(self.netG.parameters(), lr=self.lr,
                                  betas=(0.9, 0.999), eps=1e-8)
    self.optimizer_D = optim.Adam(self.netD.parameters(), lr=self.lr,
                                  betas=(0.9, 0.999), eps=1e-8)
    if self.max_iters is not None:
        self.lr_decay_G = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer_G, self.max_iters)
        self.lr_decay_D = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer_D, self.max_iters)
    elif self.epochs is not None:
        self.lr_decay_G = optim.lr_scheduler.LambdaLR(
            self.optimizer_G, lr_lambda=self.lambda_rule)
        self.lr_decay_D = optim.lr_scheduler.LambdaLR(
            self.optimizer_D, lr_lambda=self.lambda_rule)
    else:
        raise ValueError('one of max_iters or epochs must be set; '
                         'both are None')

    self.best_loss = float('inf')
    if config.resume:
        logger.info('***Resume from checkpoint***')
        state = torch.load(os.path.join(self.ckpt_dir, 'netG.pt'))
        self.netG.load_state_dict(state['net'])
        self.start_epoch = state['epoch']
        self.best_loss = state['best_loss']
        self.optimizer_G.load_state_dict(state['optim'])
        self.lr_decay_G.load_state_dict(state['lr_decay_G'])
        self.lr_decay_G.last_epoch = self.start_epoch

        state = torch.load(os.path.join(self.ckpt_dir, 'netD.pt'))
        self.netD.load_state_dict(state['net'])
        self.start_epoch = state['epoch']
        self.best_loss = state['best_loss']
        self.optimizer_D.load_state_dict(state['optim'])
        self.lr_decay_D.load_state_dict(state['lr_decay_D'])
        self.lr_decay_D.last_epoch = self.start_epoch
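# The LambdaLR branch above passes self.lambda_rule, which is defined
# elsewhere in the class. A sketch of the common pix2pix-style linear decay
# it plausibly implements; the exact schedule is an assumption, with
# lr_decay_start and epochs driving the factor:

def lambda_rule(self, epoch):
    # keep the lr constant until lr_decay_start, then decay linearly to 0
    decay = max(0, epoch - self.lr_decay_start)
    span = max(1, self.epochs - self.lr_decay_start)
    return 1.0 - decay / span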
                         columns=[i for i in label])
    # fig = plt.figure(figsize=(10, 7))
    fig = sn.heatmap(df_cm, annot=True, cmap="BuPu", fmt='g')
    fig = fig.get_figure()
    fig.savefig(confusion_matrix, dpi=400)
    return


if __name__ == "__main__":
    main()

# The guard string below never equals "__main__", so this evaluation block is
# intentionally disabled; kept for reference.
if __name__ == "__main__ not use":
    ckpt_name = "affectnet7_mobilenet_small_floss_alpha2.pth.tar"
    print("Evaluating", ckpt_name)
    _, valLoader = datasets.get_loader(setname="affectnet7", batch_size=8,
                                       use_sampler=False, num_workers=4)
    ckpt = torch.load("ckpt/" + ckpt_name, map_location='cpu')
    model = MobileNetV3_Small()
    model.load_state_dict(ckpt["state_dict"])
    # rafdb_table = {1: "Surprise", 2: "Fear", 3: "Disgust", 4: "Happiness",
    #                5: "Sadness", 6: "Anger", 7: "Neutral"}
    affectnet7_table = {0: "Neutral", 1: "Happy", 2: "Sad", 3: "Surprise",
                        4: "Fear", 5: "Disgust", 6: "Anger"}
    eval(model, valLoader, affectnet7_table)
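# The snippet above opens mid-call: the construction of df_cm was truncated.
# It is presumably built along these lines (a sketch, assuming sklearn's
# confusion_matrix and a `label` dict mapping class indices to names; the
# helper name build_df_cm is hypothetical):
import pandas as pd
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

def build_df_cm(y_true, y_pred, label):
    cm = sk_confusion_matrix(y_true, y_pred)
    return pd.DataFrame(cm,
                        index=[i for i in label],
                        columns=[i for i in label])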
def train_unbalanced_descent(D, dataQ0, dataP, wP, opt):
    n_samples, n_features = dataQ0.shape
    device = dataQ0.device

    # Lagrange multiplier for Augmented Lagrangian
    lambda_aug = torch.tensor([opt.lambda_aug_init], requires_grad=True,
                              device=device)
    # MMD distance
    mmd = MMD_RFF(num_features=n_features, num_outputs=300).to(device)

    # Train
    print('Start training')
    if opt.plot_online:
        fig, ax = plt.subplots()
        ax.set_xlim((-1.1, 1.1))
        ax.set_ylim((-1.1, 1.1))
        scat = ax.scatter([], [], facecolor='r')

    # Save stuff
    collQ, coll_mmd = [], []
    birth_total, death_total = 0, 0
    dataQ = dataQ0.clone()

    for t in range(opt.T + 1):
        tic = time.time()

        # Snapshot of current state
        with torch.no_grad():
            mmd_PQ = mmd(dataP, dataQ,
                         weights_X=wP if wP is not None else None)
        coll_mmd.append(mmd_PQ)
        collQ.append(dataQ.detach().cpu().numpy())

        # (1) Update D network
        optimizerD = torch.optim.Adam(D.parameters(), lr=opt.lrD,
                                      weight_decay=opt.wdecay, amsgrad=True)
        D.train()
        for i in range(opt.n_c_startup if t == 0 else opt.n_c):
            optimizerD.zero_grad()
            x_p, w_p = minibatch((dataP, wP), opt.batchSizeD)
            x_q = minibatch(dataQ, opt.batchSizeD).requires_grad_(True)
            loss, Ep_f, Eq_f, normgrad_f2_q = D_forward_weights(
                D, x_p, w_p, x_q, 1.0, lambda_aug, opt.alpha, opt.rho)
            loss.backward()
            optimizerD.step()
            manual_sgd_(lambda_aug, opt.rho)
        tocD = time.time() - tic

        # (2) Update Q distribution (with birth/death)
        D.eval()
        # compute initial m_f
        with torch.no_grad():
            x_q = minibatch(dataQ)
            m_f = D(x_q).mean()

        # Update particle positions, and compute birth-death scores
        new_x_q, b_j = [], []
        for x_q, in get_loader(dataQ, batch_size=opt.batchSizeQ):
            x_q = x_q.detach().requires_grad_(True)
            sum_f_q = D(x_q).sum()
            grad_x_q = grad(outputs=sum_f_q, inputs=x_q,
                            create_graph=True)[0]
            with torch.no_grad():
                new_x_q.append(x_q + opt.lrQ * grad_x_q)
                f_q_new = D(new_x_q[-1])
                # birth-death score
                m_f = m_f + (1 / n_samples) * (f_q_new.sum() - sum_f_q)
                b_j.append(f_q_new.view(-1) - m_f)
        new_x_q = torch.cat(new_x_q)
        b_j = torch.cat(b_j)

        # Birth
        idx_alive = (b_j > 0).nonzero().view(-1)
        p_j = 1 - torch.exp(-opt.alpha * opt.tau * b_j[idx_alive])
        idx_birth = idx_alive[p_j > torch.rand_like(p_j)]

        # Death
        idx_neg = (b_j <= 0).nonzero().view(-1)
        p_j = 1 - torch.exp(-opt.alpha * opt.tau * torch.abs(b_j[idx_neg]))
        ix_die = p_j > torch.rand_like(p_j)  # Particles that die
        idx_dead = idx_neg[ix_die]
        idx_notdead = idx_neg[~ix_die]  # Particles that don't die

        birth_total += len(idx_birth)
        death_total += len(idx_dead)

        if not opt.keep_order:
            new_x_q.data = new_x_q.data[torch.cat((idx_alive, idx_notdead,
                                                   idx_birth))]
            # Resize population
            if opt.balance:
                n_l = new_x_q.shape[0]
                if n_l < n_samples:
                    # Randomly double particles
                    r_idx = torch.randint(n_l, (n_samples - n_l,))
                    new_x_q = torch.cat((new_x_q, new_x_q[r_idx]))
                if n_l > n_samples:
                    # Randomly kill particles
                    r_idx = torch.randperm(n_l)[:n_samples]  # Particles to keep
                    new_x_q = new_x_q[r_idx]
        else:
            # Sample dead samples from cloned ones (if there are any),
            # otherwise sample them from alive
            if len(idx_birth) > 0:
                r_idx = idx_birth[torch.randint(len(idx_birth),
                                                (len(idx_dead),))]
            else:
                r_idx = idx_alive[torch.randint(len(idx_alive),
                                                (len(idx_dead),))]
            new_x_q.data[idx_dead] = new_x_q.data[r_idx]

        dataQ = new_x_q.data

        # (3) print some stuff
        if t % opt.log_every == 0:
            x_p, w_p = minibatch((dataP, wP))
            x_q = minibatch(dataQ)
            loss, Ep_f, Eq_f, normgrad_f2_q = D_forward_weights(
                D, x_p, w_p, x_q, 1.0, lambda_aug, opt.alpha, opt.rho)
            with torch.no_grad():
                SobDist_lasti = Ep_f.item() - Eq_f.item()
                mmd_dist = mmd(dataP, dataQ,
                               weights_X=wP if wP is not None else None)
            print('[{:5d}/{}] SobolevDist={:.4f}\t mmd={:.5f} births={} '
                  'deaths={} Eq_normgrad_f2[stepQ]={:.3f} Ep_f={:.2f} '
                  'Eq_f={:.2f} lambda_aug={:.4f}'.format(
                      t, opt.T, SobDist_lasti, mmd_dist, birth_total,
                      death_total, normgrad_f2_q.mean().item(), Ep_f.item(),
                      Eq_f.item(), lambda_aug.item()))
            if opt.plot_online:
                # the original called line.set_data(), but only the scatter
                # `scat` is created above; update its offsets instead
                scat.set_offsets(dataQ[:, :2].detach().cpu().numpy())
                plt.pause(0.01)

    return dataQ, collQ, coll_mmd
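# train_unbalanced_descent() leans on two small helpers, minibatch() and
# manual_sgd_(), defined elsewhere. A sketch consistent with the call sites
# above; the bodies are assumptions, not the project's actual code:
import torch

def minibatch(data, batch_size=None):
    """Sample a random minibatch from a tensor or a (data, weights) tuple.

    Called both as minibatch(dataQ, bs) -> tensor and as
    minibatch((dataP, wP), bs) -> (tensor, weights-or-None).
    With batch_size=None, returns everything.
    """
    first = data[0] if isinstance(data, tuple) else data
    n = first.shape[0]
    idx = torch.randperm(n)[:batch_size] if batch_size else torch.arange(n)
    if isinstance(data, tuple):
        return tuple(d[idx] if d is not None else None for d in data)
    return data[idx]

def manual_sgd_(param, lr):
    """In-place gradient step on a leaf tensor (the Lagrange multiplier).

    The ascent/descent sign depends on how D_forward_weights builds the
    loss; a plain descent step is shown here as an assumption.
    """
    with torch.no_grad():
        param -= lr * param.grad
        param.grad.zero_()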
if __name__ == '__main__':
    # Set up random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        # Setting benchmark to False might slow down the training speed;
        # note that benchmark=True trades reproducibility for speed even
        # with deterministic=True set above
        torch.backends.cudnn.benchmark = True  # False

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    start_epoch = 0

    # Set up data
    print("Loading data")
    train_dataloader, val_dataloader = get_loader(args)

    # Set up model and loss function
    print("Creating model")
    model = get_model(args)
    # device_count = torch.cuda.device_count()
    # if device_count > 1:
    #     model = nn.DataParallel(model)
    model = model.to(device)
    # model.load_state_dict(torch.load('../test_v2/2020-05-09-05-41-04/epoch21.pth')['state_dict'])

    if args.resume_dir and not args.debug:
        # Load checkpoint
        print('==> Resuming from checkpoint')
        checkpoint = torch.load(os.path.join(args.resume_dir, 'ckpt.pth'))
        model.load_state_dict(checkpoint['state_dict'])
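# If exact reproducibility matters more than speed, the usual fully
# deterministic variant of the seeding block above looks like this (a
# general PyTorch recipe, not this repo's code):
import random
import numpy as np
import torch

def set_deterministic(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False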
def visEmbed(exp_dict):
    src_loader = datasets.get_loader(exp_dict["src_dataset"], "train",
                                     batch_size=exp_dict["src_batch_size"],
                                     exp_dict=exp_dict)
    tgt_val_loader = datasets.get_loader(exp_dict["tgt_dataset"], "val",
                                         batch_size=exp_dict["tgt_batch_size"],
                                         exp_dict=exp_dict)

    src_model, src_opt, src_scheduler = models.get_model(
        exp_dict["src_model"], exp_dict["n_outputs"],
        input_channels=exp_dict['input_channels'],
        patch_size=exp_dict['patch_size'],
        n_classes=exp_dict['n_classes'])
    src_model.load_state_dict(
        torch.load(exp_dict["path"] +
                   "/model_src_run{}.pth".format(exp_dict['run'])))

    tgt_model, tgt_opt, tgt_scheduler = models.get_model(
        exp_dict["tgt_model"], exp_dict["n_outputs"],
        input_channels=exp_dict['input_channels'],
        patch_size=exp_dict['patch_size'],
        n_classes=exp_dict['n_classes'])
    tgt_model.load_state_dict(
        torch.load(exp_dict["path"] +
                   "/model_tgt_run{}.pth".format(exp_dict['run'])))

    X, X_tgt = losses.extract_embeddings(src_model, src_loader)
    Y, Y_tgt = losses.extract_embeddings(tgt_model, tgt_val_loader)
    X, X_tgt = X[:500], X_tgt[:500]
    Y, Y_tgt = Y[:500], Y_tgt[:500]

    n_classes = tgt_model.n_classes
    src_kmeans = KMeans(n_clusters=n_classes)
    src_kmeans.fit(X)
    Xc = src_kmeans.cluster_centers_

    clf = neighbors.KNeighborsClassifier(n_neighbors=2)
    clf.fit(X, X_tgt)
    Xc_tgt = clf.predict(Xc)

    # t-SNE has no transform() for new points, so embed target features,
    # source features, and cluster centers jointly and slice the result
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    S_tsne = tsne.fit_transform(np.vstack([Y, X, Xc]))
    Y_tsne = S_tsne[:Y.shape[0]]
    X_tsne = S_tsne[Y.shape[0]:-n_classes]
    Xc_tsne = S_tsne[-n_classes:]

    colors = ["b", "g", "r", "c", "m", "y", "gray", "w", "chocolate",
              "olive", "pink"]

    # Source embedding figure
    fig = plt.figure(figsize=(6, 6))
    plt.grid(linestyle='dotted')
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6, edgecolors="black")
    for c in range(n_classes):
        ind = Xc_tgt == c
        color = colors[c + 1]
        plt.scatter(Xc_tsne[ind][:, 0], Xc_tsne[ind][:, 1], s=250, c=color,
                    edgecolors="black", marker="*")
    plt.xlabel("t-SNE Feature 2")
    plt.ylabel("t-SNE Feature 1")
    title = "Source Dataset ({}) - Center: {} - Adv: {}".format(
        exp_dict["src_dataset"].upper().replace("BIG", ""),
        exp_dict["options"]["center"], exp_dict["options"]["disc"])
    plt.title(title)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig("figures/src_{}_center_{}_disc_{}.pdf".format(
        exp_dict["exp_name"].replace(" ", ""),
        exp_dict["options"]["center"], exp_dict["options"]["disc"]),
        bbox_inches='tight', transparent=False)
    plt.savefig("figures/src_{}_center_{}_disc_{}.png".format(
        exp_dict["exp_name"], exp_dict["options"]["center"],
        exp_dict["options"]["disc"]),
        bbox_inches='tight', transparent=False)

    # Target embedding figure
    fig = plt.figure(figsize=(6, 6))
    plt.grid(linestyle='dotted')
    for c in range(n_classes):
        ind = Y_tgt == c
        color = colors[c + 1]
        plt.scatter(Y_tsne[ind][:, 0], Y_tsne[ind][:, 1], alpha=0.6, c=color,
                    edgecolors="black")
    for c in range(n_classes):
        ind = Xc_tgt == c
        color = colors[c + 1]
        plt.scatter(Xc_tsne[ind][:, 0], Xc_tsne[ind][:, 1], s=350, c=color,
                    edgecolors="black", marker="*")
    plt.xlabel("t-SNE Feature 2")
    plt.ylabel("t-SNE Feature 1")
    title = "Target Dataset ({}) - Center: {} - Adv: {}".format(
        exp_dict["tgt_dataset"].upper().replace("BIG", ""),
        exp_dict["options"]["center"], exp_dict["options"]["disc"])
    plt.title(title)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig("figures/tgt_{}_center_{}_disc_{}.pdf".format(
        exp_dict["exp_name"], exp_dict["options"]["center"],
        exp_dict["options"]["disc"]),
        bbox_inches='tight', transparent=False)
    plt.savefig("figures/tgt_{}_center_{}_disc_{}.png".format(
        exp_dict["exp_name"], exp_dict["options"]["center"],
        exp_dict["options"]["disc"]),
        bbox_inches='tight', transparent=False)
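# visEmbed() embeds target features, source features, and cluster centers
# jointly before slicing, because sklearn's TSNE has no transform() for
# unseen points. A minimal standalone illustration of the same pattern,
# with placeholder random data standing in for the real embeddings:
import numpy as np
from sklearn import manifold

Y = np.random.randn(500, 128)   # placeholder target embeddings
X = np.random.randn(500, 128)   # placeholder source embeddings
Xc = np.random.randn(10, 128)   # placeholder cluster centers

S = manifold.TSNE(n_components=2, init='pca',
                  random_state=0).fit_transform(np.vstack([Y, X, Xc]))
Y_2d = S[:len(Y)]
X_2d = S[len(Y):-len(Xc)]
Xc_2d = S[-len(Xc):]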