def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='/media/shenkev/data/Ubuntu/vsepp/data/data', help='path to datasets') parser.add_argument('--data_name', default='coco_precomp', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='runs/coco_vse', help='Path to save the model and Tensorboard log.') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', default=True, action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=4096, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--cnn_type', default='vgg19', help="""The CNN used for image encoder (e.g. 
vgg19, resnet152)""") parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = pickle.load( open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) opt.vocab_size = len(vocab) # Load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt) # Construct the model model = VSE(opt) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model best_rsum = 0 for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) save_checkpoint( { 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, prefix=opt.logger_name + '/')
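# A minimal sketch (not part of this file) of the two helpers the training loop above
# relies on: `adjust_learning_rate` and `save_checkpoint` are referenced but defined
# elsewhere in the codebase, so the bodies below are assumptions that match how they
# are called, not necessarily the authors' implementation.
import shutil

import torch


def adjust_learning_rate(opt, optimizer, epoch):
    """Assumed schedule: decay the learning rate by 10x every `opt.lr_update` epochs."""
    lr = opt.learning_rate * (0.1 ** (epoch // opt.lr_update))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', prefix=''):
    """Write the latest state; mirror it to `model_best.pth.tar` when it improves."""
    torch.save(state, prefix + filename)
    if is_best:
        shutil.copyfile(prefix + filename, prefix + 'model_best.pth.tar')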
def main():
    args = parser.parse_args()
    if args.seed is None:
        args.seed = random.randint(1, 10000)
    print("Random Seed: ", args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpus:
        torch.cuda.manual_seed_all(args.seed)

    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        device = 'cuda:' + str(args.gpus[0])
        cudnn.benchmark = True
    else:
        device = 'cpu'

    if args.type == 'float64':
        dtype = torch.float64
    elif args.type == 'float32':
        dtype = torch.float32
    elif args.type == 'float16':
        dtype = torch.float16
    else:
        raise ValueError('Wrong type!')  # TODO int8

    model = MobileNet2(input_size=args.input_size, scale=args.scaling)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    print(model)
    print('number of parameters: {}'.format(num_parameters))
    print('FLOPs: {}'.format(
        flops_benchmark.count_flops(MobileNet2,
                                    args.batch_size // len(args.gpus) if args.gpus is not None else args.batch_size,
                                    device, dtype, args.input_size, 3, args.scaling)))

    train_loader, val_loader = get_loaders(args.dataroot, args.batch_size, args.batch_size, args.input_size,
                                           args.workers)
    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()

    if args.gpus is not None:
        model = torch.nn.DataParallel(model, args.gpus)
    model.to(device=device, dtype=dtype)
    criterion.to(device=device, dtype=dtype)

    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum,
                                weight_decay=args.decay, nesterov=True)
    if args.find_clr:
        find_bounds_clr(model, train_loader, optimizer, criterion, device, dtype, min_lr=args.min_lr,
                        max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode,
                        save_path=save_path)
        return

    if args.clr:
        scheduler = CyclicLR(optimizer, base_lr=args.min_lr, max_lr=args.max_lr,
                             step_size=args.epochs_per_step * len(train_loader), mode=args.mode)
    else:
        scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma)

    best_test = 0

    # optionally resume from a checkpoint
    data = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        elif os.path.isdir(args.resume):
            checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar')
            csv_path = os.path.join(args.resume, 'results.csv')
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
            data = []
            with open(csv_path) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        loss, top1, top5 = test(model, val_loader, criterion, device, dtype)  # TODO
        return

    csv_logger = CsvLogger(filepath=save_path, data=data)
    csv_logger.save_params(sys.argv, args)

    claimed_acc1 = None
    claimed_acc5 = None
    if args.input_size in claimed_acc_top1:
        if args.scaling in claimed_acc_top1[args.input_size]:
            claimed_acc1 = claimed_acc_top1[args.input_size][args.scaling]
            claimed_acc5 = claimed_acc_top5[args.input_size][args.scaling]
            csv_logger.write_text(
                'Claimed accuracies are: {:.2f}% top-1, {:.2f}% top-5'.format(claimed_acc1 * 100.,
                                                                              claimed_acc5 * 100.))
    train_network(args.start_epoch, args.epochs, scheduler, model, train_loader, val_loader, optimizer, criterion,
                  device, dtype, args.batch_size, args.log_interval, csv_logger, save_path, claimed_acc1,
                  claimed_acc5, best_test)
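# Hedged sketch: the `CyclicLR` scheduler used above is a project-local class, not
# torch's built-in. The function below only illustrates the triangular cyclical
# learning-rate policy (Smith, 2017) that such a scheduler typically implements;
# the name and exact policy are assumptions.
def triangular_lr(iteration, base_lr, max_lr, step_size):
    """Return the learning rate at `iteration` under a triangular cycle of length 2*step_size."""
    cycle = iteration // (2 * step_size)
    x = abs(iteration / step_size - 2 * cycle - 1)
    return base_lr + (max_lr - base_lr) * max(0.0, 1.0 - x)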
def main(): opt = get_opt() tb_logger.configure(opt.logger_name, flush_secs=5, opt=opt) logfname = os.path.join(opt.logger_name, 'log.txt') logging.basicConfig(filename=logfname, format='%(asctime)s %(message)s', level=logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) logging.info(str(opt.d)) torch.manual_seed(opt.seed) if opt.cuda: # TODO: remove deterministic torch.backends.cudnn.deterministic = True torch.cuda.manual_seed(opt.seed) np.random.seed(opt.seed) # helps with wide-resnet by reducing memory and time 2x cudnn.benchmark = True train_loader, test_loader, train_test_loader = get_loaders(opt) if opt.epoch_iters == 0: opt.epoch_iters = int( np.ceil(1. * len(train_loader.dataset) / opt.batch_size)) opt.maxiter = opt.epoch_iters * opt.epochs if opt.g_epoch: opt.gvar_start *= opt.epoch_iters opt.g_optim_start = (opt.g_optim_start * opt.epoch_iters) + 1 model = models.init_model(opt) optimizer = OptimizerFactory(model, train_loader, tb_logger, opt) epoch = 0 save_checkpoint = utils.SaveCheckpoint() # optionally resume from a checkpoint if not opt.noresume: model_path = os.path.join(opt.logger_name, opt.ckpt_name) if os.path.isfile(model_path): print("=> loading checkpoint '{}'".format(model_path)) checkpoint = torch.load(model_path) best_prec1 = checkpoint['best_prec1'] optimizer.gvar.load_state_dict(checkpoint['gvar']) optimizer.niters = checkpoint['niters'] epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['model']) save_checkpoint.best_prec1 = best_prec1 print("=> loaded checkpoint '{}' (epoch {}, best_prec {})".format( model_path, epoch, best_prec1)) else: print("=> no checkpoint found at '{}'".format(model_path)) if opt.niters > 0: max_iters = opt.niters else: max_iters = opt.epochs * opt.epoch_iters if opt.untrain_steps > 0: untrain(model, optimizer.gvar, opt) while optimizer.niters < max_iters: optimizer.epoch = epoch utils.adjust_lr(optimizer, opt) ecode = train(tb_logger, epoch, train_loader, model, optimizer, opt, test_loader, save_checkpoint, train_test_loader) if ecode == -1: break epoch += 1 tb_logger.save_log()
                    required=False, help='Model Save Directory')
parser.add_argument('--graph_dir', type=str, default="./graphs/cifar10_r10", required=False,
                    help='Graph Save Directory')
args = parser.parse_args()
print(args)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

train_loader, test_loader = get_loaders(args)
asmv = AsymValley(args.graph_dir)

num_classes = 10
model = models.resnet18()
model.fc = nn.Linear(512, num_classes)
model.load_state_dict(torch.load("./checkpoints/r18_10"))
criterion = nn.CrossEntropyLoss()
model = model.to(device)

asmv.draw(model, evalav, train_loader, test_loader, criterion, 1.8, 100)
def main():
    if torch.cuda.is_available():
        args.device = torch.device('cuda')
        args.cuda = True
    else:
        args.device = torch.device('cpu')
        args.cuda = False

    # Make as reproducible as possible.
    # Please note that pytorch does not let us make things completely reproducible across machines.
    # See https://pytorch.org/docs/stable/notes/randomness.html
    if args.seed is not None:
        print('setting seed', args.seed)
        torch.manual_seed(args.seed)
        if args.cuda:
            torch.cuda.manual_seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

    # Get data
    train_loader, train_eval_loader, val_loader, _ = data.get_loaders(args)
    args.n_groups = train_loader.dataset.n_groups

    # Get model
    model = utils.get_model(args, image_shape=train_loader.dataset.image_shape)
    model = model.to(args.device)

    # Loss Fn
    if args.use_robust_loss:
        loss_fn = nn.CrossEntropyLoss(reduction='none')
        loss_computer = LossComputer(loss_fn, is_robust=True, dataset=train_loader.dataset,
                                     step_size=args.robust_step_size, device=args.device, args=args)
    else:
        loss_fn = nn.CrossEntropyLoss()

    # Optimizer
    if args.optimizer == 'adam':
        # This is used for MNIST.
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    elif args.optimizer == 'sgd':
        # From DRNN paper
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                    lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay)

    # Train loop
    best_worst_case_acc = 0
    best_worst_case_acc_epoch = 0
    avg_val_acc = 0
    empirical_val_acc = 0

    for epoch in trange(args.num_epochs):
        total_loss = 0
        total_accuracy = 0
        total_examples = 0

        model.train()
        for batch_id, (images, labels, group_ids) in enumerate(tqdm(train_loader, desc='train loop')):
            # Put on GPU
            images = images.to(args.device)
            labels = labels.to(args.device)

            # Forward
            logits = model(images)
            if args.use_robust_loss:
                group_ids = group_ids.to(args.device)
                loss = loss_computer.loss(logits, labels, group_ids, is_training=True)
            else:
                loss = loss_fn(logits, labels)

            # Evaluate: use the mean batch accuracy here, since it is re-weighted by the
            # batch size below (summing the correct count and multiplying again would
            # over-count and push the logged train accuracy above 1).
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            accuracy = np.mean(preds == labels.detach().cpu().numpy().reshape(-1))
            total_accuracy += accuracy * labels.shape[0]
            total_loss += loss.item() * labels.shape[0]
            total_examples += labels.shape[0]

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % args.epochs_per_eval == 0:
            worst_case_acc, stats = utils.evaluate_groups(args, model, val_loader, epoch, split='val')  # validation

            # Track early stopping values with respect to worst case.
            if worst_case_acc > best_worst_case_acc:
                best_worst_case_acc = worst_case_acc
                save_model(model, ckpt_dir, epoch, args.device)

            # Log early stopping values
            if args.log_wandb:
                wandb.log({"Train Loss": total_loss / total_examples,
                           "Best Worst Case Val Acc": best_worst_case_acc,
                           "Train Accuracy": total_accuracy / total_examples,
                           "epoch": epoch})

            print("Epoch: ", epoch, "Worst Case Acc: ", worst_case_acc)
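# Hedged sketch of the group-wise evaluation this script delegates to
# `utils.evaluate_groups`: compute per-group accuracy and return the worst one.
# Function and variable names here are illustrative assumptions, not the
# project's actual helper.
import numpy as np
import torch


def evaluate_groups_sketch(model, loader, n_groups, device):
    model.eval()
    correct = np.zeros(n_groups)
    total = np.zeros(n_groups)
    with torch.no_grad():
        for images, labels, group_ids in loader:
            preds = model(images.to(device)).argmax(dim=1).cpu().numpy()
            labels = labels.numpy().reshape(-1)
            groups = group_ids.numpy()
            for g in range(n_groups):
                mask = groups == g
                correct[g] += (preds[mask] == labels[mask]).sum()
                total[g] += mask.sum()
    group_accs = correct / np.maximum(total, 1)
    return group_accs.min(), {'group_accs': group_accs}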
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='/data', help='path to datasets') parser.add_argument('--data_name', default='precomp', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.05, type=float, help='loss margin.') parser.add_argument('--temperature', default=14, type=int, help='loss temperature.') parser.add_argument('--num_epochs', default=9, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=2048, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=4, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='runs/runX', help='Path to save the model and Tensorboard log.') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--seed', default=1, type=int, help='random seed.') parser.add_argument('--use_atten', action='store_true', help='use_atten') parser.add_argument('--lambda_softmax', default=9., type=float, help='Attention softmax temperature.') parser.add_argument('--use_box', action='store_true', help='use_box') parser.add_argument('--use_label', action='store_true', help='use_label') parser.add_argument('--use_mmd', action='store_true', help='use_mmd') parser.add_argument('--score_path', default='../user_data/score.npy', type=str) opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) #tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper #vocab = pickle.load(open(os.path.join( # opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) #vocab = deserialize_vocab(os.path.join(opt.vocab_path, 'kdd2020_caps_vocab_train_val_threshold2.json')) vocab = deserialize_vocab( os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name)) opt.vocab_size = len(vocab) stoppath = os.path.join(opt.vocab_path, 'stopwords.txt') f_stop = open(stoppath, 'r') stops = f_stop.readlines() stopwords = [] for sw in stops: sw = sw.strip() #.encode('utf-8').decode('utf-8') stopwords.append(sw) # Load data loaders 
train_loader, val_loader = data.get_loaders(opt.data_name, vocab, stopwords, opt.batch_size, opt.workers, opt, True) # Construct the model model = VSRN(opt) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume))
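# Hedged sketch: in these retrieval scripts `validate` typically encodes the
# validation split and scores it with a summed recall ("rsum"). The helpers below
# show that metric from a precomputed similarity matrix under the simplifying
# assumption of one matching caption per image; the real datasets pair each image
# with several captions, so the project's own evaluation code differs.
import numpy as np


def recall_at_k(sims, k):
    """sims: (n_queries, n_targets); ground-truth target for query i is index i (assumption)."""
    ranks = (-sims).argsort(axis=1)
    gt = np.arange(sims.shape[0])
    return 100.0 * np.mean([(ranks[i, :k] == gt[i]).any() for i in gt])


def rsum_from_sims(sims):
    """Sum of R@{1,5,10} in both retrieval directions, as used for model selection."""
    i2t = sum(recall_at_k(sims, k) for k in (1, 5, 10))
    t2i = sum(recall_at_k(sims.T, k) for k in (1, 5, 10))
    return i2t + t2i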
def main(args, asmv, counter):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    train_loader, test_loader = get_loaders(args)

    num_classes = 10
    # model = Mark001(num_classes).to(device)
    model = models.resnet18()
    model.fc = nn.Linear(512, num_classes)
    # model = nn.DataParallel(model)
    model = model.to(device)
    model.train()

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=args.lr_init,
        amsgrad=True,
        # momentum=args.momentum,
        weight_decay=args.wd
    )
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.9, last_epoch=-1)
    criterion = nn.CrossEntropyLoss()
    # summary(model, (3, 32, 32))
    # s = Surface()

    score = 0
    for x in range(args.epochs):
        losses = AverageMeter('Loss', ':.6f')
        for i, (inputs, label) in enumerate(train_loader):
            inputs, label = inputs.to(device), label.to(device)
            output = model(inputs)
            loss = criterion(output, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.update(loss.item(), inputs.size(0))
            # if i % args.surface_track_freq == 0:
            #     s.add(model, loss.item(), score)
            if i % args.print_freq == 0:
                print("\tStep: ", i, losses)
        # scheduler.step()
        print("Epoch: ", x, losses)
        score = validate(test_loader, model, criterion, device)
        # s.add(model, loss.item(), score)

    torch.save(model.state_dict(), f"./{args.model_dir}/r18_10")
    # s.plot()
    asmv.draw(model, evalav, train_loader, test_loader, criterion, 0.5, counter)
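# `AverageMeter` is used above but defined elsewhere; this is the common
# PyTorch-example implementation it appears to follow (an assumption).
class AverageMeter(object):
    """Tracks the current value and the running average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)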
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='./Bottom_up_atten_feature/data', help='path to datasets') parser.add_argument('--data_name', default='f30k_precomp', help='{coco_precomp_original, f30k_precomp') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary json files.') parser.add_argument('--orig_img_path', default='./data/', help='path to get the original image data') parser.add_argument('--orig_data_name', default='f30k', help='{coco,f30k}') parser.add_argument('--use_restval', action='store_false', help='Use the restval data for training on MSCOCO.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=50, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=25, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=40, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=200, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=2000, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='./runs/f30k/CVSE_f30k/log', help='Path to save Tensorboard log.') parser.add_argument('--model_name', default='./runs/f30k/CVSE_f30k/', help='Path to save the model.') parser.add_argument('--resume', default='./runs/f30k/CVSE_f30k/model_best.pth.tar', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_false', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--no_txtnorm', action='store_true', help='Do not normalize the text embeddings.') parser.add_argument('--precomp_enc_type', default="basic", help='basic|weight_norm') parser.add_argument('--bi_gru', action='store_false', help='Use bidirectional GRU.') parser.add_argument('--use_BatchNorm', action='store_false', help='Whether to use BN.') parser.add_argument('--activation_type', default='tanh', help='choose type of activation functions.') parser.add_argument('--dropout_rate', default=0.4, type=float, help='dropout rate.') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--attribute_path', default='data/f30k_annotations/Concept_annotations/', help='path to get attribute json file' ) # absolute path (get from path of SAN model) parser.add_argument('--num_attribute', default=300, type=int, help='dimension of 
Attribute annotation') parser.add_argument('--input_channel', default=300, type=int, help='dimension of initial word embedding') parser.add_argument( '--inp_name', default= 'data/f30k_annotations/Concept_annotations/f30k_concepts_glove_word2vec.pkl', help='load the input glove word embedding file') parser.add_argument( '--adj_file', default= 'data/f30k_annotations/Concept_annotations/f30k_adj_concepts.pkl', help='load the adj file') parser.add_argument('--learning_rate_MLGCN', default=.0002, type=float, help='learning rate of module of MLGCN.') parser.add_argument('--lr_MLGCN_update', default=10, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--Concept_label_ratio', default=0.35, type=float, help='The ratio of concept label.') parser.add_argument( '--concept_name', default= 'data/f30k_annotations/Concept_annotations/category_concepts.json', help='load the input concrete words of concepts') parser.add_argument('--norm_func_type', default='sigmoid', help='choose type of norm functions.') parser.add_argument( '--feature_fuse_type', default='weight_sum', help= 'choose the fusing type for raw feature and attribute feature (multiple|concat|adap_sum|weight_sum))' ) parser.add_argument('--wemb_type', default='glove', choices=('glove', 'fasttext', 'random_init'), type=str, help='Word embedding (glove|fasttext|random_init)') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = pickle.load( open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) opt.vocab_size = len(vocab) '''load the vocab word2idx''' word2idx = vocab.word2idx # Load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.batch_size, opt.workers, opt) # Construct the model # model = CVSE(opt) model = CVSE(word2idx, opt) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model best_rsum = 0 for epoch in range(opt.num_epochs): print(opt.logger_name) print(opt.model_name) adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) if not os.path.exists(opt.model_name): os.mkdir(opt.model_name) save_checkpoint( { 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, filename='checkpoint_{}.pth.tar'.format(epoch), prefix=opt.model_name + '/')
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='/data/stud/jorgebjorn/data/', help='path to datasets') parser.add_argument('--data_name', default='f8k_precomp', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='/data/stud/jorgebjorn/data/vocab/', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument( '--logger_name', default='/data/stud/jorgebjorn/runs/{}/{}'.format( getpass.getuser(), datetime.datetime.now().strftime("%d-%m-%y_%H:%M")), help='Path to save the model and Tensorboard log.') parser.add_argument('--selection', default='uncertainty', help='Active learning selection algorithm') parser.add_argument('--primary', default='images', help='Image- or caption-centric active learning') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=4096, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--device', default=0, type=int, help='which gpu to use') parser.add_argument('--cnn_type', default='vgg19', help="""The CNN used for image encoder (e.g. 
vgg19, resnet152)""") parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--reset_train', action='store_true', help='Ensure the training is always done in ' 'train mode (Not recommended).') parser.add_argument('--no_log', action='store_true', default=False, help='Disable logging') opt = parser.parse_args() opt.logger_name += "_" + opt.selection + "_" + opt.primary print(opt) if torch.cuda.is_available(): torch.cuda.set_device(opt.device) # Setup tensorboard logger if not opt.no_log: logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = pickle.load( open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) opt.vocab_size = len(vocab) # Load data loaders active_loader, train_loader, val_loader = data.get_loaders( opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt) # Construct the model model = VSE(opt) if torch.cuda.is_available(): model.cuda() # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) n_rounds = 234 if opt.selection == "uncertainty": selection = select_uncertainty elif opt.selection == "margin": selection = select_margin elif opt.selection == "random": selection = select_random elif opt.selection == "hybrid": selection = select_hybrid elif opt.selection == "all": selection = select_all elif opt.selection == "capsim": selection = select_captionSimilarity else: selection = select_uncertainty for r in range(n_rounds): best_indices = selection(r, model, train_loader) for index in best_indices: active_loader.dataset.add_single(train_loader.dataset[index][0], train_loader.dataset[index][1]) train_loader.dataset.delete_indices(best_indices) # Train the Model print("Training on {} items ".format(len(active_loader))) # Reset the model model = VSE(opt) if torch.cuda.is_available(): model.cuda() best_rsum = 0 for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, active_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model, not opt.no_log, r)
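# Hedged sketch: the select_* strategies referenced above are defined elsewhere.
# The stub below only illustrates the interface they appear to share
# (round index, model, loader) -> indices to move into the active pool; the
# `n_select` parameter is an assumption, not a value from the source.
import random


def select_random(r, model, train_loader, n_select=128):
    return random.sample(range(len(train_loader.dataset)), n_select)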
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument( '--data_path', default='/data3/zhangyf/cross_modal_retrieval/SCAN/data', help='path to datasets') parser.add_argument('--data_name', default='f30k_precomp', help='{coco,f30k}_precomp') parser.add_argument( '--vocab_path', default='/data3/zhangyf/cross_modal_retrieval/SCAN/vocab/', help='Path to saved vocabulary json files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=20, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--decoder_dim', default=512, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=10, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=4, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=30, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='./runs/runX/log', help='Path to save Tensorboard log.') parser.add_argument('--model_name', default='./runs/runX/checkpoint', help='Path to save the model.') parser.add_argument( '--resume', default= '/data3/zhangyf/cross_modal_retrieval/vsepp_next_train_12_31_f30k/run/coco_vse++_ft_128_f30k_next/model_best.pth.tar', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--no_txtnorm', action='store_true', help='Do not normalize the text embeddings.') parser.add_argument('--precomp_enc_type', default="basic", help='basic|weight_norm') parser.add_argument('--reset_train', action='store_true', help='Ensure the training is always done in ' 'train mode (Not recommended).') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--cnn_type', default='resnet152', help="""The CNN used for image encoder (e.g. 
vgg19, resnet152)""")
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = SCAN(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        print(opt.logger_name)
        print(opt.model_name)
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        best_rsum = train(opt, train_loader, model, epoch, val_loader, best_rsum)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        if not os.path.exists(opt.model_name):
            os.mkdir(opt.model_name)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, filename='checkpoint_{}.pth.tar'.format(epoch), prefix=opt.model_name + '/')
def main(premise_hidden_size, hypo_hidden_size, linear_hidden_size, interaction_type, device, kind, num_layers=1, bidirectional=True, kernel_size=3, lr=1e-4, test=False, model_dir='models'): valid_types = ('cat', 'element_wise_mult') if interaction_type not in valid_types: raise ValueError('interaction_type can only be: ', valid_types) # data batch_size = 32 save_freq = 500 max_epochs = 40 train_loader, val_loader = data.get_loaders(batch_size, test=test) # model embed_size = 300 ind2vec = data.get_table_lookup() if kind == 'rnn': model = models.SNLI_Model(ind2vec, embed_size, premise_hidden_size, hypo_hidden_size, linear_hidden_size, interaction_type, device, kind='rnn', num_layers=num_layers, bidirectional=bidirectional) else: model = models.SNLI_Model(ind2vec, embed_size, premise_hidden_size, hypo_hidden_size, linear_hidden_size, interaction_type, device, kind='cnn', kernel_size=kernel_size) model = model.to(device) optimizer = torch.optim.Adam( [param for param in model.parameters() if param.requires_grad], lr=lr) loss_fn = torch.nn.CrossEntropyLoss() model_name = f'{kind}_model_{premise_hidden_size}_{interaction_type}' model_dir = os.path.join(model_dir, model_name) train_helper = train_helpers.TrainHelper(device, model, loss_fn, optimizer, models.batch_params_key, model_dir, test) train_loss, val_loss, train_acc, val_acc = train_helper.train_loop( train_loader, val_loader, max_epochs=max_epochs, save_freq=save_freq) if 'cpu' in device: os.makedirs('figures', exist_ok=True) path = f'figures/{model_name}' utils.plot_curves(train_loss, val_loss, train_acc, val_acc, path) utils.save_pkl_data(train_loss, 'train_loss.p', data_dir=model_dir) utils.save_pkl_data(val_loss, 'val_loss.p', data_dir=model_dir) utils.save_pkl_data(train_acc, 'train_acc.p', data_dir=model_dir) utils.save_pkl_data(val_acc, 'val_acc.p', data_dir=model_dir)
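# Hedged usage example: one way this entry point might be invoked for the RNN
# variant. All values below are illustrative assumptions, not defaults taken
# from the source.
import torch

if __name__ == '__main__':
    main(premise_hidden_size=256, hypo_hidden_size=256, linear_hidden_size=512,
         interaction_type='cat',
         device='cuda' if torch.cuda.is_available() else 'cpu',
         kind='rnn', num_layers=1, bidirectional=True, lr=1e-4, test=False)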
model.load_state_dict(model_dict['best']) else: model.load_state_dict(model_dict['last'] if 'last' in model_dict else model_dict) opt = torch.optim.SGD(model.parameters(), lr=0) # needed for backprop only if half_prec: model, opt = amp.initialize(model, opt, opt_level="O1") utils.model_eval(model, half_prec) eps, pgd_alpha, pgd_alpha_rr = eps / 255, pgd_alpha / 255, pgd_alpha_rr / 255 eval_batches_all = data.get_loaders( args.dataset, -1, args.batch_size_eval, train_set=True if args.set == 'train' else False, shuffle=False, data_augm=False) eval_batches = data.get_loaders( args.dataset, n_eval, args.batch_size_eval, train_set=True if args.set == 'train' else False, shuffle=False, data_augm=False) time_start = time.time() acc_clean, loss_clean, _ = rob_acc(eval_batches, model, 0, 0, opt, half_prec, 0, 1) print('acc={:.2%}, loss={:.3f}'.format(acc_clean, loss_clean))
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='./data/', help='path to datasets') parser.add_argument('--data_name', default='precomp', help='{coco, f30k}_precomp') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocablulary json files') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=64, type=int, help='Number of training epochs.') parser.add_argument('--word_embed_size', default=300, type=int, help='Dimensionality of the word embedding') parser.add_argument('--hidden_size', default=1024, type=int, help='Dimensionality of the joint embedding') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=0.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log') parser.add_argument('--val_step', default=1000, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='./runs/runX/log', help='Path to save Tensorboard log.') parser.add_argument('--model_name', default='./runs/runX/checkpoint', help='Path to save the model') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default:none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss') parser.add_argument('--img_size', default=2048, type=int, help='Dimensionality of the image embedding') parser.add_argument('--norm', action='store_true', help='normalize the text and image embedding') parser.add_argument( '--norm_func', default='clipped_l2norm', help='clipped_leaky_l2norm|clipped_l2norm|l2norm|' 'clipped_leaky_l1norm|clipped_l1norm|l1norm|no_norm|softmax') parser.add_argument('--agg_func', default='Mean', help='LogSumExp|Mean|Max|Sum') parser.add_argument('--bi_gru', action='store_true', help='Use bidirectional GRU.') parser.add_argument('--lambda_lse', default=6., type=float, help='LogSumExp temp') parser.add_argument('--lambda_softmax', default=9., type=float, help='Attention softmax temperature') parser.add_argument( '--activation_func', default='relu', help='activation function: relu|gelu|no_activation_fun') parser.add_argument( '--alpha', default=0.5, type=float, help='the weight of final score between i2t score and t2i score') parser.add_argument('--use_abs', action='store_true', help='take the absolute value of embedding vectors') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # load Vocabulary Wrapper vocab = deserialize_vocab( os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name)) opt.vocab_size = len(vocab) # load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.batch_size, opt.workers, opt) # Construct the model model = CARRN(opt) best_rsum = 0 start_epoch = 0 if opt.resume: if os.path.isfile(opt.resume): 
print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] + 1 best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model for epoch in range(start_epoch, opt.num_epochs): print(opt.logger_name) print(opt.model_name) adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) if not os.path.exists(opt.model_name): os.mkdir(opt.model_name) save_checkpoint( { 'epoch': epoch, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, filename='checkpoint_{}.pth.tar'.format(epoch), prefix=opt.model_name + '/')
def main(): args = parser.parse_args() if args.seed is None: args.seed = random.randint(1, 10000) print("Random Seed: ", args.seed) random.seed(args.seed) torch.manual_seed(args.seed) if args.gpus: torch.cuda.manual_seed_all(args.seed) time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') if args.evaluate: args.results_dir = '/tmp' if args.save is '': args.save = time_stamp save_path = os.path.join(args.results_dir, args.save) if not os.path.exists(save_path): os.makedirs(save_path) if args.gpus is not None: args.gpus = [int(i) for i in args.gpus.split(',')] device = 'cuda:' + str(args.gpus[0]) cudnn.benchmark = True else: device = 'cpu' if args.type == 'float64': dtype = torch.float64 elif args.type == 'float32': dtype = torch.float32 elif args.type == 'float16': dtype = torch.float16 else: raise ValueError('Wrong type!') # TODO int8 model = ShuffleNetV2(scale=args.scaling, c_tag=args.c_tag, SE=args.SE, residual=args.residual, groups=args.groups) num_parameters = sum([l.nelement() for l in model.parameters()]) print(model) print('number of parameters: {}'.format(num_parameters)) print('FLOPs: {}'.format( flops_benchmark.count_flops( ShuffleNetV2, args.batch_size // len(args.gpus) if args.gpus is not None else args.batch_size, device, dtype, args.input_size, 3, args.scaling, 3, args.c_tag, 1000, torch.nn.ReLU, args.SE, args.residual, args.groups))) train_loader, val_loader = get_loaders(args.dataroot, args.batch_size, args.batch_size, args.input_size, args.workers) # define loss function (criterion) and optimizer criterion = torch.nn.CrossEntropyLoss() if args.gpus is not None: model = torch.nn.DataParallel(model, args.gpus) model.to(device=device, dtype=dtype) criterion.to(device=device, dtype=dtype) optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.decay, nesterov=True) if args.find_clr: find_bounds_clr(model, train_loader, optimizer, criterion, device, dtype, min_lr=args.min_lr, max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode, save_path=save_path) return if args.clr: scheduler = CyclicLR(optimizer, base_lr=args.min_lr, max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode) else: scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma) best_test = 0 # optionally resume from a checkpoint data = None if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=device) args.start_epoch = checkpoint['epoch'] - 1 best_test = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) elif os.path.isdir(args.resume): checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar') csv_path = os.path.join(args.resume, 'results.csv') print("=> loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path, map_location=device) args.start_epoch = checkpoint['epoch'] - 1 best_test = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_path, checkpoint['epoch'])) data = [] with open(csv_path) as csvfile: reader = csv.DictReader(csvfile) for row in reader: data.append(row) else: print("=> no checkpoint found at '{}'".format(args.resume)) if 
args.evaluate: loss, top1, top5 = test(model, val_loader, criterion, device, dtype) # TODO return csv_logger = CsvLogger(filepath=save_path, data=data) csv_logger.save_params(sys.argv, args) claimed_acc1 = None claimed_acc5 = None if args.SE in claimed_acc_top1: if args.scaling in claimed_acc_top1[args.SE]: claimed_acc1 = 1 - claimed_acc_top1[args.SE][args.scaling] csv_logger.write_text('Claimed accuracy is {:.2f}% top-1'.format( claimed_acc1 * 100.)) train_network(args.start_epoch, args.epochs, scheduler, model, train_loader, val_loader, optimizer, criterion, device, dtype, args.batch_size, args.log_interval, csv_logger, save_path, claimed_acc1, claimed_acc5, best_test)
def run_experiment(xp, xp_count, n_experiments): t0 = time.time() print(xp) hp = xp.hyperparameters num_classes = {"cifar10" : 10, "cifar100" : 100}[hp["dataset"]] model_names = [model_name for model_name, k in hp["models"].items() for _ in range(k)] optimizer, optimizer_hp = getattr(torch.optim, hp["local_optimizer"][0]), hp["local_optimizer"][1] optimizer_fn = lambda x : optimizer(x, **{k : hp[k] if k in hp else v for k, v in optimizer_hp.items()}) distill_optimizer, distill_optimizer_hp = getattr(torch.optim, hp["distill_optimizer"][0]), hp["distill_optimizer"][1] distill_optimizer_fn = lambda x : distill_optimizer(x, **{k : hp[k] if k in hp else v for k, v in distill_optimizer_hp.items()}) train_data, test_data = data.get_data(hp["dataset"], args.DATA_PATH) all_distill_data = data.get_data(hp["distill_dataset"], args.DATA_PATH) np.random.seed(hp["random_seed"]) torch.manual_seed(hp["random_seed"]) n_distill = int(hp["n_distill_frac"] * len(all_distill_data)) distill_data = data.IdxSubset(all_distill_data, np.random.permutation(len(all_distill_data))[:n_distill], return_index=True) public_data = data.IdxSubset(all_distill_data, np.random.permutation(len(all_distill_data))[n_distill:len(all_distill_data)], return_index=False) print(len(distill_data), len(public_data)) client_loaders, test_loader = data.get_loaders(train_data, test_data, n_clients=len(model_names), alpha=hp["alpha"], batch_size=hp["batch_size"], n_data=None, num_workers=0, seed=hp["random_seed"]) distill_loader = torch.utils.data.DataLoader(distill_data, batch_size=hp["distill_batch_size"], shuffle=True, num_workers=8) public_loader = torch.utils.data.DataLoader(public_data, batch_size=128, shuffle=True, num_workers=8) clients = [Client(model_name, optimizer_fn, loader, idnum=i, num_classes=num_classes) for i, (loader, model_name) in enumerate(zip(client_loaders, model_names))] server = Server(np.unique(model_names), distill_optimizer_fn, test_loader, distill_loader, num_classes=num_classes) for client in clients: client.public_loader = public_loader client.distill_loader = distill_loader # print model models.print_model(clients[0].model) if "P" in hp["aggregation_mode"] or hp["aggregation_mode"] == "FedAUX": for model_name, model in server.model_dict.items(): pretrained = hp["pretrained"] if hp["pretrained"] else "{}_{}.pth".format(model_name, hp["distill_dataset"]) loaded_state = torch.load(args.CHECKPOINT_PATH + pretrained, map_location='cpu') loaded_layers = [key for key in loaded_state if key in model.state_dict()] model.load_state_dict(loaded_state, strict=False) for client in clients: client.synchronize_with_server(server) print("Successfully loaded layers {} from".format(loaded_layers), pretrained) if hp["aggregation_mode"] == "FedAUX": print("Computing Scores...") for client in clients: client.scores = client.extract_features_and_compute_scores(client.loader, public_loader, distill_loader, lambda_reg=hp["lambda_reg_score"], eps_delt=hp["eps_delt"]) if hp["save_scores"]: xp.log({"client_{}_scores".format(client.id) : client.scores.detach().cpu().numpy()}) if "L" in hp["aggregation_mode"]: for client in clients: for name, param in client.model.named_parameters(): if "classification_layer" not in name: param.requires_grad = False for model in server.model_dict.values(): for name, param in model.named_parameters(): if "classification_layer" not in name: param.requires_grad = False # Start Distributed Training Process print("Start Distributed Training..\n") t1 = time.time() xp.log({"prep_time" : t1-t0}) 
xp.log({"server_val_{}".format(key) : value for key, value in server.evaluate_ensemble().items()}) for c_round in range(1, hp["communication_rounds"]+1): participating_clients = server.select_clients(clients, hp["participation_rate"]) xp.log({"participating_clients" : np.array([c.id for c in participating_clients])}) for client in participating_clients: client.synchronize_with_server(server) train_stats = client.compute_weight_update(hp["local_epochs"], lambda_fedprox=hp["lambda_fedprox"] if "PROX" in hp["aggregation_mode"] else 0.0) print(train_stats) # Averaging server.aggregate_weight_updates(participating_clients) avg_stats = server.evaluate_ensemble() xp.log({"averaging_{}".format(key) : value for key, value in avg_stats.items()}) if hp["aggregation_mode"] in ["FedDF", "FedAUX", "FedDF+P"]: distill_mode = "weighted_logits_precomputed" if hp["aggregation_mode"]=="FedAUX" else "mean_logits" distill_stats = server.distill(participating_clients, hp["distill_epochs"], mode=distill_mode, num_classes=num_classes) xp.log({"distill_{}".format(key) : value for key, value in distill_stats.items()}) # Logging if xp.is_log_round(c_round): print("Experiment: {} ({}/{})".format(args.schedule, xp_count+1, n_experiments)) xp.log({'communication_round' : c_round, 'epochs' : c_round*hp['local_epochs']}) xp.log({key : clients[0].optimizer.__dict__['param_groups'][0][key] for key in optimizer_hp}) # Evaluate xp.log({"server_val_{}".format(key) : value for key, value in server.evaluate_ensemble().items()}) xp.log({"epoch_time" : (time.time()-t1)/c_round}) # Save results to Disk try: xp.save_to_disc(path=args.RESULTS_PATH, name=hp['log_path']) except: print("Saving results Failed!") # Timing e = int((time.time()-t1)/c_round*(hp['communication_rounds']-c_round)) print("Remaining Time (approx.):", '{:02d}:{:02d}:{:02d}'.format(e // 3600, (e % 3600 // 60), e % 60), "[{:.2f}%]\n".format(c_round/hp['communication_rounds']*100)) # Save model to disk server.save_model(path=args.CHECKPOINT_PATH, name=hp["save_model"]) # Delete objects to free up GPU memory del server; clients.clear() torch.cuda.empty_cache()
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='/A/VSE/data/', help='path to datasets') parser.add_argument( '--data_name', default='resnet152_precomp', help='{coco,f8k,f30k,10crop,irv2,resnet152}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.05, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument( '--embed_size', default=1024, type=int, help= 'Dimensionality of the joint embedding. [NOTE: this is used only if <embed_size> differs from <gru_units>]' ) parser.add_argument('--gru_units', default=1024, type=int, help='Number of GRU neurons.') parser.add_argument('--grad_clip', default=1., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.001, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='runs/runX', help='Path to save the model and Tensorboard log.') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--cnn_type', default='vgg19', help="""The CNN used for image encoder (e.g. vgg19, resnet152)""") parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument( '--test_measure', default=None, help= 'Similarity used for retrieval (None<same used for training>|cosine|order)' ) parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--text_encoder', default='seam-e', choices=text_encoders.text_encoders_alias.keys()) parser.add_argument( '--att_units', default=300, type=int, help= 'Number of tanh neurons. When using --att_dim=None we apply a tanh directly to the att input. 
' ) parser.add_argument('--att_hops', default=30, type=int, help='Number of attention hops (viewpoints).') parser.add_argument( '--att_coef', default=0., type=float, help='Influence of Frobenius divergence in the loss function.') opt = parser.parse_args() if opt.test_measure is None: opt.test_measure = opt.measure print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) tokenizer, vocab_size = data.get_tokenizer(opt.vocab_path, opt.data_name) opt.vocab_size = vocab_size collate_fn = 'collate_fn' # Load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, tokenizer, opt.crop_size, opt.batch_size, opt.workers, opt, collate_fn) # Construct the model model = VSE(opt) print(model.txt_enc) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model best_rsum = 0 for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) save_checkpoint( { 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, prefix=opt.logger_name + '/')
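# Hedged sketch only: the --att_units / --att_hops / --att_coef flags above suggest a
# structured self-attention text encoder in the style of Lin et al. (2017), where the GRU
# states are pooled through att_hops attention heads and a Frobenius penalty on A*A^T - I
# (weighted by att_coef in the loss) discourages redundant hops. This class is an
# illustrative stand-in, not the repository's text_encoders implementation.
import torch
import torch.nn as nn

class StructuredSelfAttention(nn.Module):
    def __init__(self, hidden_dim, att_units=300, att_hops=30):
        super().__init__()
        self.ws1 = nn.Linear(hidden_dim, att_units, bias=False)
        self.ws2 = nn.Linear(att_units, att_hops, bias=False)

    def forward(self, H):                      # H: (batch, seq_len, hidden_dim) GRU states
        A = torch.softmax(self.ws2(torch.tanh(self.ws1(H))), dim=1)  # attention over positions
        A = A.transpose(1, 2)                  # (batch, hops, seq_len)
        M = torch.bmm(A, H)                    # (batch, hops, hidden_dim) pooled sentence matrix
        # Frobenius-norm penalty encouraging the hops to attend to different positions
        eye = torch.eye(A.size(1), device=A.device).unsqueeze(0)
        penalty = ((torch.bmm(A, A.transpose(1, 2)) - eye) ** 2).sum(dim=(1, 2)).mean()
        return M, penalty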
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='/data', help='path to datasets') parser.add_argument('--data_name', default='precomp', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=2048, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='runs/runX', help='Path to save the model and Tensorboard log.') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--cnn_type', default='vgg19', help="""The CNN used for image encoder (e.g. vgg19, resnet152)""") parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--reset_train', action='store_true', help='Ensure the training is always done in ' 'train mode (Not recommended).') ### AM Parameters parser.add_argument('--text_number', default=15, type=int, help='Number of ocr tokens used (max. 20).') parser.add_argument('--text_dim', default=300, type=int, help='Dimension of scene text embedding - default 300') ###caption parameters parser.add_argument( '--dim_vid', type=int, default=2048, help='dim of features of video frames') parser.add_argument( '--dim_hidden', type=int, default=512, help='size of the rnn hidden layer') parser.add_argument( "--bidirectional", type=int, default=0, help="0 for disable, 1 for enable. 
encoder/decoder bidirectional.") parser.add_argument( '--input_dropout_p', type=float, default=0.2, help='strength of dropout in the Language Model RNN') parser.add_argument( '--rnn_type', type=str, default='gru', help='lstm or gru') parser.add_argument( '--rnn_dropout_p', type=float, default=0.5, help='strength of dropout in the Language Model RNN') parser.add_argument( '--dim_word', type=int, default=300, # 512 help='the encoding size of each token in the vocabulary, and the video.' ) parser.add_argument( "--max_len", type=int, default=60, help='max length of captions(containing <sos>,<eos>)') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = pickle.load(open(os.path.join( opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) opt.vocab_size = len(vocab) # Load data loaders train_loader, val_loader = data.get_loaders( opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt) # Construct the model model = VSRN(opt) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})" .format(opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model best_rsum = 0 for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch best_rsum = train(opt, train_loader, model, epoch, val_loader, best_rsum) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) save_checkpoint({ 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, prefix=opt.logger_name + '/')
def main(opt): logging.basicConfig(format='%(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = pickle.load( open(os.path.join(opt.vocab_path, 'vg_c_precomp_vocab.pkl'), 'rb')) opt.vocab_size = len(vocab) print(opt.vocab_size) # Load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt) # Construct the model model = XRN(opt) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model best_rsum = 0 for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum, d_i2t = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum if is_best: np.save('d_i2t.npy', d_i2t) best_rsum = max(rsum, best_rsum) save_checkpoint( { 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, prefix=opt.logger_name + '_' + opt.model_name + '/')
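# Hedged sketch: every training loop in this file calls adjust_learning_rate(opt,
# optimizer, epoch) together with an --lr_update flag. In VSE++-style code this typically
# decays the initial learning rate by a factor of 10 every lr_update epochs; the helper
# below is a minimal illustration of that schedule, not necessarily the exact project code.
def adjust_learning_rate(opt, optimizer, epoch):
    """Set the LR to the initial rate decayed by 10 every opt.lr_update epochs."""
    lr = opt.learning_rate * (0.1 ** (epoch // opt.lr_update))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr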
def main(): args = parser.parse_args() if args.seed is None: args.seed = random.randint(1, 10000) print("Random Seed: ", args.seed) random.seed(args.seed) torch.manual_seed(args.seed) if args.gpus: torch.cuda.manual_seed_all(args.seed) time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') if args.evaluate: args.results_dir = '/tmp' if args.save is '': args.save = 'mar10_224_' + time_stamp save_path = os.path.join(args.results_dir, args.save) if not os.path.exists(save_path): os.makedirs(save_path) if args.gpus is not None: args.gpus = [int(i) for i in args.gpus.split(',')] device = 'cuda:' + str(args.gpus[0]) cudnn.benchmark = True else: device = 'cpu' if args.type == 'float64': dtype = torch.float64 elif args.type == 'float32': dtype = torch.float32 elif args.type == 'float16': dtype = torch.float16 else: raise ValueError('Wrong type!') # TODO int8 model = STN_MobileNet2(input_size=args.input_size, scale=args.scaling, shearing=args.shearing) # print(model.stnmod.fc_loc[0].bias.data) num_parameters = sum([l.nelement() for l in model.parameters()]) print(model) print('number of parameters: {}'.format(num_parameters)) print('FLOPs: {}'.format( flops_benchmark.count_flops( STN_MobileNet2, args.batch_size // len(args.gpus) if args.gpus is not None else args.batch_size, device, dtype, args.input_size, 3, args.scaling))) train_loader, val_loader, test_loader = get_loaders( args.dataroot, args.batch_size, args.batch_size, args.input_size, args.workers, args.b_weights) # define loss function (criterion) and optimizer criterion = torch.nn.CrossEntropyLoss() L1_criterion = torch.nn.L1Loss() PW_criterion = torch.nn.CosineSimilarity(dim=2, eps=1e-6) if args.gpus is not None: model = torch.nn.DataParallel(model, args.gpus) model.to(device=device, dtype=dtype) criterion.to(device=device, dtype=dtype) L1_criterion.to(device=device, dtype=dtype) PW_criterion.to(device=device, dtype=dtype) optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.decay, nesterov=True) if args.find_clr: find_bounds_clr(model, train_loader, optimizer, PW_criterion, device, dtype, min_lr=args.min_lr, max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode, save_path=save_path) return if args.clr: print('Use CLR') scheduler = CyclicLR(optimizer, base_lr=args.min_lr, max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode) else: print('Use scheduler') scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma) best_val = 500 # optionally resume from a checkpoint data = None if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=device) # args.start_epoch = checkpoint['epoch'] - 1 # best_val = checkpoint['best_prec1'] # best_test = checkpoint['best_prec1'] args.start_epoch = 0 best_val = 500 state_dict = checkpoint['state_dict'] # if weights from imagenet new_state_dict = OrderedDict() for k, v in state_dict.items(): # print(k, v.size()) name = k if k == 'module.fc.bias': new_state_dict[name] = torch.zeros(101) continue elif k == 'module.fc.weight': new_state_dict[name] = torch.ones(101, 1280) continue else: print('else:', name) new_state_dict[name] = v model.load_state_dict(new_state_dict, strict=False) # optimizer.load_state_dict(checkpoint['optimizer'], strict=False) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) elif os.path.isdir(args.resume): 
checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar') csv_path = os.path.join(args.resume, 'results.csv') print("=> loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path, map_location=device) args.start_epoch = checkpoint['epoch'] - 1 best_val = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_path, checkpoint['epoch'])) data = [] with open(csv_path) as csvfile: reader = csv.DictReader(csvfile) for row in reader: data.append(row) else: print("=> no checkpoint found at '{}'".format(args.resume)) if args.evaluate: loss, test_mae = test(model, predefined_points, 0, test_loader, PW_criterion, device, dtype) # TODO return csv_logger = CsvLogger(filepath=save_path, data=data) csv_logger.save_params(sys.argv, args) # claimed_acc1 = None # claimed_acc5 = None # if args.input_size in claimed_acc_top1: # if args.scaling in claimed_acc_top1[args.input_size]: # claimed_acc1 = claimed_acc_top1[args.input_size][args.scaling] # claimed_acc5 = claimed_acc_top5[args.input_size][args.scaling] # csv_logger.write_text( # 'Claimed accuracies are: {:.2f}% top-1, {:.2f}% top-5'.format(claimed_acc1 * 100., claimed_acc5 * 100.)) train_network(args.start_epoch, args.epochs, scheduler, model, predefined_points, train_loader, val_loader, test_loader, optimizer, PW_criterion, device, dtype, args.batch_size, args.log_interval, csv_logger, save_path, best_val)
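# Hedged sketch of the cyclical learning-rate idea behind the CyclicLR / find_bounds_clr
# calls above (Smith, 2017): the LR oscillates between base_lr and max_lr over
# 2 * step_size iterations. This helper only computes the triangular LR for a given
# iteration; the scheduler actually used in this codebase may differ in details.
import math

def triangular_lr(iteration, base_lr, max_lr, step_size, mode='triangular'):
    cycle = math.floor(1 + iteration / (2 * step_size))
    x = abs(iteration / step_size - 2 * cycle + 1)
    # 'triangular2' halves the amplitude every cycle; plain 'triangular' keeps it constant
    scale = 1.0 if mode == 'triangular' else 1.0 / (2 ** (cycle - 1))
    return base_lr + (max_lr - base_lr) * max(0.0, 1 - x) * scale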
# TODO: data parallel + normalisation + hidden sizes ? + flatten direct # Ideas: binaryce + import models import data import torch import os from tqdm import tqdm import wandb import numpy as np import matplotlib.pyplot as plt from utils import compute_accuracy, visualize_noise from setup import (BATCH_SIZE, MODEL, DEVICE, LEARNING_RATE, NB_EPOCHS, FILE_NAME, DROPOUT, WEIGHTS_INIT, ALPHA, KEEP_WEIGHTS, SIGNAL_LENGTH, SUBSAMPLE) train_loader, test_loader = data.get_loaders(BATCH_SIZE) model = models.DeNoiser(dropout=DROPOUT) if WEIGHTS_INIT: model.apply(models.init_weights) model.to(DEVICE) wandb.watch(model) if KEEP_WEIGHTS and os.path.exists(f"weights/{FILE_NAME}.pt"): print("Weights found") model.load_state_dict(torch.load(f"weights/{FILE_NAME}.pt")) else: os.makedirs("weights/", exist_ok=True) print("Weights not found")
import sys import torch import utils import models import data import train_helpers model_dir = sys.argv[1] hidden_size = int(sys.argv[2]) interaction_type = sys.argv[3] kind = sys.argv[4] epoch = sys.argv[5] batch_ix = sys.argv[6] batch_size = 32 ind2vec = utils.load_pkl_data('ind2vec.p', data_dir='vocab') _, val_loader = data.get_loaders(batch_size, data_dir='hw2_data') loss_fn = torch.nn.CrossEntropyLoss() fmodel = f'epoch_{epoch}_batch_{batch_ix}.pt' print('model: ' + fmodel) model = models.SNLI_Model(ind2vec, 300, hidden_size, hidden_size, 80, interaction_type, 'cpu', kind, num_layers=1, bidirectional=True, kernel_size=3) model.load_state_dict(torch.load(f'{model_dir}/{fmodel}'))
def main(): args = get_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) cur_timestamp = str(datetime.now())[:-3] # we also include ms to prevent the probability of name collision model_width = {'linear': '', 'cnn': args.n_filters_cnn, 'lenet': '', 'resnet18': ''}[args.model] model_str = '{}{}'.format(args.model, model_width) model_name = '{} dataset={} model={} eps={} attack={} m={} attack_init={} fgsm_alpha={} epochs={} pgd={}-{} grad_align_cos_lambda={} lr_max={} seed={}'.format( cur_timestamp, args.dataset, model_str, args.eps, args.attack, args.minibatch_replay, args.attack_init, args.fgsm_alpha, args.epochs, args.pgd_alpha_train, args.pgd_train_n_iters, args.grad_align_cos_lambda, args.lr_max, args.seed) if not os.path.exists('models'): os.makedirs('models') logger = utils.configure_logger(model_name, args.debug) logger.info(args) half_prec = args.half_prec n_cls = 2 if 'binary' in args.dataset else 10 np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) double_bp = True if args.grad_align_cos_lambda > 0 else False n_eval_every_k_iter = args.n_eval_every_k_iter args.pgd_alpha = args.eps / 4 eps, pgd_alpha, pgd_alpha_train = args.eps / 255, args.pgd_alpha / 255, args.pgd_alpha_train / 255 train_data_augm = False if args.dataset in ['mnist'] else True train_batches = data.get_loaders(args.dataset, -1, args.batch_size, train_set=True, shuffle=True, data_augm=train_data_augm) train_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size, train_set=True, shuffle=False, data_augm=False) test_batches = data.get_loaders(args.dataset, args.n_final_eval, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False) test_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False) model = models.get_model(args.model, n_cls, half_prec, data.shapes_dict[args.dataset], args.n_filters_cnn).cuda() model.apply(utils.initialize_weights) model.train() if args.model == 'resnet18': opt = torch.optim.SGD(model.parameters(), lr=args.lr_max, momentum=0.9, weight_decay=args.weight_decay) elif args.model == 'cnn': opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay) elif args.model == 'lenet': opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay) else: raise ValueError('decide about the right optimizer for the new model') if half_prec: if double_bp: amp.register_float_function(torch, 'batch_norm') model, opt = amp.initialize(model, opt, opt_level="O1") if args.attack == 'fgsm': # needed here only for Free-AT delta = torch.zeros(args.batch_size, *data.shapes_dict[args.dataset][1:]).cuda() delta.requires_grad = True lr_schedule = utils.get_lr_schedule(args.lr_schedule, args.epochs, args.lr_max) loss_function = nn.CrossEntropyLoss() train_acc_pgd_best, best_state_dict = 0.0, copy.deepcopy(model.state_dict()) start_time = time.time() time_train, iteration, best_iteration = 0, 0, 0 for epoch in range(args.epochs + 1): train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0 for i, (X, y) in enumerate(train_batches): if i % args.minibatch_replay != 0 and i > 0: # take new inputs only each `minibatch_replay` iterations X, y = X_prev, y_prev time_start_iter = time.time() # epoch=0 runs only for one iteration (to check the training stats at init) if epoch == 0 and i > 0: break X, y = X.cuda(), y.cuda() lr = lr_schedule(epoch - 1 + (i + 1) / len(train_batches)) 
# epoch - 1 since the 0th epoch is skipped opt.param_groups[0].update(lr=lr) if args.attack in ['pgd', 'pgd_corner']: pgd_rs = True if args.attack_init == 'random' else False n_eps_warmup_epochs = 5 n_iterations_max_eps = n_eps_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size eps_pgd_train = min(iteration / n_iterations_max_eps * eps, eps) if args.dataset == 'svhn' else eps delta = utils.attack_pgd_training( model, X, y, eps_pgd_train, pgd_alpha_train, opt, half_prec, args.pgd_train_n_iters, rs=pgd_rs) if args.attack == 'pgd_corner': delta = eps * utils.sign(delta) # project to the corners delta = clamp(X + delta, 0, 1) - X elif args.attack == 'fgsm': if args.minibatch_replay == 1: if args.attack_init == 'zero': delta = torch.zeros_like(X, requires_grad=True) elif args.attack_init == 'random': delta = utils.get_uniform_delta(X.shape, eps, requires_grad=True) else: raise ValueError('wrong args.attack_init') else: # if Free-AT, we just reuse the existing delta from the previous iteration delta.requires_grad = True X_adv = clamp(X + delta, 0, 1) output = model(X_adv) loss = F.cross_entropy(output, y) if half_prec: with amp.scale_loss(loss, opt) as scaled_loss: grad = torch.autograd.grad(scaled_loss, delta, create_graph=True if double_bp else False)[0] grad /= scaled_loss / loss # reverse back the scaling else: grad = torch.autograd.grad(loss, delta, create_graph=True if double_bp else False)[0] grad = grad.detach() argmax_delta = eps * utils.sign(grad) n_alpha_warmup_epochs = 5 n_iterations_max_alpha = n_alpha_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size fgsm_alpha = min(iteration / n_iterations_max_alpha * args.fgsm_alpha, args.fgsm_alpha) if args.dataset == 'svhn' else args.fgsm_alpha delta.data = clamp(delta.data + fgsm_alpha * argmax_delta, -eps, eps) delta.data = clamp(X + delta.data, 0, 1) - X elif args.attack == 'random_corner': delta = utils.get_uniform_delta(X.shape, eps, requires_grad=False) delta = eps * utils.sign(delta) elif args.attack == 'none': delta = torch.zeros_like(X, requires_grad=False) else: raise ValueError('wrong args.attack') # extra FP+BP to calculate the gradient to monitor it if args.attack in ['none', 'random_corner', 'pgd', 'pgd_corner']: grad = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='none', backprop=args.grad_align_cos_lambda != 0.0) delta = delta.detach() output = model(X + delta) loss = loss_function(output, y) reg = torch.zeros(1).cuda()[0] # for .item() to run correctly if args.grad_align_cos_lambda != 0.0: grad2 = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='random_uniform', backprop=True) grads_nnz_idx = ((grad**2).sum([1, 2, 3])**0.5 != 0) * ((grad2**2).sum([1, 2, 3])**0.5 != 0) grad1, grad2 = grad[grads_nnz_idx], grad2[grads_nnz_idx] grad1_norms, grad2_norms = l2_norm_batch(grad1), l2_norm_batch(grad2) grad1_normalized = grad1 / grad1_norms[:, None, None, None] grad2_normalized = grad2 / grad2_norms[:, None, None, None] cos = torch.sum(grad1_normalized * grad2_normalized, (1, 2, 3)) reg += args.grad_align_cos_lambda * (1.0 - cos.mean()) loss += reg if epoch != 0: opt.zero_grad() utils.backward(loss, opt, half_prec) opt.step() time_train += time.time() - time_start_iter train_loss += loss.item() * y.size(0) train_reg += reg.item() * y.size(0) train_acc += (output.max(1)[1] == y).sum().item() train_n += y.size(0) with torch.no_grad(): # no grad for the stats grad_norm_x += l2_norm_batch(grad).sum().item() delta_final = clamp(X + delta, 0, 1) - X # we should measure delta 
after the projection onto [0, 1]^d avg_delta_l2 += ((delta_final ** 2).sum([1, 2, 3]) ** 0.5).sum().item() if iteration % args.eval_iter_freq == 0: train_loss, train_reg = train_loss / train_n, train_reg / train_n train_acc, avg_delta_l2 = train_acc / train_n, avg_delta_l2 / train_n # it'd be incorrect to recalculate the BN stats on the test sets and for clean / adversarial points utils.model_eval(model, half_prec) test_acc_clean, _, _ = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, 0, 1) test_acc_fgsm, test_loss_fgsm, fgsm_deltas = rob_acc(test_batches_fast, model, eps, eps, opt, half_prec, 1, 1, rs=False) test_acc_pgd, test_loss_pgd, pgd_deltas = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1) cos_fgsm_pgd = utils.avg_cos_np(fgsm_deltas, pgd_deltas) train_acc_pgd, _, _ = rob_acc(train_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1) # needed for early stopping grad_x = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=False) grad_eta = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=True) cos_x_eta = utils.avg_cos_np(grad_x, grad_eta) time_elapsed = time.time() - start_time train_str = '[train] loss {:.3f}, reg {:.3f}, acc {:.2%} acc_pgd {:.2%}'.format(train_loss, train_reg, train_acc, train_acc_pgd) test_str = '[test] acc_clean {:.2%}, acc_fgsm {:.2%}, acc_pgd {:.2%}, cos_x_eta {:.3}, cos_fgsm_pgd {:.3}'.format( test_acc_clean, test_acc_fgsm, test_acc_pgd, cos_x_eta, cos_fgsm_pgd) logger.info('{}-{}: {} {} ({:.2f}m, {:.2f}m)'.format(epoch, iteration, train_str, test_str, time_train/60, time_elapsed/60)) if train_acc_pgd > train_acc_pgd_best: # catastrophic overfitting can be detected on the training set best_state_dict = copy.deepcopy(model.state_dict()) train_acc_pgd_best, best_iteration = train_acc_pgd, iteration utils.model_train(model, half_prec) train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0 iteration += 1 X_prev, y_prev = X.clone(), y.clone() # needed for Free-AT if epoch == args.epochs: torch.save({'last': model.state_dict(), 'best': best_state_dict}, 'models/{} epoch={}.pth'.format(model_name, epoch)) # disable global conversion to fp16 from amp.initialize() (https://github.com/NVIDIA/apex/issues/567) context_manager = amp.disable_casts() if half_prec else utils.nullcontext() with context_manager: last_state_dict = copy.deepcopy(model.state_dict()) half_prec = False # final eval is always in fp32 model.load_state_dict(last_state_dict) utils.model_eval(model, half_prec) opt = torch.optim.SGD(model.parameters(), lr=0) attack_iters, n_restarts = (50, 10) if not args.debug else (10, 3) test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1) test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts) logger.info('[last: test on 10k points] acc_clean {:.2%}, pgd_rr {:.2%}'.format(test_acc_clean, test_acc_pgd_rr)) if args.eval_early_stopped_model: model.load_state_dict(best_state_dict) utils.model_eval(model, half_prec) test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1) test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts) logger.info('[best: test on 10k points][iter={}] acc_clean {:.2%}, pgd_rr {:.2%}'.format( best_iteration, test_acc_clean, test_acc_pgd_rr)) utils.model_train(model, half_prec) logger.info('Done in 
{:.2f}m'.format((time.time() - start_time) / 60))
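# Hedged sketch of the gradient-alignment regularizer computed inline above (GradAlign,
# Andriushchenko & Flammarion, 2020): take input gradients at the clean point and at a
# uniformly perturbed point, and penalize 1 - cos between them. The script itself reuses
# its get_input_grad / l2_norm_batch helpers; this standalone version assumes image
# inputs of shape (B, C, H, W) and a generic classifier `model`.
import torch
import torch.nn.functional as F

def grad_align_reg(model, X, y, eps):
    def input_grad(inp):
        inp = inp.clone().detach().requires_grad_(True)
        loss = F.cross_entropy(model(inp), y)
        return torch.autograd.grad(loss, inp, create_graph=True)[0]

    delta = torch.empty_like(X).uniform_(-eps, eps)      # random point inside the eps-ball
    g1 = input_grad(X)
    g2 = input_grad(X + delta)
    g1 = g1 / (g1.flatten(1).norm(dim=1).view(-1, 1, 1, 1) + 1e-12)
    g2 = g2 / (g2.flatten(1).norm(dim=1).view(-1, 1, 1, 1) + 1e-12)
    cos = (g1 * g2).flatten(1).sum(dim=1)                # per-example cosine similarity
    return 1.0 - cos.mean()                              # multiplied by grad_align_cos_lambda in the loss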
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='data', help='path to datasets') parser.add_argument('--data_name', default='f30k', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='vocab', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=2e-4, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=100, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='runs/test', help='Path to save the model and Tensorboard log.') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') parser.add_argument('--reset_train', action='store_true', help='Ensure the training is always done in ' 'train mode (Not recommended).') parser.add_argument('--K', default=2, type=int,help='num of JSR.') parser.add_argument('--feature_path', default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/trainval/', type=str, help='path to the pre-computed image features') parser.add_argument('--region_bbox_file', default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/flickr30k_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5', type=str, help='path to the region_bbox_file(.h5)') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = pickle.load(open(os.path.join( opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) opt.vocab_size = len(vocab) # Load data loaders train_loader, val_loader = data.get_loaders( opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt) # Construct the model model = VSE(opt) best_rsum = 0 # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] 
model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})" .format(opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) del checkpoint # Train the Model for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) save_checkpoint({ 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, epoch, prefix=opt.logger_name + '/')
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='/home/dcsaero01/data/datasets/vsepp/', help='path to datasets') parser.add_argument('--data_name', default='minicsdv2_precomp', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary pickle files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=300, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument( '--dropout_value', default=0, type=float, help='Probability value for dropout after linear layer') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='runs/runX', help='Path to save the model and Tensorboard log.') parser.add_argument( '--resume', default= '/home/dcsaero01/data/projects/vsepp/runs/minicsdv2/checkpoint.pth.tar', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=4096, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--text_dim', default=6000, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--cnn_type', default='vgg19', help="""The CNN used for image encoder (e.g. 
vgg19, resnet152)""") parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--reset_train', action='store_true', help='Ensure the training is always done in ' 'train mode (Not recommended).') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper if opt.data_name == 'coco_st_precomp' or opt.data_name == 'coco_st_ner_precomp': vocab = None opt.vocab_size = 0 else: vocab = pickle.load( open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb')) opt.vocab_size = len(vocab) # Load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt) # Construct the model model = VSE(opt) # optionally resume from a checkpoint if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) validate(opt, train_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) """
def run_experiment(xp, xp_count, n_experiments): print(xp) hp = xp.hyperparameters model_fn, optimizer, optimizer_hp = models.get_model(hp["net"]) optimizer_fn = lambda x: optimizer( x, **{k: hp[k] if k in hp else v for k, v in optimizer_hp.items()}) train_data, test_data = data.get_data(hp["dataset"], args.DATA_PATH) distill_data = data.get_data(hp["distill_dataset"], args.DATA_PATH) distill_data = torch.utils.data.Subset( distill_data, np.random.permutation(len(distill_data))[:hp["n_distill"]]) client_loaders, test_loader = data.get_loaders( train_data, test_data, n_clients=hp["n_clients"], classes_per_client=hp["classes_per_client"], batch_size=hp["batch_size"], n_data=None) distill_loader = torch.utils.data.DataLoader(distill_data, batch_size=128, shuffle=False) clients = [ Client(model_fn, optimizer_fn, loader) for loader in client_loaders ] server = Server(model_fn, lambda x: torch.optim.Adam(x, lr=0.001), test_loader, distill_loader) server.load_model(path=args.CHECKPOINT_PATH, name=hp["pretrained"]) # print model models.print_model(server.model) # Start Distributed Training Process print("Start Distributed Training..\n") t1 = time.time() for c_round in range(1, hp["communication_rounds"] + 1): participating_clients = server.select_clients(clients, hp["participation_rate"]) for client in tqdm(participating_clients): client.synchronize_with_server(server) train_stats = client.compute_weight_update(hp["local_epochs"]) if hp["aggregate"]: server.aggregate_weight_updates(participating_clients) if hp["use_distillation"]: server.distill(participating_clients, hp["distill_epochs"], compress=hp["compress"]) # Logging if xp.is_log_round(c_round): print("Experiment: {} ({}/{})".format(args.schedule, xp_count + 1, n_experiments)) xp.log({ 'communication_round': c_round, 'epochs': c_round * hp['local_epochs'] }) xp.log({ key: clients[0].optimizer.__dict__['param_groups'][0][key] for key in optimizer_hp }) # Evaluate xp.log({ "client_train_{}".format(key): value for key, value in train_stats.items() }) xp.log({ "server_val_{}".format(key): value for key, value in server.evaluate().items() }) # Save results to Disk try: xp.save_to_disc(path=args.RESULTS_PATH, name=hp['log_path']) except: print("Saving results Failed!") # Timing e = int((time.time() - t1) / c_round * (hp['communication_rounds'] - c_round)) print( "Remaining Time (approx.):", '{:02d}:{:02d}:{:02d}'.format(e // 3600, (e % 3600 // 60), e % 60), "[{:.2f}%]\n".format(c_round / hp['communication_rounds'] * 100)) # Save model to disk server.save_model(path=args.CHECKPOINT_PATH, name=hp["save_model"]) # Delete objects to free up GPU memory del server clients.clear() torch.cuda.empty_cache()
def main(): args = get_args() device, dtype = args.device, args.dtype train_loader, val_loader = get_loaders(args.dataroot, args.batch_size, args.batch_size, args.input_size, args.workers, args.world_size, args.local_rank) model = MnasNet(n_class=args.num_classes, width_mult=args.scaling, drop_prob=0.0, num_steps=len(train_loader) * args.epochs) num_parameters = sum([l.nelement() for l in model.parameters()]) flops = flops_benchmark.count_flops(MnasNet, 1, device, dtype, args.input_size, 3, width_mult=args.scaling) if not args.child: print(model) print('number of parameters: {}'.format(num_parameters)) print('FLOPs: {}'.format(flops)) # define loss function (criterion) and optimizer criterion = CrossEntropyLoss() mixup = Mixup(args.num_classes, args.mixup, args.smooth_eps) model, criterion = model.to(device=device, dtype=dtype), criterion.to(device=device, dtype=dtype) if args.dtype == torch.float16: for module in model.modules(): # FP batchnorm if is_bn(module): module.to(dtype=torch.float32) if args.distributed: args.device_ids = [args.local_rank] dist.init_process_group(backend=args.dist_backend, init_method=args.dist_init, world_size=args.world_size, rank=args.local_rank) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) print('Node #{}'.format(args.local_rank)) else: model = torch.nn.parallel.DataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) optimizer_class = torch.optim.SGD optimizer_params = { "lr": args.learning_rate, "momentum": args.momentum, "weight_decay": args.decay, "nesterov": True } if args.find_clr: optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.decay, nesterov=True) find_bounds_clr(model, train_loader, optimizer, criterion, device, dtype, min_lr=args.min_lr, max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode, save_path=args.save_path) return if args.sched == 'clr': scheduler_class = CyclicLR scheduler_params = { "base_lr": args.min_lr, "max_lr": args.max_lr, "step_size": args.epochs_per_step * len(train_loader), "mode": args.mode } elif args.sched == 'multistep': scheduler_class = MultiStepLR scheduler_params = {"milestones": args.schedule, "gamma": args.gamma} elif args.sched == 'cosine': scheduler_class = CosineLR scheduler_params = { "max_epochs": args.epochs, "warmup_epochs": args.warmup, "iter_in_epoch": len(train_loader) } elif args.sched == 'gamma': scheduler_class = StepLR scheduler_params = {"step_size": 30, "gamma": args.gamma} else: raise ValueError('Wrong scheduler!') optim = OptimizerWrapper(model, optimizer_class=optimizer_class, optimizer_params=optimizer_params, scheduler_class=scheduler_class, scheduler_params=scheduler_params, use_shadow_weights=args.dtype == torch.float16) best_test = 0 # optionally resume from a checkpoint data = None if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=device) args.start_epoch = checkpoint['epoch'] - 1 best_test = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optim.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) elif os.path.isdir(args.resume): checkpoint_path = os.path.join( args.resume, 'checkpoint{}.pth.tar'.format(args.local_rank)) csv_path = os.path.join(args.resume, 'results{}.csv'.format(args.local_rank)) 
print("=> loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path, map_location=device) args.start_epoch = checkpoint['epoch'] - 1 best_test = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optim.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_path, checkpoint['epoch'])) data = [] with open(csv_path) as csvfile: reader = csv.DictReader(csvfile) for row in reader: data.append(row) else: print("=> no checkpoint found at '{}'".format(args.resume)) if args.evaluate: loss, top1, top5 = test(model, val_loader, criterion, device, dtype, args.child) # TODO return csv_logger = CsvLogger(filepath=args.save_path, data=data, local_rank=args.local_rank) csv_logger.save_params(sys.argv, args) claimed_acc1 = None claimed_acc5 = None if args.input_size in claimed_acc_top1: if args.scaling in claimed_acc_top1[args.input_size]: claimed_acc1 = claimed_acc_top1[args.input_size][args.scaling] if not args.child: csv_logger.write_text( 'Claimed accuracy is {:.2f}% top-1'.format(claimed_acc1 * 100.)) train_network(args.start_epoch, args.epochs, optim, model, train_loader, val_loader, criterion, mixup, device, dtype, args.batch_size, args.log_interval, csv_logger, args.save_path, claimed_acc1, claimed_acc5, best_test, args.local_rank, args.child)
log_interval = 100 mode = 'triangular2' evaluate = 'false' model = MobileNet2(input_size=input_size, scale=scaling) num_parameters = sum([l.nelement() for l in model.parameters()]) print(model) print('number of parameters: {}'.format(num_parameters)) print('FLOPs: {}'.format( flops_benchmark.count_flops(MobileNet2, batch_size // len(gpus) if gpus is not None else batch_size, device, dtype, input_size, 3, scaling))) train_loader, val_loader = get_loaders(dataroot, batch_size, batch_size, input_size, workers) # define loss function (criterion) and optimizer criterion = torch.nn.CrossEntropyLoss() if gpus is not None: model = torch.nn.DataParallel(model, gpus) model.to(device=device, dtype=dtype) criterion.to(device=device, dtype=dtype) optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, weight_decay=decay, nesterov=True) if args.find_clr: find_bounds_clr(model, train_loader, optimizer, criterion, device, dtype, min_lr=min_lr, max_lr=max_lr, step_size=epochs_per_step * len(train_loader), mode=mode, save_path=save_path) return
def main(): # Hyper Parameters opt = opts.parse_opt() device_id = opt.gpuid device_count = len(str(device_id).split(",")) #assert device_count == 1 or device_count == 2 print("use GPU:", device_id, "GPUs_count", device_count, flush=True) os.environ['CUDA_VISIBLE_DEVICES']=str(device_id) device_id = 0 torch.cuda.set_device(0) # Load Vocabulary Wrapper vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name)) opt.vocab_size = len(vocab) # Load data loaders train_loader, val_loader = data.get_loaders( opt.data_name, vocab, opt.batch_size, opt.workers, opt) # Construct the model model = SCAN(opt) model.cuda() model = nn.DataParallel(model) # Loss and Optimizer criterion = ContrastiveLoss(opt=opt, margin=opt.margin, max_violation=opt.max_violation) mse_criterion = nn.MSELoss(reduction="batchmean") optimizer = torch.optim.Adam(model.parameters(), lr=opt.learning_rate) # optionally resume from a checkpoint if not os.path.exists(opt.model_name): os.makedirs(opt.model_name) start_epoch = 0 best_rsum = 0 if opt.resume: if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})" .format(opt.resume, start_epoch, best_rsum)) else: print("=> no checkpoint found at '{}'".format(opt.resume)) evalrank(model.module, val_loader, opt) print(opt, flush=True) # Train the Model for epoch in range(start_epoch, opt.num_epochs): message = "epoch: %d, model name: %s\n" % (epoch, opt.model_name) log_file = os.path.join(opt.logger_name, "performance.log") logging_func(log_file, message) print("model name: ", opt.model_name, flush=True) adjust_learning_rate(opt, optimizer, epoch) run_time = 0 for i, (images, captions, lengths, masks, ids, _) in enumerate(train_loader): start_time = time.time() model.train() optimizer.zero_grad() if device_count != 1: images = images.repeat(device_count,1,1) score = model(images, captions, lengths, masks, ids) loss = criterion(score) loss.backward() if opt.grad_clip > 0: clip_grad_norm_(model.parameters(), opt.grad_clip) optimizer.step() run_time += time.time() - start_time # validate at every val_step if i % 100 == 0: log = "epoch: %d; batch: %d/%d; loss: %.4f; time: %.4f" % (epoch, i, len(train_loader), loss.data.item(), run_time / 100) print(log, flush=True) run_time = 0 if (i + 1) % opt.val_step == 0: evalrank(model.module, val_loader, opt) print("-------- performance at epoch: %d --------" % (epoch)) # evaluate on validation set rsum = evalrank(model.module, val_loader, opt) #rsum = -100 filename = 'model_' + str(epoch) + '.pth.tar' # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) save_checkpoint({ 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, }, is_best, filename=filename, prefix=opt.model_name + '/')
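# Hedged sketch of the hinge-based triplet ranking loss behind the ContrastiveLoss used
# above (VSE++/SCAN style): given an (n, n) matrix of image-caption similarity scores,
# penalize negatives that come within `margin` of the matching pair; with max_violation
# only the hardest negative per row/column contributes. Illustrative, not the repository
# class itself.
import torch

def contrastive_loss(scores, margin=0.2, max_violation=True):
    diag = scores.diag().view(-1, 1)
    cost_s = (margin + scores - diag).clamp(min=0)        # caption-retrieval direction
    cost_im = (margin + scores - diag.t()).clamp(min=0)   # image-retrieval direction
    mask = torch.eye(scores.size(0), dtype=torch.bool, device=scores.device)
    cost_s = cost_s.masked_fill(mask, 0)                  # ignore the positive pairs
    cost_im = cost_im.masked_fill(mask, 0)
    if max_violation:
        cost_s = cost_s.max(1)[0]
        cost_im = cost_im.max(0)[0]
    return cost_s.sum() + cost_im.sum()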
def main(opt): logging.basicConfig(format='%(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Construct the model model = UDAG(opt) # optionally resume from a checkpoint if opt.evaluation: val_loader = data.get_test_loader(opt.data_name, opt.batch_size, opt.workers, opt) if os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['model']) # Eiters is used to show logs as the continuation of another training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( opt.resume, start_epoch, best_rsum)) _, sims = validate(opt, val_loader, model) np.save(opt.data_name + '_sims', sims) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # if opt.resume: # if os.path.isfile(opt.resume): # print("=> loading checkpoint '{}'".format(opt.resume)) # checkpoint = torch.load(opt.resume) # start_epoch = checkpoint['epoch'] # best_rsum = checkpoint['best_rsum'] # model.load_state_dict(checkpoint['model']) # # Eiters is used to show logs as the continuation of another training # model.Eiters = checkpoint['Eiters'] # print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(opt.resume, start_epoch, best_rsum)) # validate(opt, val_loader, model) # else: # print("=> no checkpoint found at '{}'".format(opt.resume)) else: # Train the Model # Load data loaders train_loader, val_loader = data.get_loaders(opt.data_name, opt.batch_size, opt.workers, opt) best_rsum = 0 for epoch in range(opt.num_epochs): adjust_learning_rate(opt, model.optimizer, epoch) # rsum = validate(opt, val_loader, model) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) save_checkpoint( { 'epoch': epoch + 1, 'model': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, prefix=opt.logger_name + '_' + opt.model_name + '/')
def main(): # Hyper Parameters parser = argparse.ArgumentParser() parser.add_argument('--data_path', default='./data/', help='path to datasets') parser.add_argument('--data_name', default='precomp', help='{coco,f30k}_precomp') parser.add_argument('--vocab_path', default='./vocab/', help='Path to saved vocabulary json files.') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--learning_rate', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=15, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=100, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=15000, type=int, help='Number of steps to run validation.') parser.add_argument('--logger_name', default='./runs/runX/log', help='Path to save Tensorboard log.') parser.add_argument('--model_name', default='./runs/runX/checkpoint', help='Path to save the models.') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--img_dim', default=2048, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--no_txtnorm', action='store_true', help='Do not normalize the text embeddings.') parser.add_argument('--bi_gru', action='store_true', help='Use bidirectional GRU.') parser.add_argument('--lambda_softmax', default=9., type=float, help='Attention softmax temperature.') parser.add_argument('--feat_dim', default=16, type=int, help='Dimensionality of the similarity embedding.') parser.add_argument('--num_block', default=16, type=int, help='Dimensionality of the similarity embedding.') parser.add_argument('--hid_dim', default=32, type=int, help='Dimensionality of the hidden state during graph convolution.') parser.add_argument('--out_dim', default=1, type=int, help='Dimensionality of the hidden state during graph convolution.') parser.add_argument('--is_sparse', action='store_true', help='Whether models the text as a fully connected graph.') opt = parser.parse_args() print(opt) logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) tb_logger.configure(opt.logger_name, flush_secs=5) # Load Vocabulary Wrapper vocab = deserialize_vocab(os.path.join( opt.vocab_path, '%s_vocab.json' % opt.data_name)) opt.vocab_size = len(vocab) # Load data loaders train_loader, val_loader = data.get_loaders( opt.data_name, vocab, opt.batch_size, opt.workers, opt) # Construct the models model = GSMN(opt) # optionally resume from a checkpoint if opt.resume: if 
os.path.isfile(opt.resume): print("=> loading checkpoint '{}'".format(opt.resume)) checkpoint = torch.load(opt.resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] model.load_state_dict(checkpoint['models']) # Eiters is used to show logs as the continuation of another # training model.Eiters = checkpoint['Eiters'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})" .format(opt.resume, start_epoch, best_rsum)) validate(opt, val_loader, model) else: print("=> no checkpoint found at '{}'".format(opt.resume)) # Train the Model best_rsum = 0 for epoch in range(opt.num_epochs): print(opt.logger_name) print(opt.model_name) adjust_learning_rate(opt, model.optimizer, epoch) # train for one epoch train(opt, train_loader, model, epoch, val_loader) # evaluate on validation set rsum = validate(opt, val_loader, model) # remember best R@ sum and save checkpoint is_best = rsum > best_rsum best_rsum = max(rsum, best_rsum) if not os.path.exists(opt.model_name): os.mkdir(opt.model_name) save_checkpoint({ 'epoch': epoch + 1, 'models': model.state_dict(), 'best_rsum': best_rsum, 'opt': opt, 'Eiters': model.Eiters, }, is_best, filename='checkpoint_{}.pth.tar'.format(epoch), prefix=opt.model_name + '/')
def test(args, eval_on): # Cuda args.device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') tags = ['supervised', f'{args.dataset}', f'use_context_{args.use_context}'] if args.debug: tags.append('debug') if args.log_wandb: wandb.init(name=args.experiment_name, project=f"arm_test_{args.dataset}", tags=tags, reinit=True) wandb.config.update(args, allow_val_change=True) # Get data train_loader, train_eval_loader, val_loader, test_loader = data.get_loaders( args) args.n_groups = train_loader.dataset.n_groups if args.seed is not None: print('setting seed', args.seed) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) if args.binning: corner_dists = np.array( get_one_hot(range(args.n_groups), args.n_groups)) binned_groups = get_binned_dists(args.n_test_dists) if eval_on == 'train': empirical_dist = [train_loader.dataset.group_dist] elif eval_on == 'val': empirical_dist = [val_loader.dataset.group_dist] elif eval_on == 'test': empirical_dist = [test_loader.dataset.group_dist] eval_dists = [corner_dists, binned_groups, empirical_dist] else: corner_dists = np.array( get_one_hot(range(args.n_groups), args.n_groups)) if eval_on == 'train': empirical_dist = [train_loader.dataset.group_dist] elif eval_on == 'val': empirical_dist = [val_loader.dataset.group_dist] elif eval_on == 'test': empirical_dist = [test_loader.dataset.group_dist] if args.n_test_dists > 0: random_dists = np.random.dirichlet(np.ones(args.n_groups), size=args.n_test_dists) eval_dists = np.concatenate( [corner_dists, random_dists, empirical_dist]) else: eval_dists = np.concatenate([corner_dists, empirical_dist]) # Get model model = utils.get_model(args, image_shape=train_loader.dataset.image_shape) state_dict = torch.load(args.ckpt_path) # Remove unnecessary weights from the model. new_state_dict = state_dict[0] if "context_net.classifier.weight" in new_state_dict: del new_state_dict["context_net.classifier.weight"] if "context_net.classifier.bias" in new_state_dict: del new_state_dict["context_net.classifier.bias"] new_state_dict_2 = {} for key in new_state_dict.keys(): new_key = key.replace('module.', '') new_state_dict_2[new_key] = new_state_dict[key] new_state_dict = new_state_dict_2 model.load_state_dict(new_state_dict) model = model.to(args.device) model.eval() # Train if args.eval_deterministic: if eval_on == 'train': stats = utils.evaluate_each_corner(args, model, train_eval_loader, split='train') elif eval_on == 'val': stats = utils.evaluate_each_corner(args, model, val_loader, split='val') elif eval_on == 'test': stats = utils.evaluate_each_corner(args, model, test_loader, split='test') else: if eval_on == 'train': worst_case_acc, avg_acc, empirical_case_acc, stats = utils.evaluate_mixtures( args, model, train_eval_loader, eval_dists, split='train') elif eval_on == 'val': worst_case_acc, avg_acc, empirical_case_acc, stats = utils.evaluate_mixtures( args, model, val_loader, eval_dists, split='val') elif eval_on == 'test': worst_case_acc, avg_acc, empirical_case_acc, stats = utils.evaluate_mixtures( args, model, test_loader, eval_dists, split='test') return stats
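# Hedged sketch: evaluate_mixtures above appears to report worst-case / average accuracy
# over the group-mixture distributions built from corner (one-hot), Dirichlet, and
# empirical weights. Given per-group accuracies, each mixture's accuracy is just the
# weighted average; the helper below only illustrates that reduction (names are
# illustrative, not the utils API).
import numpy as np

def mixture_accuracies(group_accs, eval_dists):
    """group_accs: (n_groups,) per-group accuracy; eval_dists: (n_dists, n_groups) mixture weights."""
    group_accs = np.asarray(group_accs, dtype=float)
    eval_dists = np.asarray(eval_dists, dtype=float)
    accs = eval_dists @ group_accs            # accuracy under each mixture of groups
    return accs.min(), accs.mean(), accs      # worst-case, average, and all mixture accuracies

# Example: two groups with 90% / 60% accuracy, evaluated on the corners plus a uniform mix
# worst, avg, _ = mixture_accuracies([0.9, 0.6], [[1, 0], [0, 1], [0.5, 0.5]])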