def __init__(self, learning_rate=1e-3, batch_size=32, scheduler_stepsize=20):
    """Store training hyper-parameters and build the dataloaders and model.

    Args:
        learning_rate: optimizer learning rate.
        batch_size: mini-batch size passed to get_dataloaders.
        scheduler_stepsize: step size for the (externally created) LR scheduler.
    """
    # Optimizer / LR-scheduler hyper-parameters.
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.scheduler_stepsize = scheduler_stepsize
    # `get_dataloaders`, `Net` and `device` come from the enclosing module.
    self.dataloaders = get_dataloaders(self.batch_size)
    # NOTE(review): `Net().model` suggests Net wraps the actual torch module
    # in a `.model` attribute — confirm against the Net definition.
    self.model = Net().model.to(device)
def main(config):
    """Train AtecModel on the configured dataset, logging loss and saving checkpoints.

    Args:
        config: namespace with lr, num_epoch, log_step, save_step, model_dir, etc.
    """
    # load data
    train_dataloader, valid_dataloader, test_dataloader = get_dataloaders(config)
    # define model, loss, optimizer, scheduler, logger
    model = AtecModel(config)
    criterion = nn.CrossEntropyLoss()
    # Only the encoder and comparator are trained; other submodules stay frozen.
    trainable_params = list(model.encoder.parameters()) + list(model.comparator.parameters())
    optimizer = Adam(trainable_params, lr=config.lr)
    scheduler = MultiStepLR(optimizer, milestones=[10, 20, 30], gamma=0.1)
    # training iterations
    total_steps = len(train_dataloader)
    for epoch in range(config.num_epoch):
        for i, (data, labels, indices, lengths) in enumerate(train_dataloader):
            logits = model(data, indices)
            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # log loss, could visualize in tensorboard if needed
            if (i + 1) % config.log_step == 0:
                # BUGFIX: was `config.num_epochs` (the loop uses `num_epoch`) and
                # the deprecated `loss.data[0]`; use loss.item() instead.
                print('Epoch [%d/%d], Step[%d/%d], loss: %.4f, '
                      % (epoch + 1, config.num_epoch, i + 1, total_steps, loss.item()))
        # BUGFIX: step the scheduler AFTER the epoch's optimizer steps — calling it
        # first skips the initial LR and warns on PyTorch >= 1.1.
        scheduler.step()
        # save the model per epoch, only save parameters
        if (epoch + 1) % config.save_step == 0:
            model_path = os.path.join(config.model_dir, 'model-%d.pkl' % (epoch + 1))
            torch.save(model.state_dict(), model_path)
def main():
    """Train the font-transfer GAN, keeping the checkpoint with the lowest eval loss."""
    # Get DataLoaders: font names are listed one per line in the split files.
    train_fonts = []
    with open('train52_fonts.txt', 'r') as file:
        for font in file:
            train_fonts.append(font.strip())
    val_fonts = []
    with open('val52_fonts.txt', 'r') as file:
        for font in file:
            val_fonts.append(font.strip())
    train_x_loader, train_y_loader, val_loader = get_dataloaders(
        'data/jpg', 'data/jpg', train_fonts, val_fonts, BATCH_SIZE, logger=log)
    # Initialize models
    gen = Generator().to(device)
    dis = Discriminator().to(device)
    epoch = 1
    min_eval_loss = np.inf
    while epoch <= MAX_EPOCHS:
        train(gen, dis, train_x_loader, train_y_loader, epoch, lr=LR)
        eval_loss = eval(gen, val_loader, epoch)
        log.info(f'Eval Pixelwise BCE Loss: {eval_loss}')
        if eval_loss < min_eval_loss:
            # BUGFIX: original assigned `eval_loss = min_eval_loss`, so the best
            # loss was never updated and the model was re-saved on every epoch.
            min_eval_loss = eval_loss
            save(gen, dis)
        epoch += 1
def main():
    """Load a trained checkpoint and report logit- and feature-based meta-test accuracy."""
    opt = parse_option()
    opt.n_test_runs = 600
    # Only the meta-test loader is needed; the rest are built anyway by the helper.
    train_loader, val_loader, meta_testloader, meta_valloader, n_cls = get_dataloaders(opt)

    # Restore the trained weights into a freshly built model.
    model = create_model(opt.model, n_cls, opt.dataset)
    ckpt = torch.load(opt.model_path)
    model.load_state_dict(ckpt["model"])
    if torch.cuda.is_available():
        model = model.cuda()
        cudnn.benchmark = True

    # Time each evaluation pass separately.
    t0 = time.time()
    test_acc, test_std = meta_test(model, meta_testloader)
    test_time = time.time() - t0
    print('test_acc: {:.4f}, test_std: {:.4f}, time: {:.1f}'.format(
        test_acc, test_std, test_time))

    t0 = time.time()
    test_acc_feat, test_std_feat = meta_test(model, meta_testloader, use_logit=False)
    test_time = time.time() - t0
    print('test_acc_feat: {:.4f}, test_std: {:.4f}, time: {:.1f}'.format(
        test_acc_feat, test_std_feat, test_time))
def run(args, use_cuda, output_dir):
    """Run args.n_trials independent trials, sweeping lambda/learning-rate pairs per trial.

    For each trial a fresh model is built per args.network_type, its initial
    weights are snapshotted, and every (lambda, lr) pair restarts from them.
    """
    trial_list = list(range(args.n_trials))
    np.random.shuffle(trial_list)
    for trial_i in trial_list:
        trial_dir = os.path.join(output_dir, 'trial_{}'.format(trial_i))
        os.makedirs(trial_dir, exist_ok=True)
        loaders, params = get_dataloaders(args.batch_size, trial_i, args.dataset,
                                          args.augment_data, early_stop=args.early_stop)
        if args.network_type == 'fc':
            model = DenseModel(input_dim=np.prod(params['input_shape']),
                               output_dim=params['output_dim'],
                               hidden_nodes=args.hidden_nodes,
                               num_modules=args.n_modules,
                               activation=args.activation)
        elif args.network_type == 'conv':
            model = ConvModel(input_shape=params['input_shape'],
                              output_dim=params['output_dim'],
                              num_filters=args.filters,
                              kernel_sizes=args.kernels,
                              strides=args.strides,
                              dilations=args.dilations,
                              num_modules=args.n_modules,
                              activation=args.activation,
                              final_layer=args.conv_final_layer)
        elif args.network_type == 'densenet':
            model = DenseNet(input_shape=params['input_shape'],
                             output_dim=params['output_dim'],
                             growth_rate=args.densenet_k,
                             depth=args.densenet_depth,
                             reduction=args.densenet_reduction,
                             bottleneck=args.densenet_bottleneck,
                             num_modules=args.n_modules)
        else:
            # BUGFIX: previously an unrecognized network_type fell through with
            # `model` undefined, raising a confusing NameError further down.
            raise ValueError('unknown network_type: {!r}'.format(args.network_type))
        logging.debug(args)
        logging.debug('Parameters: {}'.format(model.n_parameters()))
        device = torch.device("cuda" if use_cuda else "cpu")
        model = model.to(device)
        model.reset_parameters()
        # Snapshot the initial weights so every lambda run starts identically.
        weight_path = os.path.join(trial_dir, 'initial_weights.pt')
        torch.save(model.state_dict(), weight_path)
        for lambda_i, (lambda_, learning_rate) in enumerate(
                zip(args.lambda_values, args.learning_rates)):
            model.load_state_dict(torch.load(weight_path))
            lambda_dir = os.path.join(trial_dir, str(lambda_))
            os.makedirs(lambda_dir, exist_ok=True)
            do_lambda_value(model, lambda_, learning_rate, args, loaders,
                            params['distribution'], device, lambda_dir)
def set_data_loader(self):
    """Build train/eval datasets from the CSV mapping and attach their dataloaders to self."""
    frame = pd.read_csv(self.args.csv_path)
    label_mapping = get_train_mapping(frame)
    # Mixup is enabled only for the training split; augmentation is off for both.
    ds_train = BaseDataset(self.args.data_dir, label_mapping,
                           enable_mixup=True, enable_aug=False)
    ds_eval = BaseDataset(self.args.data_dir, label_mapping,
                          enable_mixup=False, enable_aug=False)
    self.train_dataloader, self.eval_dataloader = get_dataloaders(
        ds_train, ds_eval, self.args.batch_size, self.device)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--num_epochs", type=int, default=5) parser.add_argument("--lr", type=float, default=0.01) parser.add_argument("--vocab_size", type=float, default=10000) parser.add_argument("--embed_size", type=int, default=300) parser.add_argument("--hidden_size", type=int, default=100) parser.add_argument("--num_layers", type=int, default=1) parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--win_size", type=int, default=35) parser.add_argument("--num_samples", type=int, default=100) parser.add_argument("--early_stop", type=int, default=3) parser.add_argument("--use_glove", type=bool, default=False) args = parser.parse_args() # vocab_size = args.vocab_size num_epochs = args.num_epochs embed_size = args.embed_size hidden_size = args.hidden_size num_layers = args.num_layers batch_size = args.batch_size win_size = args.win_size num_samples = args.num_samples early_stop = args.early_stop use_glove = args.use_glove train_loader, dev_loader, test_loader, vocab_size, vocab = get_dataloaders( batch_size, win_size) weight = None if use_glove: weight = loadGloveModel(vocab, "glove.6B.300d.txt") model = RNN_LM(vocab_size, embed_size, hidden_size, num_layers, weight, use_glove) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) model, lowest_perplexity = trainer(train_loader, dev_loader, model, optimizer, criterion, num_epochs, early_stop, num_layers, batch_size, hidden_size) print("lowest_perplexity: ", lowest_perplexity) test(test_loader, model, num_layers, batch_size, hidden_size, criterion) generate_words("I.txt", "I", num_layers, hidden_size, vocab, num_samples, model) # starting word to generate from generate_words("What.txt", "What", num_layers, hidden_size, vocab, num_samples, model) generate_words("Anyway.txt", "anyway", num_layers, hidden_size, vocab, num_samples, model)
def main():
    """Parse hyper-parameters, train the RNN language model, then generate from seed words."""
    parser = argparse.ArgumentParser()
    # (flag, type, default) triples keep the hyper-parameter list compact.
    for flag, kind, default in [
        ("--lr", float, 0.0001),
        ("--dropout", float, 0.3),
        ("--batch_size", int, 20),
        ("--early_stop", int, 10),
        ("--embed_dim", int, 128),
        ("--dim_size", int, 512),
        ("--num_layers", int, 2),
        ("--window_size", int, 30),
        ("--lr_decay", float, 0.5),
        ("--amount_of_vocab", int, 15000),
    ]:
        parser.add_argument(flag, type=kind, default=default)
    args = parser.parse_args()

    # load data
    train_loader, dev_loader, test_loader, vocab_size, vocab = get_dataloaders(
        args.batch_size, args.window_size, args.amount_of_vocab)

    # build model
    # try to use pretrained embedding here
    model = RNNLM(args, vocab_size, embedding_matrix=None)
    # loss function
    criterion = nn.CrossEntropyLoss()
    # choose optimizer — only parameters that require gradients are optimized
    trainable = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(trainable, lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)

    model, best_perp = trainer(train_loader, dev_loader, model, optimizer,
                               criterion, early_stop=args.early_stop)
    print('best_dev_perp:{}'.format(best_perp))
    for seed in ("I", "What", "Anyway"):
        predict(model, vocab, clean_str(seed))
def main(): opt = parse_option() opt.n_test_runs = 600 train_loader, val_loader, meta_testloader, meta_valloader, n_cls, _ = get_dataloaders( opt) # load model model = create_model(opt.model, n_cls, opt.dataset) ckpt = torch.load(opt.model_path)["model"] from collections import OrderedDict new_state_dict = OrderedDict() for k, v in ckpt.items(): name = k.replace("module.", "") new_state_dict[name] = v model.load_state_dict(new_state_dict) # model.load_state_dict(ckpt["model"]) if torch.cuda.is_available(): model = model.cuda() cudnn.benchmark = True start = time.time() test_acc, test_std = meta_test(model, meta_testloader) test_time = time.time() - start print('test_acc: {:.4f}, test_std: {:.4f}, time: {:.1f}'.format( test_acc, test_std, test_time)) start = time.time() test_acc_feat, test_std_feat = meta_test(model, meta_testloader, use_logit=False) test_time = time.time() - start print('test_acc_feat: {:.4f}, test_std: {:.4f}, time: {:.1f}'.format( test_acc_feat, test_std_feat, test_time))
def main(FLAGS):
    """Train and validate the Unet model using the settings in FLAGS."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # binary segmentation: foreground vs background
    num_classes = 2
    # train/val dataloaders built straight from the flag values
    dataloaders = get_dataloaders(FLAGS.dataset_dir, FLAGS.train_batch_size,
                                  FLAGS.val_batch_size, FLAGS.aug)
    model = Unet(3, num_classes)
    # Run on multiple GPUs when available.
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
        model = nn.DataParallel(model, device_ids=[0, 1])
    else:
        print("no multiple gpu found")
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.02,
                          momentum=0.9, weight_decay=0.0005)
    #optimizer = optim.Adam(model.parameters(),lr = learning_rate)
    # scheduler and plotter are created but not handed to train_val
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    plotter = VisdomLinePlotter(env_name='Unet Train')
    train_val(dataloaders, model, criterion, optimizer,
              FLAGS.epochs, FLAGS.log_dir, device)
import torch
import torch.optim
import torch.nn as nn
import torch.backends.cudnn as cudnn

#torch.manual_seed(args.seed)
# Script-level setup: build the RANet model, wrap it for multi-GPU, and load
# a saved checkpoint. `args`, `os`, `models` and `get_dataloaders` are
# expected to be defined earlier in the file.
if args.gpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model = getattr(models, 'RANet')(args)
model = torch.nn.DataParallel(model.cuda())
criterion = nn.CrossEntropyLoss().cuda()
train_loader, val_loader, test_loader = get_dataloaders(args)
#state_dict = torch.load('model_best.pth.tar')['state_dict']
# NOTE(review): this file appears to hold a raw state_dict (no 'state_dict'
# wrapper key, unlike the commented line above) — confirm checkpoint format.
state_dict = torch.load('model_best2.pth.tar')
model.load_state_dict(state_dict)


def validate(val_loader, model, criterion):
    # Evaluate `model` on `val_loader` (definition continues beyond this chunk).
    #batch_time = AverageMeter()
    #losses = AverageMeter()
    #data_time = AverageMeter()
    top1, top5 = [], []
def _meta_eval_and_log(model, opt, wandb, tag):
    """Meta-evaluate `model` at the current opt.n_shots and log results to wandb.

    Runs both logit-based and feature-based meta-tests on the meta-val and
    meta-test splits, prints them, and logs them under 'Final ...' keys with
    the given suffix `tag` (e.g. '@1' or '@5').
    """
    from eval.meta_eval import meta_test  # local import, mirroring the original

    train_loader, val_loader, meta_testloader, meta_valloader, _ = get_dataloaders(opt)
    #validate
    meta_val_acc, meta_val_std = meta_test(model, meta_valloader)
    meta_val_acc_feat, meta_val_std_feat = meta_test(model, meta_valloader,
                                                     use_logit=False)
    #evaluate
    meta_test_acc, meta_test_std = meta_test(model, meta_testloader)
    meta_test_acc_feat, meta_test_std_feat = meta_test(model, meta_testloader,
                                                       use_logit=False)
    print('Meta Val Acc : {:.4f}, Meta Val std: {:.4f}'.format(
        meta_val_acc, meta_val_std))
    print('Meta Val Acc (feat): {:.4f}, Meta Val std (feat): {:.4f}'.format(
        meta_val_acc_feat, meta_val_std_feat))
    print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}'.format(
        meta_test_acc, meta_test_std))
    print('Meta Test Acc (feat): {:.4f}, Meta Test std (feat): {:.4f}'.format(
        meta_test_acc_feat, meta_test_std_feat))
    wandb.log({
        'Final Meta Test Acc {}'.format(tag): meta_test_acc,
        'Final Meta Test std {}'.format(tag): meta_test_std,
        'Final Meta Test Acc (feat) {}'.format(tag): meta_test_acc_feat,
        'Final Meta Test std (feat) {}'.format(tag): meta_test_std_feat,
        'Final Meta Val Acc {}'.format(tag): meta_val_acc,
        'Final Meta Val std {}'.format(tag): meta_val_std,
        'Final Meta Val Acc (feat) {}'.format(tag): meta_val_acc_feat,
        'Final Meta Val std (feat) {}'.format(tag): meta_val_std_feat
    })


def generate_final_report(model, opt, wandb):
    """Produce the final 1-shot and 5-shot meta-evaluation report.

    IMPROVEMENT: the original duplicated the whole evaluate/print/log sequence
    for the 1-shot and 5-shot settings; it is now a single parameterized helper.
    Mutates opt.n_shots (left at 5 on return, matching the original behavior).
    """
    for n_shots in (1, 5):
        opt.n_shots = n_shots
        _meta_eval_and_log(model, opt, wandb, '@{}'.format(n_shots))
def test(config):
    """Evaluate a trained S2S summarization checkpoint on the test set with ROUGE.

    Loads the checkpoint named by config.model_dir + config.model_file (adopting
    the config stored inside it), decodes the test set without teacher forcing,
    and accumulates ROUGE-1/2/L f/p/r averages.
    """
    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Load pre-trained model
    file_path = config.model_dir + config.model_file
    if os.path.isfile(file_path):
        print('Loading checkpoint \'{}\''.format(file_path))
        checkpoint = torch.load(file_path)
        config = checkpoint['config']  # Use saved config
        print('Loaded checkpoint \'{}\' (epoch {})'.format(
            file_path, checkpoint['epoch']))
    else:
        print('No checkpoint found at \'{}\''.format(file_path))
        sys.exit('Please check the filename.')
    # Get torch loaders for training and test data
    train_loader, test_loader = get_dataloaders(config.dataset, markov_order=2,
                                                batch_size=config.batch_size)
    vocab_size = train_loader.dataset.vocab_size
    # The following steps are to initialize the model, which will be overloaded
    # with the trained model
    encoder = S2SEncoder(vocab_size, config.embedding_dim, config.hidden_size,
                         config.num_layers, dropout=config.dropout)
    decoder = S2SAttnDecoder(vocab_size, config.embedding_dim, config.hidden_size,
                             config.num_layers, dropout=config.dropout)
    model = S2S(encoder, decoder).to(device)
    # Load model from checkpoint and put in evaluation mode
    model.load_state_dict(checkpoint['model'])
    model.eval()
    print('Model loaded from checkpoint, start evaluation.')
    # f, p, r = (f1-score, precision, recall)
    rouge_scores = [
        [0, 0, 0],  # rouge-1
        [0, 0, 0],  # rouge-2
        [0, 0, 0],  # rouge-l
    ]
    num_examples = 0
    rouge_eval = Rouge()
    for batch_idx, (X, Y, xlen, ylen) in enumerate(test_loader):
        X = X.to(device)
        Y = Y.to(device)
        Y_in = Y[:, :-1]
        Y_t = Y[:, 1:]
        xlen = xlen.to(device)
        # ylen -= 1, outputs do not predict start token
        ylen = (ylen - 1).to(device)
        # No teacher forcing: feed only the start token and decode freely
        Y_in = Y_in[:, 0:1]
        ylen = torch.ones_like(ylen).to(device)
        out_length = Y_t.size(1)
        out = model(X, Y_in, xlen, ylen, output_length=out_length,
                    teacher_forcing=False)
        # Calculate avg rouge scores over batch
        batch_correct = []
        batch_test_sentence = []
        for i in range(len(out)):
            test_sentence = torch.argmax(out[i], -1).cpu().numpy()
            test_sentence = [
                test_loader.dataset.i2w[i] if i > 0 else 'PAD'
                for i in test_sentence
            ]
            correct = Y_t.cpu()[i].numpy()
            correct = [test_loader.dataset.i2w[i] for i in correct if i > 0]
            # BUGFIX: was `config_old.rouge_subwords` — `config_old` is never
            # defined in this function; the checkpoint's config lives in `config`.
            if config.rouge_subwords:
                correct = ''.join(word for word in correct).replace('▁', ' ')
                test_sentence = ''.join(
                    word for word in test_sentence).replace('▁', ' ')
            else:
                test_sentence = ' '.join(word for word in test_sentence)
                correct = ' '.join(word for word in correct)
            batch_test_sentence.append(test_sentence)
            batch_correct.append(correct)
        rouge = rouge_eval.get_scores(batch_test_sentence, batch_correct,
                                      True)  # output format is dict
        # Turn dict into lists and sum all corresponding elements with total
        rouge_scores[0][0] += rouge['rouge-1']['f']
        rouge_scores[0][1] += rouge['rouge-1']['p']
        rouge_scores[0][2] += rouge['rouge-1']['r']
        rouge_scores[1][0] += rouge['rouge-2']['f']
        rouge_scores[1][1] += rouge['rouge-2']['p']
        rouge_scores[1][2] += rouge['rouge-2']['r']
        rouge_scores[2][0] += rouge['rouge-l']['f']
        rouge_scores[2][1] += rouge['rouge-l']['p']
        rouge_scores[2][2] += rouge['rouge-l']['r']
        num_examples += 1
        # Show every 10 batches
        if batch_idx % 10 == 0:
            print("batch_idx:", batch_idx)
            # Current average rouge scores
            temp_rouge_scores = current_rouge_scores(rouge_scores, num_examples)
    # Final average rouge scores
    final_rouge_scores = current_rouge_scores(rouge_scores, num_examples)
from model_bert import *

# Training hyper-parameters for the triplet/BERT setup.
BATCH_SIZE = 10
NUM_EPOCH = 2
# Margin set to half the BERT hidden size (768 / 2).
MARGIN = 768 / 2
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__ == "__main__":
    # First CLI argument is the checkpoint path to resume from.
    model_path = sys.argv[1]
    print(f'loading from [{model_path}]')
    model, existing_results = train_utils.load_model_save(model_path)
    optimizer = AdamW(model.parameters())
    train_loader, dev_loader, test_loader = dataloader.get_dataloaders(
        BATCH_SIZE)

    def train_epoch_fn(e):
        # One training epoch over (anchor, positive, negative) triplets.
        # Definition continues beyond this chunk.
        total_train_loss = 0
        total_num_correct_eucl = 0
        total_item = 0
        for i, data in enumerate(tqdm(train_loader, desc='train', leave=False)):
            # NOTE(review): caps the epoch at 11 mini-batches — presumably a
            # leftover debug limit; confirm before training for real.
            if i > 10:
                break
            optimizer.zero_grad()
            ancs, poss, negs = data
            ancs = ancs.to(DEVICE)
            poss = poss.to(DEVICE)
def main():
    """Train (or evaluate) an anytime-prediction network, tracking best top-1 precision.

    Uses module-level `args`; measures FLOPs, optionally resumes from a
    checkpoint, supports 'anytime'/'dynamic' evaluation-only modes, and saves
    a checkpoint each epoch while deleting the previous one.
    """
    global args
    best_prec1, best_epoch = 0.0, 0

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    # Input resolution depends on dataset family.
    if args.data.startswith('cifar'):
        IM_SIZE = 32
    else:
        IM_SIZE = 224

    print(args.arch)
    # Build a throwaway model just to measure FLOPs / parameter count.
    model = getattr(models, args.arch)(args)
    args.num_exits = len(model.classifier)
    global n_flops
    n_flops, n_params = measure_model(model, IM_SIZE, IM_SIZE)
    torch.save(n_flops, os.path.join(args.save, 'flops.pth'))
    del(model)

    print(args)
    with open('{}/args.txt'.format(args.save), 'w') as f:
        print(args, file=f)

    # Rebuild the real model after the FLOPs probe.
    model = getattr(models, args.arch)(args)
    model = torch.nn.DataParallel(model.cuda())
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Optionally resume training state from the latest checkpoint.
    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    train_loader, val_loader, test_loader = get_dataloaders(args)

    # Evaluation-only path: load weights and run the requested eval mode.
    if args.evalmode is not None:
        state_dict = torch.load(args.evaluate_from)['state_dict']
        model.load_state_dict(state_dict)
        if args.evalmode == 'anytime':
            validate(test_loader, model, criterion)
        elif args.evalmode == 'dynamic':
            dynamic_evaluate(model, test_loader, val_loader, args)
        else:
            validate(test_loader, model, criterion)
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    # TSV header for per-epoch scores.
    scores = ['epoch\tlr\ttrain_loss\tval_loss\ttrain_prec1'
              '\tval_prec1\ttrain_prec5\tval_prec5']

    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_prec1, train_prec5, lr = train(train_loader, model,
                                                         criterion, optimizer,
                                                         epoch)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion)
        scores.append(('{}\t{:.3f}' + '\t{:.4f}' * 6)
                      .format(epoch, lr, train_loss, val_loss,
                              train_prec1, val_prec1,
                              train_prec5, val_prec5))

        is_best = val_prec1 > best_prec1
        # NOTE(review): `epoch == 299` is a hard-coded final-epoch special case —
        # it forces the "best" bookkeeping on epoch 299 regardless of accuracy.
        if is_best or (epoch == 299):
            best_prec1 = val_prec1
            best_epoch = epoch
            print('Best var_prec1 {}'.format(best_prec1))

        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint({
            'epoch': epoch,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, args, is_best, model_filename, scores)
        # Delete the previous epoch's checkpoint to save disk space.
        model_path = '%s/save_models/checkpoint_%03d.pth.tar' % (args.save,
                                                                 epoch-1)
        if os.path.exists(model_path):
            os.remove(model_path)

    print('Best val_prec1: {:.4f} at epoch {}'.format(best_prec1, best_epoch))

    ### Test the final model
    print('********** Final prediction results **********')
    validate(test_loader, model, criterion)
    return
def main():
    """Distill one or more teacher models into a student with memory-bank training.

    Sets up wandb logging, teacher/student models, the distillation criteria and
    a normalized random memory bank, then runs the supervised distillation loop,
    checkpointing periodically and emitting a final meta-evaluation report.
    """
    best_acc = 0
    opt = parse_option()

    wandb.init(project=opt.model_path.split("/")[-1], tags=opt.tags)
    wandb.config.update(opt)
    wandb.save('*.py')
    wandb.run.save()

    # dataloader
    train_loader, val_loader, meta_testloader, meta_valloader, n_cls, no_sample = get_dataloaders(opt)

    # model: comma-separated path_t means an ensemble of teachers
    model_t = []
    if("," in opt.path_t):
        for path in opt.path_t.split(","):
            model_t.append(load_teacher(path, opt.model_t, n_cls, opt.dataset,
                                        opt.trans, opt.memfeature_size))
    else:
        model_t.append(load_teacher(opt.path_t, opt.model_t, n_cls, opt.dataset,
                                    opt.trans, opt.memfeature_size))
    model_s = create_model(opt.model_s, n_cls, opt.dataset,
                           n_trans=opt.trans, embd_sz=opt.memfeature_size)
    if torch.cuda.device_count() > 1:
        print("second gpu count:", torch.cuda.device_count())
        model_s = nn.DataParallel(model_s)

    if opt.pretrained_path != "":
        model_s.load_state_dict(torch.load(opt.pretrained_path)['model'])

    wandb.watch(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    criterion_kd = DistillKL(opt.kd_T)

    optimizer = optim.SGD(model_s.parameters(), lr=opt.learning_rate,
                          momentum=opt.momentum, weight_decay=opt.weight_decay)

    if torch.cuda.is_available():
        for m in model_t:
            m.cuda()
        model_s.cuda()
        criterion_cls = criterion_cls.cuda()
        criterion_div = criterion_div.cuda()
        criterion_kd = criterion_kd.cuda()
        cudnn.benchmark = True

    # Random unit-norm memory bank, one row per training sample.
    MemBank = np.random.randn(no_sample, opt.memfeature_size)
    MemBank = torch.tensor(MemBank, dtype=torch.float).cuda()
    MemBankNorm = torch.norm(MemBank, dim=1, keepdim=True)
    MemBank = MemBank / (MemBankNorm + 1e-6)

    meta_test_acc = 0
    meta_test_std = 0

    # BUGFIX: the loop below referenced `scheduler` when opt.cosine was set,
    # but no scheduler was ever created, raising NameError. Create it here.
    scheduler = None
    if opt.cosine:
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.epochs)

    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss, MemBank = train(epoch, train_loader, model_s,
                                               model_t, criterion_cls,
                                               criterion_div, criterion_kd,
                                               optimizer, opt, MemBank)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        val_acc = 0
        val_loss = 0
        meta_val_acc = 0
        meta_val_std = 0
        # val_acc, val_acc_top5, val_loss = validate(val_loader, model_s, criterion_cls, opt)

        # #evaluate
        # start = time.time()
        # meta_val_acc, meta_val_std = meta_test(model_s, meta_valloader)
        # test_time = time.time() - start
        # print('Meta Val Acc: {:.4f}, Meta Val std: {:.4f}, Time: {:.1f}'.format(meta_val_acc, meta_val_std, test_time))

        #evaluate (disabled — kept at zero to speed up training)
        start = time.time()
        meta_test_acc, meta_test_std = 0, 0  #meta_test(model_s, meta_testloader, use_logit=False)
        test_time = time.time() - start
        print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}, Time: {:.1f}'.format(
            meta_test_acc, meta_test_std, test_time))

        # regular saving
        if epoch % opt.save_freq == 0 or epoch == opt.epochs:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     'model_' + str(wandb.run.name) + '.pth')
            torch.save(state, save_file)

            #wandb saving
            torch.save(state, os.path.join(wandb.run.dir, "model.pth"))

        wandb.log({'epoch': epoch,
                   'Train Acc': train_acc,
                   'Train Loss': train_loss,
                   'Val Acc': val_acc,
                   'Val Loss': val_loss,
                   'Meta Test Acc': meta_test_acc,
                   'Meta Test std': meta_test_std,
                   'Meta Val Acc': meta_val_acc,
                   'Meta Val std': meta_val_std
                   })

    #final report
    print("GENERATING FINAL REPORT")
    generate_final_report(model_s, opt, wandb)

    #remove output.txt log file
    output_log_file = os.path.join(wandb.run.dir, "output.log")
    if os.path.isfile(output_log_file):
        os.remove(output_log_file)
    else:
        ## Show an error ##
        print("Error: %s file not found" % output_log_file)
warnings.filterwarnings("ignore") if __name__ == "__main__": n_epochs = 25 log_interval = 20 # set random seed random_seed = 42 torch.manual_seed(random_seed) # set torch device dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") batch_size_train = 100 # specified in the paper train_loader, test_loader = get_dataloaders(500, 100, batch_size_train) model_dict = { # 4-layer convnets "Four_Layer_SG": Four_Layer_SG, "Four_layer": Four_Layer, # #8-layer convnets "Eight_Layer_SG": Eight_Layer_SG, "Eight_Layer": Eight_Layer, # VGG Models "VGG16_SG": VGG16_SG2, "VGG16": VGG16_custom, } # with open("data.json", "r") as fp: # data = json.load(fp)
def start_tuning():
    """Grid-search hidden-dim size, layer count and learning rate for the RNN LM.

    Each candidate is trained via setup() on the shared dataloaders; the best
    dev perplexity and its hyper-parameters are tracked, and the winner is
    finally re-run against the test loader.
    """
    import copy  # local import keeps the aliasing fix self-contained

    lr = 0.0001
    dim_size = [128, 256, 512]
    num_layers = [1, 2]
    args = {
        'lr': lr,
        'dim_size': dim_size[0],
        'num_layers': 1,
        'window_size': 30,
        'embed_dim': 128,
        'batch_size': 20,
        'dropout': 0.3,
        'early_stop': 3,
        'amount_of_vocab': 15000,
    }
    args = Struct(**args)
    # print(args)
    best_args = args

    # load data (shared by every candidate)
    train_loader, dev_loader, test_loader, vocab_size, vocab = get_dataloaders(
        args.batch_size, args.window_size, args.amount_of_vocab)

    best_perp = 0
    for size in dim_size:
        # BUGFIX: `temp_args = args` aliased the shared object, so every
        # candidate mutated `args` and `best_args` never pinned a config.
        temp_args = copy.copy(args)
        temp_args.dim_size = size
        print(
            "Current setting: \nHidden Dimension Size: {}\nNum of Hidden Layers: {}"
            .format(temp_args.dim_size, temp_args.num_layers))
        perp = setup(temp_args, vocab_size, embedding_matrix=None,
                     _train_loader=train_loader, _dev_loader=dev_loader)
        # BUGFIX: `best_perp is 0` relied on small-int identity; use ==.
        if best_perp == 0 or perp < best_perp:
            best_perp = perp
            best_args = temp_args
        print("Best perplexity: {}, Current Perplexity: {}".format(
            best_perp, perp))
        print("-" * 20)

    for layer in num_layers:
        temp_args = copy.copy(args)
        temp_args.num_layers = layer
        print(
            "Current setting: \nHidden Dimension Size: {}\nNum of Hidden Layers: {}"
            .format(temp_args.dim_size, temp_args.num_layers))
        # BUGFIX: original passed `args` here, ignoring the candidate setting.
        perp = setup(temp_args, vocab_size, embedding_matrix=None,
                     _train_loader=train_loader, _dev_loader=dev_loader)
        if best_perp == 0 or perp < best_perp:
            best_perp = perp
            best_args = temp_args
        print("Best perplexity: {}, Current Perplexity: {}".format(
            best_perp, perp))
        print("-" * 20)

    #-----------------------------------------------
    # train with the alternative learning rate
    lr = 0.001
    temp_args = copy.copy(args)
    temp_args.lr = 0.001
    print(
        "Current setting: \nHidden Dimension Size: {}\nNum of Hidden Layers: {}"
        .format(temp_args.dim_size, temp_args.num_layers))
    # BUGFIX: original passed `args` here as well.
    perp = setup(temp_args, vocab_size, embedding_matrix=None,
                 _train_loader=train_loader, _dev_loader=dev_loader)
    if best_perp == 0 or perp < best_perp:
        best_perp = perp
        best_args = temp_args
    print("Best perplexity: {}, Current Perplexity: {}".format(
        best_perp, perp))
    print("-" * 20)
    #------------------------------------------------

    print("Best Perplexity: {}".format(best_perp))
    print("Best args: \nlr = {}\ndim size = {}\nnum layers = {}".format(
        best_args.lr, best_args.dim_size, best_args.num_layers))
    print(
        "Use the model with the best Hyper-parameters and report the test set perplexity"
    )
    _best_perp = setup(best_args, vocab_size, _train_loader=train_loader,
                       _dev_loader=test_loader)
    print(_best_perp)
def main():
    """Train or load a classifier (torchvision / custom / ensemble) and report
    test metrics plus an ROC curve saved to alex-net-roc.png."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default="alex-net", type=str,
                        help='alex-net/resnet/vgg/custom')
    parser.add_argument('--load_from_file', default="", type=str,
                        help='relative path of model file to load')
    parser.add_argument('--epochs', default=3, type=int,
                        help='number of epochs')
    # NOTE(review): argparse `type=bool` treats any non-empty string as True,
    # so `--cpu False` still enables CPU mode — consider store_true instead.
    parser.add_argument('--cpu', default=False, type=bool,
                        help='use CPU instead of GPU')
    args = parser.parse_args()

    batch_size = 1
    model = args.model
    epochs = args.epochs
    model_path = args.load_from_file
    use_cpu = args.cpu

    # Pick the compute device, preferring GPU unless --cpu was given.
    if args.cpu:
        print("Using the CPU")
        device = torch.device("cpu")
    else:
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print("There are %d GPU(s) available." % torch.cuda.device_count())
            print("We will use the GPU: ", torch.cuda.get_device_name(0))
        else:
            print("No GPU available, using the CPU instead")
            device = torch.device("cpu")

    # Select the backbone; pretrained torchvision weights for the stock models.
    tvmodel = None
    if model == "alex-net":
        tvmodel = models.alexnet(pretrained=True)
    elif model == "vgg":
        tvmodel = models.vgg11_bn(pretrained=True)
    elif model == 'resnet':
        tvmodel = models.resnet18(pretrained=True)
    elif model == "custom":
        tvmodel = custom.NovelNet()
    elif model == "ensemble":
        tvmodel = ensemble.EnsembleModel()
    else:
        print("Incorrect model was passed, exiting!")
        exit()

    print("Loading data...")
    train_dataloader, test_dataloader = get_dataloaders(device)  # torchtensors
    print("Done Loading Data.")

    trainer = Trainer(epochs=epochs, batch_size=batch_size, learning_rate=1e-5,
                      model=tvmodel, model_name=model, device=device)

    # Either restore a serialized model or fit from scratch.
    if model_path != "":
        print("Loading Model")
        trainer.model = torch.load(model_path)
        trainer.model.eval()
        print("Finished Loading Model")
    else:
        print("Fitting model...")
        trainer.fit(train_dataloader)
        print("Done Fitting Model")

    prediction, probs = trainer.predict(test_dataloader)
    prediction = np.array(prediction)
    probs = torch.cat(probs, dim=0)
    probs = np.array(probs)

    # NOTE(review): `test_dataloader[:][1]` implies the loader is indexable and
    # returns (inputs, labels) — confirm against get_dataloaders.
    accuracy = accuracy_score(test_dataloader[:][1], prediction)
    print("Test accuracy: %.4f" % accuracy)
    f1 = f1_score(test_dataloader[:][1], prediction)
    print("Test F1: %.4f" % f1)
    auroc = roc_auc_score(test_dataloader[:][1], prediction)
    print("Test AUC_ROC: %.4f" % auroc)
    precision = precision_score(test_dataloader[:][1], prediction)
    print("Test Precision: %.4f" % precision)
    recall = recall_score(test_dataloader[:][1], prediction)
    print("Test Recall: %.4f" % recall)

    # Per-class ROC curves from the predicted probabilities.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(probs.shape[1]):
        fpr[i], tpr[i], _ = roc_curve(test_dataloader[:][1], probs[:, i])
        roc_auc[i] = roc_auc_score(test_dataloader[:][1], probs[:, i])

    # Plot only the positive-class curve (index 1).
    plt.figure()
    #plt.plot(fpr[0], tpr[0], color='red', label='ROC curve (area = %0.4f)' % roc_auc[0])
    plt.plot(fpr[1], tpr[1], color='darkorange',
             label='ROC curve (area = %0.4f)' % roc_auc[1])
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for Alex Net')
    plt.legend(loc="lower right")
    plt.savefig("alex-net-roc.png")
def main():
    """Train (or evaluate) an anytime-prediction network, tracking top-1 error.

    Reads the module-level `args`; saves FLOP counts, args, per-epoch scores,
    and checkpoints under `args.save`. With `args.evalmode` set, only runs
    evaluation and returns.
    """
    global args
    best_err1, best_epoch = 100., 0

    # Input resolution depends on the dataset family.
    if args.data.startswith('cifar'):
        IMAGE_SIZE = 32
    else:
        IMAGE_SIZE = 224
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    # Build a throwaway instance just to measure FLOPs/params, then rebuild.
    model = getattr(models, args.arch)(args)
    print(model)
    n_flops, n_params = measure_model(model, IMAGE_SIZE, IMAGE_SIZE)
    # print("------------------------------")
    print(n_flops, n_params)
    # print("------------------------------")
    torch.save(n_flops, os.path.join(args.save, 'flop.pth'))
    del(model)
    torch.save(args, os.path.join(args.save, 'args.pth'))
    # return
    model = getattr(models, args.arch)(args)
    # fout = open('model.txt', 'w')
    # print(model, file=fout)

    # AlexNet/VGG parallelize only the feature extractor (classifier stays on one GPU).
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_err1 = checkpoint['best_err1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True
    train_loader, val_loader, test_loader = get_dataloaders(args)
    print("*************************************")
    print(args.use_valid, len(train_loader), len(val_loader), len(test_loader))
    print("*************************************")

    # Evaluation-only path: 'anytime' validates directly, otherwise budgeted
    # (dynamic) evaluation is used.
    if args.evalmode is not None:
        m = torch.load(args.evaluate_from)
        model.load_state_dict(m['state_dict'])
        if args.evalmode == 'anytime':
            validate(test_loader, model, criterion)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    # set up logging: everything printed through log_print also lands in log.txt
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*msg):  # renamed from *args to avoid shadowing the global `args`
        print(*msg)
        print(*msg, file=f_log)

    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log)
    log_print('# of params:', str(sum([p.numel() for p in model.parameters()])))
    f_log.flush()

    scores = ['epoch\tlr\ttrain_loss\tval_loss\ttrain_err1'
              '\tval_err1\ttrain_err5\tval_err5']

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_err1, train_err5, lr = train(train_loader, model, criterion, optimizer, epoch)
        # evaluate on validation set
        # val_loss, val_err1, val_err5 = validate(val_loader, model, criterion)
        val_loss, val_err1, val_err5 = validate(test_loader, model, criterion)

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        scores.append(('{}\t{:.3f}' + '\t{:.4f}' * 6)
                      .format(epoch, lr, train_loss, val_loss,
                              train_err1, val_err1, train_err5, val_err5))

        is_best = val_err1 < best_err1
        if is_best:
            best_err1 = val_err1
            best_epoch = epoch
            # Fixed message typo: was 'Best var_err1'.
            print('Best val_err1 {}'.format(best_err1))
        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint({
            'epoch': epoch,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_err1': best_err1,
            'optimizer': optimizer.state_dict(),
        }, args, is_best, model_filename, scores)

    print('Best val_err1: {:.4f} at epoch {}'.format(best_err1, best_epoch))
def main(_run):
    """Run inference over val/test sets with an optional self-ensemble.

    Loads generator `G` from a checkpoint, computes PSNR/SSIM/LPIPS on the
    validation split, writes restored images for both splits to per-epoch
    output folders, and dumps the val metrics to metrics.txt.
    """
    args = tupperware(_run.config)
    args.finetune = False
    # Inference runs one image at a time; downstream ensembling relies on this.
    args.batch_size = 1
    device = args.device

    # Get data
    data = get_dataloaders(args)

    # Model
    G = get_model.model(args).to(device)

    # LPIPS Criterion
    lpips_criterion = PerceptualLoss(
        model="net-lin", net="alex", use_gpu=True, gpu_ids=[device]
    ).to(device)

    # Load Models
    G, _, global_step, start_epoch, loss = load_models(
        G, g_optimizer=None, args=args, tag=args.inference_mode
    )

    # Metric loggers
    val_metrics_dict = {"PSNR": 0.0, "SSIM": 0.0, "LPIPS_01": 0.0, "LPIPS_11": 0.0}
    avg_val_metrics = AvgLoss_with_dict(loss_dict=val_metrics_dict, args=args)

    logging.info(f"Loaded experiment {args.exp_name} trained for {start_epoch} epochs.")

    # Output folders, suffixed when self-ensembling so runs don't collide.
    val_path = args.output_dir / f"val_{args.inference_mode}_epoch_{start_epoch}"
    test_path = args.output_dir / f"test_{args.inference_mode}_epoch_{start_epoch}"
    if args.self_ensemble:
        val_path = val_path.parent / f"{val_path.name}_self_ensemble"
        test_path = test_path.parent / f"{test_path.name}_self_ensemble"
    val_path.mkdir(exist_ok=True, parents=True)
    test_path.mkdir(exist_ok=True, parents=True)

    with torch.no_grad():
        G.eval()

        # Run val for an epoch
        avg_val_metrics.reset()
        pbar = tqdm(range(len(data.val_loader) * args.batch_size), dynamic_ncols=True)

        for i, batch in enumerate(data.val_loader):
            metrics_dict = defaultdict(float)

            source, target, filename = batch
            source, target = (source.to(device), target.to(device))
            output = G(source)

            # Self-ensemble: average predictions over transformed copies of
            # the input (forward transform -> model -> inverse transform).
            if args.self_ensemble:
                output_ensembled = [output]
                for k in ensemble_ops.keys():
                    # Forward transform
                    source_t = ensemble_ops[k][0](source)
                    output_t = G(source_t)
                    # Inverse transform
                    output_t = ensemble_ops[k][1](output_t)
                    output_ensembled.append(output_t)
                output_ensembled = torch.cat(output_ensembled, dim=0)
                output = torch.mean(output_ensembled, dim=0, keepdim=True)

            # Quantize [-1, 1] outputs through 8-bit and back so metrics are
            # computed on what would actually be written to disk.
            output_255 = (output.mul(0.5).add(0.5) * 255.0).int()
            output_quant = (output_255.float() / 255.0).sub(0.5).mul(2)
            target_255 = (target.mul(0.5).add(0.5) * 255.0).int()
            target_quant = (target_255.float() / 255.0).sub(0.5).mul(2)

            # LPIPS in both [0, 1] and [-1, 1] ranges.
            metrics_dict["LPIPS_01"] += lpips_criterion(
                output_quant.mul(0.5).add(0.5), target_quant.mul(0.5).add(0.5)
            ).item()
            metrics_dict["LPIPS_11"] += lpips_criterion(
                output_quant, target_quant
            ).item()

            for e in range(args.batch_size):
                # Compute SSIM
                target_numpy = (
                    target[e].mul(0.5).add(0.5).permute(1, 2, 0).cpu().detach().numpy()
                )
                output_numpy = (
                    output[e].mul(0.5).add(0.5).permute(1, 2, 0).cpu().detach().numpy()
                )
                metrics_dict["PSNR"] += PSNR_numpy(target_numpy, output_numpy)
                metrics_dict["SSIM"] += ssim(
                    target_numpy,
                    output_numpy,
                    gaussian_weights=True,
                    use_sample_covariance=False,
                    multichannel=True,
                )

                # Dump to output folder (RGB -> BGR for OpenCV).
                # astype(int) replaces astype(np.int): np.int was an alias of
                # builtin int and was removed in NumPy 1.24.
                path_output = val_path / filename[e]
                cv2.imwrite(
                    str(path_output), (output_numpy[:, :, ::-1] * 255.0).astype(int)
                )

            metrics_dict["SSIM"] = metrics_dict["SSIM"] / args.batch_size
            metrics_dict["PSNR"] = metrics_dict["PSNR"] / args.batch_size

            avg_val_metrics += metrics_dict

            pbar.update(args.batch_size)
            pbar.set_description(
                f"Val Epoch : {start_epoch} Step: {global_step}| PSNR: {avg_val_metrics.loss_dict['PSNR']:.3f} | SSIM: {avg_val_metrics.loss_dict['SSIM']:.3f} | LPIPS 01: {avg_val_metrics.loss_dict['LPIPS_01']:.3f} | LPIPS 11: {avg_val_metrics.loss_dict['LPIPS_11']:.3f}"
            )

        # Persist the averaged val metrics next to the images.
        with open(val_path / "metrics.txt", "w") as f:
            L = [
                f"exp_name:{args.exp_name} trained for {start_epoch} epochs\n",
                "Val Metrics \n\n",
            ]
            L = L + [f"{k}:{v}\n" for k, v in avg_val_metrics.loss_dict.items()]
            f.writelines(L)

        # Test split has no targets: only write restored images.
        if data.test_loader:
            pbar = tqdm(
                range(len(data.test_loader) * args.batch_size), dynamic_ncols=True
            )
            for i, batch in enumerate(data.test_loader):
                source, filename = batch
                source = source.to(device)
                output = G(source)

                if args.self_ensemble:
                    output_ensembled = [output]
                    for k in ensemble_ops.keys():
                        # Forward transform
                        source_t = ensemble_ops[k][0](source)
                        output_t = G(source_t)
                        # Inverse transform
                        output_t = ensemble_ops[k][1](output_t)
                        output_ensembled.append(output_t)
                    output_ensembled = torch.cat(output_ensembled, dim=0)
                    output = torch.mean(output_ensembled, dim=0, keepdim=True)

                for e in range(args.batch_size):
                    output_numpy = (
                        output[e]
                        .mul(0.5)
                        .add(0.5)
                        .permute(1, 2, 0)
                        .cpu()
                        .detach()
                        .numpy()
                    )
                    # Dump to output folder (see astype note above).
                    path_output = test_path / filename[e]
                    cv2.imwrite(
                        str(path_output),
                        (output_numpy[:, :, ::-1] * 255.0).astype(int),
                    )

                pbar.update(args.batch_size)
                pbar.set_description(f"Test Epoch : {start_epoch} Step: {global_step}")
def main():
    """Train (or evaluate) a classifier, tracking top-1 precision.

    Reads the module-level `args`; saves args, per-epoch scores, and
    checkpoints under `args.save`. With `args.evalmode` set, only runs
    evaluation and returns.
    """
    global args
    best_prec1, best_epoch = 0.0, 0

    if not os.path.exists(args.save):
        os.makedirs(args.save)
    torch.save(args, os.path.join(args.save, 'args.pth'))

    model = getattr(models, args.arch)(args)
    print(model)

    # AlexNet/VGG parallelize only the feature extractor.
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint.
    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True
    train_loader, val_loader, test_loader = get_dataloaders(args)

    # Evaluation-only path.
    if args.evalmode is not None:
        state_dict = torch.load(args.evaluate_from)['state_dict']
        model.load_state_dict(state_dict)
        if args.evalmode == 'anytime':
            validate(val_loader, model, criterion)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    scores = [
        'epoch\tlr\ttrain_loss\tval_loss\ttrain_prec1'
        '\tval_prec1\ttrain_prec5\tval_prec5'
    ]

    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_prec1, train_prec5, lr = train(
            train_loader, model, criterion, optimizer, epoch)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion)

        # Rewrite the whole score file each epoch (see save_checkpoint).
        scores.append(
            ('{}\t{:.3f}' + '\t{:.4f}' * 6).format(epoch, lr, train_loss,
                                                   val_loss, train_prec1,
                                                   val_prec1, train_prec5,
                                                   val_prec5))

        is_best = val_prec1 > best_prec1
        if is_best:
            best_prec1 = val_prec1
            best_epoch = epoch
            # Fixed message typo: was 'Best var_prec1'.
            print('Best val_prec1 {}'.format(best_prec1))
        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, args, is_best, model_filename, scores)

    print('Best val_prec1: {:.4f} at epoch {}'.format(best_prec1, best_epoch))
def train(config):
    """Train the encoder/NNLM model with scheduled teacher forcing.

    Optionally resumes from a checkpoint (in which case the *saved* config
    replaces the passed one), trains for the configured number of epochs,
    periodically checkpoints, and after every epoch prints a greedy- and
    beam-search decode of one random test sample.
    """
    # Initialize the device which to run the model on
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("device:", device)

    # Get torch loaders for training and test data
    train_loader, test_loader = get_dataloaders(config.dataset,
                                                markov_order=config.order+1,
                                                batch_size=config.batch_size)
    vocab_size = train_loader.dataset.vocab_size

    # Load single test batch for evaluation
    test_X, test_Y, test_xl, test_yl = next(iter(test_loader))

    # If we want the continue training and the given filename exists, load all params
    # Otherwise just start training from scratch
    if config.continue_training:
        file_path = config.model_dir+config.continue_training
        if os.path.isfile(file_path):
            print('Loading checkpoint \'{}\''.format(file_path))
            checkpoint = torch.load(file_path)
            config = checkpoint['config'] # Use saved config
            config.start_epoch = checkpoint['epoch']
            print('Loaded checkpoint \'{}\' (epoch {})'.format(file_path, checkpoint['epoch']))
            config.continue_training = file_path # To make sure it is no empty string
        else:
            print('No checkpoint found at \'{}\''.format(file_path))
            sys.exit('Please check the filename.')

    teacher_force_ratio = config.teacher_force_ratio

    # Define model
    embedding = nn.Embedding(vocab_size, config.embedding_dim, padding_idx=config.pad_token)
    # Adaptive softmax projects to a fixed 1024-dim head instead of the vocab.
    if config.adasoft:
        output_size = 1024
    else:
        output_size = vocab_size

    # Define encoder
    if config.encoder_type == 'BOW':
        # Bag of Words
        encoder = BOWEncoder(vocab_size, config.embedding_dim, output_size)
    elif config.encoder_type == 'Conv':
        # Convolutions
        # 4 layers -> minimal X length = 2^4
        encoder = ConvEncoder(vocab_size, config.embedding_dim, 4, config.hidden_size, output_size)
    elif config.encoder_type == 'Attn':
        # Attention
        encoder = AttnEncoder(vocab_size, config.embedding_dim, config.order)

    # Define models and optimizer
    nnlm = NNLM(config.order, vocab_size, config.embedding_dim, [config.hidden_size]*3, output_size)
    model = FBModel(embedding, encoder, nnlm).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # If we want to continue training, load the existing model and optimizer
    # NOTE(review): `checkpoint` is only bound when config.continue_training
    # was truthy above, so this relies on that same flag still being set.
    if config.continue_training and checkpoint != None:
        print('Model and optimizer are copied from checkpoint.')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Define loss
    if config.adasoft:
        criterion = nn.AdaptiveLogSoftmaxWithLoss(1024, vocab_size, [100, 1000, 5000, 10000]).to(device)
    else:
        # EXPERIMENTAL: set UNK weight lower (maybe not needed with better vocab)
        loss_weights = torch.ones(vocab_size).to(device)
        if 'UNK' in train_loader.dataset.w2i:
            loss_weights[train_loader.dataset.w2i['UNK']] = 0.3
        criterion = nn.CrossEntropyLoss(weight=loss_weights, ignore_index=0)

    if config.start_epoch >= config.num_epochs:
        sys.exit('Already trained for specified amount of epochs. Consider increasing num_epochs.')
    else:
        print('Start training.')

    losses = []
    for epoch in range(config.start_epoch, config.num_epochs):
        # TRAIN
        num_teacherforce = [0, 0]  # counts of [non-forced, forced] batches
        num_batches = len(train_loader)
        starttime = time.time()
        for batch_idx, (X, Y, xlen, ylen) in enumerate(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            xlen = xlen.to(device)
            # Because we have history of size config.order, actual y_length is total y_length - order
            ylen = (ylen-config.order).to(device)

            # Make ngrams and targets
            y_c = torch.stack([Y[:, i:i+config.order] for i in range(0, Y.size(1)-config.order)], 1)
            y_t = Y[:, config.order:]

            # Train step
            model.train()
            optimizer.zero_grad()

            # No teacher forcing
            if np.random.random() > teacher_force_ratio:
                num_teacherforce[0] += 1
                y_c = y_c[:,0:1]
                out_length = y_t.size(1)
                out = model(X, y_c, xlen, ylen, output_length=out_length, teacher_forcing=False)
            else:
                num_teacherforce[1] += 1
                out = model(X, y_c, xlen, ylen, teacher_forcing=True)

            # Loss, optimization step
            out = out.reshape(-1, output_size)
            y_t = y_t.reshape(-1)
            # NOTE(review): out/y_t are already flattened above, so the inner
            # reshapes here are redundant no-ops.
            loss = criterion(out.reshape(-1, output_size), y_t.reshape(-1))
            # AdaptiveLogSoftmaxWithLoss returns a (output, loss) named tuple.
            if config.adasoft:
                loss = loss.loss
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

            if not batch_idx%20:
                if config.adasoft:
                    pred = criterion.predict(out)
                else:
                    pred = torch.argmax(out, -1)
                acc = accuracy(pred, y_t)
                print('[Epoch {}/{}], step {:04d}/{:04d} loss {:.4f} acc {:.4f} time {:.4f}'.format(epoch +1, config.num_epochs, batch_idx, num_batches, loss.item(), acc.item(), time.time() - starttime))
                starttime = time.time()

            # Save model every final step of each 10 epochs or last epoch
            #if (epoch + 1 % 10 == 0 or epoch + 1 == config.num_epochs) and batch_idx == num_batches - 1:
            #    torch.save(model, config.output_dir + '/test_model_epoch_'+str(epoch+1)+'.pt')
            if batch_idx % 500 == 0:
                state = create_state(config, model, optimizer, criterion, epoch, loss, accuracy)
                is_best_model = check_is_best(config.model_dir, config.encoder_type, config.embedding_dim, config.hidden_size, loss.item())
                save_model(state, is_best_model, config.model_dir, config.encoder_type, config.embedding_dim, config.hidden_size, loss.item())
                if has_converged(losses):
                    print('Model has converged.')
                    return
                    # NOTE(review): this `break` is unreachable after `return`;
                    # statement placement reconstructed from a collapsed
                    # source — confirm against the original file.
                    break

        # Decay teacherforcing
        teacher_force_ratio *= config.teacher_force_decay

        # EVAL
        #TODO: Seperate script or move to test.py
        model.eval()
        # Choose random sample
        test_idx = np.random.randint(config.batch_size)
        # Load random sample from test batch
        xlen = test_xl[[test_idx]].to(device)
        Y = test_Y[[test_idx],:].to(device)
        X = test_X[[test_idx],:xlen].to(device)
        ylen = torch.Tensor([1]).to(device)

        # Greedy Search
        greedy_sequence = greedy_search(model, X, Y, xlen, ylen, test_loader)
        # Beam Search
        all_sequences = beam_search(config, model, X, Y, xlen, ylen)

        # Target sequence
        y_t = Y[:, config.order:]
        correct = y_t.cpu()[-1].numpy()
        correct = [test_loader.dataset.i2w[i] for i in correct if i > 0]

        # print results
        print("greedy :", greedy_sequence)
        for counter, sequence in enumerate(all_sequences):
            print("number", counter+1, ":", [test_loader.dataset.i2w[i] for i in sequence[0].squeeze().cpu().numpy() if i > 1])
        print('correct :', correct)
        print()
def main():
    """Train (or evaluate) a classifier, also recording its FLOP count.

    Reads the module-level `args`; saves the FLOP count and checkpoints under
    `args.save`. With `args.evalmode` set ('anytime' or dynamic), only runs
    evaluation and returns. After training, validates the final model on the
    test split. (Comments translated from Chinese.)
    """
    global args
    best_prec1, best_epoch = 0.0, 0
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    # Input resolution depends on the dataset family.
    if args.data.startswith('cifar'):
        IM_SIZE = 32
    else:
        IM_SIZE = 224

    model = getattr(models, args.arch)(args)
    # Compute FLOPs and parameter count from the model structure.
    n_flops, n_params = measure_model(model, IM_SIZE, IM_SIZE)
    # Persist the model's FLOP count.
    torch.save(n_flops, os.path.join(args.save, 'flops.pth'))
    del (model)
    model = getattr(models, args.arch)(args)

    # AlexNet/VGG parallelize only the feature extractor.
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Resume from an interrupted run.
    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True
    train_loader, val_loader, test_loader = get_dataloaders(args)

    # Two eval modes: 'anytime' and dynamic.
    if args.evalmode is not None:
        # args.evaluate_from is the path of the stored model.
        state_dict = torch.load(args.evaluate_from)['state_dict']
        model.load_state_dict(state_dict)
        # Dispatch on the requested mode.
        if args.evalmode == 'anytime':
            validate(test_loader, model, criterion)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    # Training starts here.
    scores = [
        'epoch\tlr\ttrain_loss\tval_loss\ttrain_prec1'
        '\tval_prec1\ttrain_prec5\tval_prec5'
    ]

    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_prec1, train_prec5, lr = train(
            train_loader, model, criterion, optimizer, epoch)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion)

        scores.append(
            ('{}\t{:.3f}' + '\t{:.4f}' * 6).format(epoch, lr, train_loss,
                                                   val_loss, train_prec1,
                                                   val_prec1, train_prec5,
                                                   val_prec5))

        is_best = val_prec1 > best_prec1
        if is_best:
            best_prec1 = val_prec1
            best_epoch = epoch
            # Fixed message typo: was 'Best var_prec1'.
            print('Best val_prec1 {}'.format(best_prec1))
        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, args, is_best, model_filename, scores)

    print('Best val_prec1: {:.4f} at epoch {}'.format(best_prec1, best_epoch))

    ### Test the final model
    print('********** Final prediction results **********')
    validate(test_loader, model, criterion)
    return
def main():
    """Supervised pre-training routine with a memory bank and wandb logging.

    Builds the model/optimizer from parsed options, trains for opt.epochs,
    periodically checkpoints (locally and to wandb), and generates a final
    report. Per-epoch validation and meta-testing are currently disabled
    (calls commented out, metrics hard-coded to 0).
    """
    opt = parse_option()

    # Experiment tracking setup.
    wandb.init(project=opt.model_path.split("/")[-1], tags=opt.tags)
    wandb.config.update(opt)
    wandb.save('*.py')
    wandb.run.save()

    # no_sample is the dataset size used to dimension the memory bank below.
    train_loader, val_loader, meta_testloader, meta_valloader, n_cls, no_sample = get_dataloaders(
        opt)

    # model
    model = create_model(opt.model, n_cls, opt.dataset, n_trans=opt.trans,
                         embd_sz=opt.memfeature_size)
    wandb.watch(model)

    # optimizer
    if opt.adam:
        print("Adam")
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        print("SGD")
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # L2-normalized random init of the memory bank (one row per sample).
    MemBank = np.random.randn(no_sample, opt.memfeature_size)
    MemBank = torch.tensor(MemBank, dtype=torch.float).cuda()
    MemBankNorm = torch.norm(MemBank, dim=1, keepdim=True)
    MemBank = MemBank / (MemBankNorm + 1e-6)

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):
        # NOTE(review): scheduler.step() before the epoch's optimizer steps is
        # the legacy (pre-1.1) PyTorch ordering — kept as-is.
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss, MemBank = train(epoch, train_loader, model,
                                               criterion, optimizer, opt,
                                               MemBank)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # Validation currently disabled; metrics logged as 0.
        val_acc, val_acc_top5, val_loss = 0, 0, 0 #validate(val_loader, model, criterion, opt)

        #validate
        start = time.time()
        meta_val_acc, meta_val_std = 0, 0 #meta_test(model, meta_valloader)
        test_time = time.time() - start
        print(
            'Meta Val Acc : {:.4f}, Meta Val std: {:.4f}, Time: {:.1f}'.format(
                meta_val_acc, meta_val_std, test_time))

        #evaluate
        start = time.time()
        meta_test_acc, meta_test_std = 0, 0 #meta_test(model, meta_testloader)
        test_time = time.time() - start
        print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}, Time: {:.1f}'.
              format(meta_test_acc, meta_test_std, test_time))

        # regular saving
        if epoch % opt.save_freq == 0 or epoch == opt.epochs:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     'model_' + str(wandb.run.name) + '.pth')
            torch.save(state, save_file)

            #wandb saving
            torch.save(state, os.path.join(wandb.run.dir, "model.pth"))

        wandb.log({
            'epoch': epoch,
            'Train Acc': train_acc,
            'Train Loss': train_loss,
            'Val Acc': val_acc,
            'Val Loss': val_loss,
            'Meta Test Acc': meta_test_acc,
            'Meta Test std': meta_test_std,
            'Meta Val Acc': meta_val_acc,
            'Meta Val std': meta_val_std
        })

    #final report
    print("GENERATING FINAL REPORT")
    generate_final_report(model, opt, wandb)

    #remove output.txt log file
    output_log_file = os.path.join(wandb.run.dir, "output.log")
    if os.path.isfile(output_log_file):
        os.remove(output_log_file)
    else:
        ## Show an error ##
        print("Error: %s file not found" % output_log_file)
def main(args):
    """Train (or evaluate) a classifier, also recording its FLOP count.

    Saves the FLOP count and checkpoints under `args.save`. With
    `args.evalmode` set, only runs evaluation and returns. After training,
    validates the final model on the test split.
    (Section banners translated from Chinese.)
    """
    #######################################################################
    ## Load the model
    #######################################################################
    best_prec1, best_epoch = 0.0, 0

    model = getattr(models, args.arch)(args)
    # NOTE(review): IM_SIZE is not defined in this function — presumably a
    # module-level constant; confirm it exists at import time.
    n_flops, n_params = measure_model(model, IM_SIZE, IM_SIZE)
    torch.save(n_flops, os.path.join(args.save, 'flops.pth'))
    del (model)
    model = getattr(models, args.arch)(args)

    # AlexNet/VGG parallelize only the feature extractor.
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    #######################################################################
    ## Loss criterion
    #######################################################################
    criterion = nn.CrossEntropyLoss().cuda()

    #######################################################################
    ## Optimizer
    #######################################################################
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    #######################################################################
    ## Resume an interrupted training run
    #######################################################################
    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    #######################################################################
    ## Load the datasets
    #######################################################################
    train_loader, val_loader, test_loader = get_dataloaders(args)

    #######################################################################
    ## Choose the inference mode (e.g. imagenet -> dynamic)
    #######################################################################
    if args.evalmode is not None:
        state_dict = torch.load(args.evaluate_from)['state_dict']
        model.load_state_dict(state_dict)
        if args.evalmode == 'anytime':
            validate(test_loader, model, criterion)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    scores = [
        'epoch\tlr\ttrain_loss\tval_loss\ttrain_prec1'
        '\tval_prec1\ttrain_prec5\tval_prec5'
    ]

    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_prec1, train_prec5, lr = train(
            train_loader, model, criterion, optimizer, epoch)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion)

        scores.append(
            ('{}\t{:.3f}' + '\t{:.4f}' * 6).format(epoch, lr, train_loss,
                                                   val_loss, train_prec1,
                                                   val_prec1, train_prec5,
                                                   val_prec5))

        is_best = val_prec1 > best_prec1
        if is_best:
            best_prec1 = val_prec1
            best_epoch = epoch
            # Fixed message typo: was 'Best var_prec1'.
            print('Best val_prec1 {}'.format(best_prec1))
        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, args, is_best, model_filename, scores)

    print('Best val_prec1: {:.4f} at epoch {}'.format(best_prec1, best_epoch))

    ### Test the final model
    print('********** Final prediction results **********')
    validate(test_loader, model, criterion)
    return
# Script-level setup: logging, seeding, and dataloader construction.
# Relies on `args` and `params` being defined earlier in the file.
train_util.set_logger(os.path.join(args.model_dir, 'train.log'))

# Fixed seeds for reproducibility (CPU and, when available, CUDA).
torch.manual_seed(0)
if params.use_gpu:
    logging.info("GPU found")
    torch.cuda.manual_seed(0)
else:
    logging.info("GPU not found")

logging.info("Loading data")
# Data modality is chosen by config: rasterized glyph images or vector SVGs,
# each paired with semantic labels.
if params.data_type == "glyph_raster":
    data_types = ["image", "semantic"]
    print("Note : using images")
    dataloaders = get_dataloaders(params, ["train", "val"], data_types, character=params.character)
elif params.data_type == "glyph_vector":
    data_types = ["svg", "semantic"]
    dataloaders = get_dataloaders(params, ["train", "val"], data_types, character=params.character)
else:
    raise Exception("Invalid data type requested")
train_dataloader = dataloaders["train"]
val_dataloader = dataloaders["val"]
logging.info("- done")

# Placeholder; the model class is presumably assigned later in the file.
Model = None
def main(_run):
    """Distributed (optional DDP) training loop for generator `G`.

    Trains with GLoss, logs scalars/images to TensorBoard, checkpoints the
    latest weights every epoch and the best-val-loss weights every
    `val_test_epoch_interval` epochs, and writes test-set previews. A
    KeyboardInterrupt saves a final checkpoint before exiting.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; the dedent level of the "Display images at end of epoch" section
    was inferred — confirm against the original file.
    """
    args = tupperware(_run.config)

    # Dir init
    dir_init(args, is_local_rank_0=is_local_rank_0)

    # Ignore warnings
    if not is_local_rank_0:
        warnings.filterwarnings("ignore")

    # Multi GPU setup: under DDP, `rank` is the local CUDA device index;
    # otherwise fall back to the single configured device.
    if args.distdataparallel:
        rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        world_size = dist.get_world_size()
    else:
        rank = args.device
        world_size = 1

    # Get data
    data = get_dataloaders(args, is_local_rank_0=is_local_rank_0)

    # Model
    G = get_model.model(args).to(rank)

    # Optimisers
    g_optimizer, g_lr_scheduler = get_optimisers(G, args)

    # Load Models
    G, g_optimizer, global_step, start_epoch, loss = load_models(
        G, g_optimizer, args, is_local_rank_0=is_local_rank_0)

    if args.distdataparallel:
        # Wrap with Distributed Data Parallel
        G = torch.nn.parallel.DistributedDataParallel(G,
                                                      device_ids=[rank],
                                                      output_device=rank)

    # Log no of GPUs
    if is_local_rank_0:
        world_size = int(
            os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
        logging.info("Using {} GPUs".format(world_size))

    writer = SummaryWriter(log_dir=str(args.run_dir))
    writer.add_text("Args", pprint_args(args))

    # Pbars
    train_pbar = tqdm(range(len(data.train_loader) * args.batch_size),
                      dynamic_ncols=True)
    val_pbar = (tqdm(range(len(data.val_loader) * args.batch_size),
                     dynamic_ncols=True) if data.val_loader else None)
    test_pbar = (tqdm(range(len(data.test_loader) * args.batch_size),
                      dynamic_ncols=True) if data.test_loader else None)

    # Initialise losses
    g_loss = GLoss(args).to(rank)

    # Compatibility with checkpoints without global_step
    if not global_step:
        global_step = start_epoch * len(data.train_loader) * args.batch_size
        start_epoch = global_step // len(data.train_loader.dataset)

    # Exponential averaging of loss
    loss_dict = {
        "total_loss": 0.0,
        "image_loss": 0.0,
        "cobi_rgb_loss": 0.0,
        "train_PSNR": 0.0,
    }
    metric_dict = {"PSNR": 0.0, "total_loss": 0.0}
    avg_metrics = AvgLoss_with_dict(loss_dict=metric_dict, args=args)
    exp_loss = ExpLoss_with_dict(loss_dict=loss_dict, args=args)

    try:
        for epoch in range(start_epoch, args.num_epochs):
            # Train mode
            G.train()
            if is_local_rank_0:
                train_pbar.reset()
            if args.distdataparallel:
                data.train_loader.sampler.set_epoch(epoch)
            for i, batch in enumerate(data.train_loader):
                # allows for interrupted training
                if ((global_step + 1) %
                        (len(data.train_loader) * args.batch_size) == 0) and (epoch == start_epoch):
                    break
                loss_dict = defaultdict(float)
                source, target, filename = batch
                source, target = (source.to(rank), target.to(rank))

                # ------------------------------- #
                # Update Gen
                # ------------------------------- #
                G.zero_grad()
                output = G(source)
                g_loss(output=output, target=target)
                g_loss.total_loss.backward()
                g_optimizer.step()

                # Update lr schedulers (fractional epoch for per-step annealing)
                g_lr_scheduler.step(epoch + i / len(data.train_loader))

                # if is_local_rank_0:
                # Train PSNR
                loss_dict["train_PSNR"] += PSNR(output, target)

                # Accumulate all losses
                loss_dict["total_loss"] += g_loss.total_loss
                loss_dict["image_loss"] += g_loss.image_loss
                loss_dict["cobi_rgb_loss"] += g_loss.cobi_rgb_loss

                # Reduce across ranks before folding into the running average.
                exp_loss += reduce_loss_dict(loss_dict, world_size=world_size)
                global_step += args.batch_size * world_size

                if is_local_rank_0:
                    train_pbar.update(args.batch_size)
                    train_pbar.set_description(
                        f"Epoch: {epoch + 1} | Gen loss: {exp_loss.loss_dict['total_loss']:.3f} "
                    )

                # Write lr rates and metrics
                if is_local_rank_0 and i % (args.log_interval) == 0:
                    gen_lr = g_optimizer.param_groups[0]["lr"]
                    writer.add_scalar("lr/gen", gen_lr, global_step)
                    for metric in exp_loss.loss_dict:
                        writer.add_scalar(
                            f"Train_Metrics/{metric}",
                            exp_loss.loss_dict[metric],
                            global_step,
                        )

            # Display images at end of epoch (uses the last train batch).
            n = np.min([3, args.batch_size])
            for e in range(n):
                source_vis = source[e].mul(0.5).add(0.5)
                target_vis = target[e].mul(0.5).add(0.5)
                output_vis = output[e].mul(0.5).add(0.5)
                writer.add_image(
                    f"Source/Train_{e + 1}",
                    source_vis.cpu().detach(),
                    global_step,
                )
                writer.add_image(
                    f"Target/Train_{e + 1}",
                    target_vis.cpu().detach(),
                    global_step,
                )
                writer.add_image(
                    f"Output/Train_{e + 1}",
                    output_vis.cpu().detach(),
                    global_step,
                )
                writer.add_text(f"Filename/Train_{e + 1}", filename[e], global_step)

            if is_local_rank_0:
                # Save ckpt at end of epoch
                logging.info(
                    f"Saving weights at epoch {epoch + 1} global step {global_step}"
                )
                # Save weights
                save_weights(
                    epoch=epoch,
                    global_step=global_step,
                    G=G,
                    g_optimizer=g_optimizer,
                    loss=loss,
                    tag="latest",
                    args=args,
                )
                train_pbar.refresh()

            # Run val and test only occasionally
            if epoch % args.val_test_epoch_interval != 0:
                continue

            # Val and test
            with torch.no_grad():
                G.eval()
                if data.val_loader:
                    avg_metrics.reset()
                    if is_local_rank_0:
                        val_pbar.reset()
                    filename_static = []
                    for i, batch in enumerate(data.val_loader):
                        metrics_dict = defaultdict(float)
                        source, target, filename = batch
                        source, target = (source.to(rank), target.to(rank))
                        output = G(source)
                        g_loss(output=output, target=target)
                        # Total loss
                        metrics_dict["total_loss"] += g_loss.total_loss
                        # PSNR
                        metrics_dict["PSNR"] += PSNR(output, target)
                        avg_metrics += reduce_loss_dict(metrics_dict,
                                                        world_size=world_size)
                        # Save image: remember the batch containing the
                        # configured "static" preview image.
                        if args.static_val_image in filename:
                            filename_static = filename
                            source_static = source
                            target_static = target
                            output_static = output
                        if is_local_rank_0:
                            val_pbar.update(args.batch_size)
                            val_pbar.set_description(
                                f"Val Epoch : {epoch + 1} Step: {global_step}| PSNR: {avg_metrics.loss_dict['PSNR']:.3f}"
                            )

                    if is_local_rank_0:
                        for metric in avg_metrics.loss_dict:
                            writer.add_scalar(
                                f"Val_Metrics/{metric}",
                                avg_metrics.loss_dict[metric],
                                global_step,
                            )
                        n = np.min([3, args.batch_size])
                        for e in range(n):
                            source_vis = source[e].mul(0.5).add(0.5)
                            target_vis = target[e].mul(0.5).add(0.5)
                            output_vis = output[e].mul(0.5).add(0.5)
                            writer.add_image(
                                f"Source/Val_{e+1}",
                                source_vis.cpu().detach(),
                                global_step,
                            )
                            writer.add_image(
                                f"Target/Val_{e+1}",
                                target_vis.cpu().detach(),
                                global_step,
                            )
                            writer.add_image(
                                f"Output/Val_{e+1}",
                                output_vis.cpu().detach(),
                                global_step,
                            )
                            writer.add_text(f"Filename/Val_{e + 1}", filename[e], global_step)

                        # Log the static preview image, if its batch was seen.
                        for e, name in enumerate(filename_static):
                            if name == args.static_val_image:
                                source_vis = source_static[e].mul(0.5).add(0.5)
                                target_vis = target_static[e].mul(0.5).add(0.5)
                                output_vis = output_static[e].mul(0.5).add(0.5)
                                writer.add_image(
                                    f"Source/Val_Static",
                                    source_vis.cpu().detach(),
                                    global_step,
                                )
                                writer.add_image(
                                    f"Target/Val_Static",
                                    target_vis.cpu().detach(),
                                    global_step,
                                )
                                writer.add_image(
                                    f"Output/Val_Static",
                                    output_vis.cpu().detach(),
                                    global_step,
                                )
                                writer.add_text(
                                    f"Filename/Val_Static",
                                    filename_static[e],
                                    global_step,
                                )
                                break

                        logging.info(
                            f"Saving weights at END OF epoch {epoch + 1} global step {global_step}"
                        )

                        # Save weights: "best" tag only when val loss improved.
                        if avg_metrics.loss_dict["total_loss"] < loss:
                            is_min = True
                            loss = avg_metrics.loss_dict["total_loss"]
                        else:
                            is_min = False

                        # Save weights
                        save_weights(
                            epoch=epoch,
                            global_step=global_step,
                            G=G,
                            g_optimizer=g_optimizer,
                            loss=loss,
                            is_min=is_min,
                            args=args,
                            tag="best",
                        )
                        val_pbar.refresh()

                # Test
                if data.test_loader:
                    filename_static = []
                    if is_local_rank_0:
                        test_pbar.reset()
                    for i, batch in enumerate(data.test_loader):
                        source, filename = batch
                        source = source.to(rank)
                        output = G(source)
                        # Save image
                        if args.static_test_image in filename:
                            filename_static = filename
                            source_static = source
                            output_static = output
                        if is_local_rank_0:
                            test_pbar.update(args.batch_size)
                            test_pbar.set_description(
                                f"Test Epoch : {epoch + 1} Step: {global_step}"
                            )

                    if is_local_rank_0:
                        n = np.min([3, args.batch_size])
                        for e in range(n):
                            source_vis = source[e].mul(0.5).add(0.5)
                            output_vis = output[e].mul(0.5).add(0.5)
                            writer.add_image(
                                f"Source/Test_{e+1}",
                                source_vis.cpu().detach(),
                                global_step,
                            )
                            writer.add_image(
                                f"Output/Test_{e+1}",
                                output_vis.cpu().detach(),
                                global_step,
                            )
                            writer.add_text(f"Filename/Test_{e + 1}", filename[e], global_step)

                        for e, name in enumerate(filename_static):
                            if name == args.static_test_image:
                                source_vis = source_static[e]
                                output_vis = output_static[e]
                                writer.add_image(
                                    f"Source/Test_Static",
                                    source_vis.cpu().detach(),
                                    global_step,
                                )
                                writer.add_image(
                                    f"Output/Test_Static",
                                    output_vis.cpu().detach(),
                                    global_step,
                                )
                                writer.add_text(
                                    f"Filename/Test_Static",
                                    filename_static[e],
                                    global_step,
                                )
                                break
                        test_pbar.refresh()

    except KeyboardInterrupt:
        # Graceful early exit: flush progress bars and checkpoint.
        if is_local_rank_0:
            logging.info("-" * 89)
            logging.info("Exiting from training early. Saving models")
            for pbar in [train_pbar, val_pbar, test_pbar]:
                if pbar:
                    pbar.refresh()
            save_weights(
                epoch=epoch,
                global_step=global_step,
                G=G,
                g_optimizer=g_optimizer,
                loss=loss,
                is_min=True,
                args=args,
            )
def train(config):
    """Train an S2S encoder/attention-decoder model with scheduled teacher forcing.

    Builds the data loaders, optionally restores a checkpoint, then runs the
    training loop: progress is printed every 20 batches and appended to a
    per-run logfile, and a checkpoint is saved every 500 batches. Returns
    early when `has_converged` reports convergence.

    Fixes vs. original: `checkpoint` is initialized so the resume guard cannot
    hit an unbound name, `!= None` became `is not None`, and the logfile is
    managed by a `with` block so the early convergence `return` no longer
    leaks the handle. Dead commented-out EVAL code was removed.
    """
    log_fn = os.path.join(
        '../logs',
        'S2S_{}_{}.log'.format(config.num_layers, config.hidden_size))

    # Run on GPU when available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Torch loaders for training and test data.
    train_loader, test_loader = get_dataloaders(config.dataset,
                                                markov_order=2,
                                                batch_size=config.batch_size)
    vocab_size = train_loader.dataset.vocab_size

    # Load a single test batch up front for evaluation.
    test_X, test_Y, test_xl, test_yl = next(iter(test_loader))

    # If we want to continue training and the given filename exists, load all
    # params; otherwise just start training from scratch.
    checkpoint = None  # stays None when training from scratch
    if config.continue_training:
        file_path = config.model_dir + config.continue_training
        if os.path.isfile(file_path):
            print('Loading checkpoint \'{}\''.format(file_path))
            checkpoint = torch.load(file_path)
            config = checkpoint['config']  # Use saved config
            config.start_epoch = checkpoint['epoch']
            print('Loaded checkpoint \'{}\' (epoch {})'.format(
                file_path, checkpoint['epoch']))
            # To make sure it is no empty string
            config.continue_training = file_path
        else:
            print('No checkpoint found at \'{}\''.format(file_path))
            sys.exit('Please check the filename.')

    teacher_force_ratio = config.teacher_force_ratio

    encoder = S2SEncoder(vocab_size, config.embedding_dim, config.hidden_size,
                         config.num_layers, dropout=config.dropout)
    decoder = S2SAttnDecoder(vocab_size, config.embedding_dim,
                             config.hidden_size, config.num_layers,
                             dropout=config.dropout)
    model = S2S(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # Index 0 is treated as padding and excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # If we want to continue training, load the existing model and optimizer.
    if config.continue_training and checkpoint is not None:
        print('Model and optimizer are copied from checkpoint.')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])

    if config.start_epoch >= config.num_epochs:
        sys.exit(
            'Already trained for specified amount of epochs. Consider increasing num_epochs.'
        )
    else:
        print('Start training.')

    losses = []
    # Line-buffered (buffering=1) so log lines appear immediately; the with
    # block guarantees the file is closed even on the early convergence return.
    with open(log_fn, 'w', 1) as logfile:
        for epoch in range(config.num_epochs):
            # TRAIN
            num_teacherforce = [0, 0]  # counts: [without, with] teacher forcing
            num_batches = len(train_loader)
            for batch_idx, (X, Y, xlen, ylen) in enumerate(train_loader):
                X = X.to(device)
                Y = Y.to(device)
                Y_in = Y[:, :-1]  # decoder input: drop the final token
                Y_t = Y[:, 1:]    # target: drop the start token
                xlen = xlen.to(device)
                # ylen -= 1, outputs do not predict start token
                ylen = (ylen - 1).to(device)

                # Train step
                model.train()
                optimizer.zero_grad()

                if np.random.random() > teacher_force_ratio:
                    # No teacher forcing: feed only the start token and let the
                    # model free-run for the full target length.
                    num_teacherforce[0] += 1
                    Y_in = Y_in[:, 0:1]
                    ylen = torch.ones_like(ylen).to(device)
                    out_length = Y_t.size(1)
                    out = model(X, Y_in, xlen, ylen,
                                output_length=out_length,
                                teacher_forcing=False)
                else:
                    num_teacherforce[1] += 1
                    out = model(X, Y_in, xlen, ylen, teacher_forcing=True)

                # Loss, optimization step
                loss = criterion(out.reshape(-1, vocab_size), Y_t.reshape(-1))
                loss.backward()
                losses.append(loss.item())
                optimizer.step()

                if not batch_idx % 20:
                    pred = torch.argmax(out, -1)
                    acc = accuracy(pred, Y_t)
                    print(
                        '[Epoch {}/{}], step {:04d}/{:04d} loss {:.4f} acc {:.4f}'.
                        format(epoch + 1, config.num_epochs, batch_idx,
                               num_batches, loss.item(), acc.item()))
                    print('{} {} {:.4f} {:.4f}'.format(epoch + 1, batch_idx,
                                                       loss.item(),
                                                       acc.item()),
                          file=logfile)

                if batch_idx % 500 == 0:
                    # NOTE(review): this passes the `accuracy` *function*, not
                    # the most recent `acc` value, into the saved state —
                    # confirm create_state() really expects the callable.
                    state = create_state(config, model, optimizer, criterion,
                                         epoch, loss, accuracy)
                    is_best_model = check_is_best(config.model_dir,
                                                  'S2SEncoder',
                                                  config.embedding_dim,
                                                  config.hidden_size,
                                                  loss.item())
                    save_model(state, is_best_model, config.model_dir,
                               'S2SEncoder', config.embedding_dim,
                               config.hidden_size, loss.item())
                    if has_converged(losses):
                        print('Model converged')
                        return

            # Decay teacher forcing after every epoch.
            teacher_force_ratio *= config.teacher_force_decay
def main():
    """Entry point: build the model, then train/evaluate per the global `args`.

    Freezes the backbone (`model.module.net`) and optimizes only the
    classifier and ISC modules with SGD. Supports resuming from a checkpoint
    and two evaluation-only modes ('anytime' validation or dynamic
    evaluation). During training, per-epoch scores are accumulated and
    rewritten in full on every checkpoint save.

    Fixes vs. original: the nested `log_print(*args)` parameter no longer
    shadows the module-level `args` it is called with; the 'var_acc1' typo in
    the best-accuracy message is corrected; the unused local IMAGE_SIZE was
    removed.
    """
    global args
    best_acc1, best_epoch = 0., 0

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    model = getattr(models, args.arch)(args)
    # Persist the run configuration once per save directory.
    if not os.path.exists(os.path.join(args.save, 'args.pth')):
        torch.save(args, os.path.join(args.save, 'args.pth'))

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        # AlexNet/VGG: only the convolutional features are data-parallel.
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer;
    # the backbone net is frozen — only classifier and ISC modules train.
    for param in model.module.net.parameters():
        param.requires_grad = False
    optimizer = torch.optim.SGD(
        [{
            'params': model.module.classifier.parameters()
        }, {
            'params': model.module.isc_modules.parameters()
        }],
        args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay)
    kd_loss = KDLoss(args)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    train_loader, val_loader, test_loader = get_dataloaders(args)
    print("*************************************")
    print(args.use_valid, len(train_loader), len(val_loader),
          len(test_loader))
    print("*************************************")

    # Evaluation-only modes: load weights and exit without training.
    if args.evalmode is not None:
        m = torch.load(args.evaluate_from)
        model.load_state_dict(m['state_dict'])
        if args.evalmode == 'anytime':
            validate(test_loader, model, kd_loss)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    # set up logging
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*msg):
        # Echo to stdout and the logfile. Parameter renamed from *args so it
        # no longer shadows the module-level `args` namespace.
        print(*msg)
        print(*msg, file=f_log)

    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log)
    log_print('# of params:',
              str(sum(p.numel() for p in model.parameters())))
    f_log.flush()

    scores = [
        'epoch\tlr\ttrain_loss\tval_loss\ttrain_acc1'
        '\tval_acc1\ttrain_acc5\tval_acc5'
    ]

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_acc1, train_acc5, lr = train(train_loader, model,
                                                       kd_loss, optimizer,
                                                       epoch)

        # NOTE(review): despite the surrounding comments, evaluation runs on
        # test_loader, not val_loader — confirm this is intended.
        val_loss, val_acc1, val_acc5 = validate(test_loader, model, kd_loss)

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        scores.append(
            ('{}\t{:.3f}' + '\t{:.4f}' * 6).format(epoch, lr, train_loss,
                                                   val_loss, train_acc1,
                                                   val_acc1, train_acc5,
                                                   val_acc5))

        is_best = val_acc1 > best_acc1
        if is_best:
            best_acc1 = val_acc1
            best_epoch = epoch
            print('Best val_acc1 {}'.format(best_acc1))

        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, args, is_best, model_filename, scores)

    print('Best val_acc1: {:.4f} at epoch {}'.format(best_acc1, best_epoch))