def train(opts):
    device = torch.device("cuda" if use_cuda else "cpu")

    if opts.arch == 'small':
        channels = [32, 32, 32, 10]
    elif opts.arch == 'large':
        channels = [256, 128, 64, 32]
    else:
        raise NotImplementedError('Unknown model architecture')

    if opts.mode == 'train_mnist':
        train_loader, valid_loader = get_mnist_loaders(
            opts.data_dir, opts.bsize, opts.nworkers, opts.sigma, opts.alpha)
        model = CAE(1, 10, 28, opts.n_prototypes, opts.decoder_arch, channels)
    elif opts.mode == 'train_cifar':
        train_loader, valid_loader = get_cifar_loaders(
            opts.data_dir, opts.bsize, opts.nworkers, opts.sigma, opts.alpha)
        model = CAE(3, 10, 32, opts.n_prototypes, opts.decoder_arch, channels)
    elif opts.mode == 'train_fmnist':
        train_loader, valid_loader = get_fmnist_loaders(
            opts.data_dir, opts.bsize, opts.nworkers, opts.sigma, opts.alpha)
        model = CAE(1, 10, 28, opts.n_prototypes, opts.decoder_arch, channels)
    else:
        raise NotImplementedError('Unknown train mode')

    if opts.optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=opts.lr, weight_decay=opts.wd)
    else:
        raise NotImplementedError("Unknown optim type")

    criterion = nn.CrossEntropyLoss()

    start_n_iter = 0
    # for choosing the best model
    best_val_acc = 0.0

    model_path = os.path.join(opts.save_path, 'model_latest.net')
    if opts.resume and os.path.exists(model_path):
        # restore training from the saved state
        print('====> Resuming training from previous checkpoint')
        save_state = torch.load(model_path, map_location='cpu')
        model.load_state_dict(save_state['state_dict'])
        start_n_iter = save_state['n_iter']
        best_val_acc = save_state['best_val_acc']
        opts = save_state['opts']
        opts.start_epoch = save_state['epoch'] + 1

    model = model.to(device)

    # for logging
    logger = TensorboardXLogger(opts.start_epoch, opts.log_iter, opts.log_dir)
    logger.set(['acc', 'loss', 'loss_class', 'loss_ae', 'loss_r1', 'loss_r2'])
    logger.n_iter = start_n_iter

    for epoch in range(opts.start_epoch, opts.epochs):
        model.train()
        logger.step()

        # a random handful of validation images for visualizing reconstructions
        valid_sample = torch.stack([
            valid_loader.dataset[i][0]
            for i in random.sample(range(len(valid_loader.dataset)), 10)
        ]).to(device)

        for batch_idx, (data, target) in enumerate(train_loader):
            acc, loss, class_error, ae_error, error_1, error_2 = run_iter(
                opts, data, target, model, criterion, device)

            # optimizer step
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), opts.max_norm)
            optimizer.step()

            logger.update(acc, loss, class_error, ae_error, error_1, error_2)

        val_loss, val_acc, val_class_error, val_ae_error, val_error_1, val_error_2, time_taken = evaluate(
            opts, model, valid_loader, criterion, device)
        # log the validation losses
        logger.log_valid(time_taken, val_acc, val_loss, val_class_error,
                         val_ae_error, val_error_1, val_error_2)
        print('')

        # save the best model to disk
        if val_acc >= best_val_acc:
            best_val_acc = val_acc
            save_state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'n_iter': logger.n_iter,
                'opts': opts,
                'val_acc': val_acc,
                'best_val_acc': best_val_acc
            }
            model_path = os.path.join(opts.save_path, 'model_best.net')
            torch.save(save_state, model_path)
            prototypes = model.save_prototypes(opts.save_path, 'prototypes_best.png')
            x = torchvision.utils.make_grid(prototypes, nrow=10, pad_value=1.0)
            logger.writer.add_image('Prototypes (best)', x, epoch)

        # always save the latest model, prototypes, and reconstruction samples
        save_state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'n_iter': logger.n_iter,
            'opts': opts,
            'val_acc': val_acc,
            'best_val_acc': best_val_acc
        }
        model_path = os.path.join(opts.save_path, 'model_latest.net')
        torch.save(save_state, model_path)
        prototypes = model.save_prototypes(opts.save_path, 'prototypes_latest.png')
        x = torchvision.utils.make_grid(prototypes, nrow=10, pad_value=1.0)
        logger.writer.add_image('Prototypes (latest)', x, epoch)
        ae_samples = model.get_decoded_pairs_grid(valid_sample)
        logger.writer.add_image('AE_samples_latest', ae_samples, epoch)
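# NOTE: `run_iter` is called above but not defined in this file. The sketch
# below is an assumption about its contract, inferred from the unpacking
# `acc, loss, class_error, ae_error, error_1, error_2` and from the logged
# loss names ('loss_class', 'loss_ae', 'loss_r1', 'loss_r2'). The CAE forward
# signature and the `opts.lambda_*` weights are hypothetical, and it assumes
# `torch.nn.functional` is imported as `F`.
def run_iter(opts, data, target, model, criterion, device):
    data, target = data.to(device), target.to(device)
    # assumed forward contract: class logits, reconstruction, and the two
    # prototype-distance regularizers
    logits, recon, r1, r2 = model(data)
    loss_class = criterion(logits, target)
    loss_ae = F.mse_loss(recon, data)
    loss = (loss_class + opts.lambda_ae * loss_ae
            + opts.lambda_1 * r1 + opts.lambda_2 * r2)
    acc = (logits.argmax(dim=1) == target).float().mean().item()
    return acc, loss, loss_class.item(), loss_ae.item(), r1.item(), r2.item()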
        factor=0.1, patience=6)

    # TODO: add periodic checkpointing during training, and save accuracy/loss logs
    train_loader, val_loader = loader_helper.get_loaders(
        batch_size=batch_size, merge_idda_classes=merge_idda_classes)

    since = time.time()
    val_acc_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    logger = TensorboardXLogger('tensorboard')

    for epoch in range(starting_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print('\n-- Training epoch %d' % int(epoch + 1))
                model.train()  # set model to training mode
            else:
                print('-- Validating epoch %d' % int(epoch + 1))
                model.eval()  # set model to evaluation mode
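# NOTE: the dangling `factor=0.1, patience=6)` above is the tail of a call
# whose opening line was lost. Those keyword arguments match PyTorch's
# ReduceLROnPlateau, so the scheduler was plausibly constructed like this
# (a guess, not confirmed by the surrounding code):
#
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#         optimizer, factor=0.1, patience=6)
#
# and then stepped on the validation metric once per epoch, e.g.
# `scheduler.step(epoch_val_acc)` after the 'val' phase.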
def train_sentiment(opts):
    device = torch.device("cuda" if use_cuda else "cpu")

    glove_loader = GloveLoader(os.path.join(opts.data_dir, 'glove', opts.glove_emb_file))
    train_loader = DataLoader(
        RottenTomatoesReviewDataset(opts.data_dir, 'train', glove_loader, opts.maxlen),
        batch_size=opts.bsize, shuffle=True, num_workers=opts.nworkers)
    valid_loader = DataLoader(
        RottenTomatoesReviewDataset(opts.data_dir, 'val', glove_loader, opts.maxlen),
        batch_size=opts.bsize, shuffle=False, num_workers=opts.nworkers)
    model = Classifier(opts.hidden_size, opts.dropout_p, glove_loader, opts.enc_arch)

    if opts.optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=opts.lr, weight_decay=opts.wd)
    else:
        raise NotImplementedError("Unknown optim type")

    criterion = nn.CrossEntropyLoss()

    start_n_iter = 0
    # for choosing the best model
    best_val_acc = 0.0

    model_path = os.path.join(opts.save_path, 'model_latest.net')
    if opts.resume and os.path.exists(model_path):
        # restore training from the saved state
        print('====> Resuming training from previous checkpoint')
        save_state = torch.load(model_path, map_location='cpu')
        model.load_state_dict(save_state['state_dict'])
        start_n_iter = save_state['n_iter']
        best_val_acc = save_state['best_val_acc']
        opts = save_state['opts']
        opts.start_epoch = save_state['epoch'] + 1

    model = model.to(device)

    # for logging
    logger = TensorboardXLogger(opts.start_epoch, opts.log_iter, opts.log_dir)
    logger.set(['acc', 'loss'])
    logger.n_iter = start_n_iter

    for epoch in range(opts.start_epoch, opts.epochs):
        model.train()
        logger.step()

        for batch_idx, data in enumerate(train_loader):
            acc, loss = run_iter(opts, data, model, criterion, device)

            # optimizer step
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), opts.max_norm)
            optimizer.step()

            logger.update(acc, loss)

        val_loss, val_acc, time_taken = evaluate(opts, model, valid_loader, criterion, device)
        # log the validation losses
        logger.log_valid(time_taken, val_acc, val_loss)
        print('')

        # save the best model to disk
        if val_acc >= best_val_acc:
            best_val_acc = val_acc
            save_state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'n_iter': logger.n_iter,
                'opts': opts,
                'val_acc': val_acc,
                'best_val_acc': best_val_acc
            }
            model_path = os.path.join(opts.save_path, 'model_best.net')
            torch.save(save_state, model_path)

        # always save the latest model
        save_state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'n_iter': logger.n_iter,
            'opts': opts,
            'val_acc': val_acc,
            'best_val_acc': best_val_acc
        }
        model_path = os.path.join(opts.save_path, 'model_latest.net')
        torch.save(save_state, model_path)
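# NOTE: `evaluate` is called above but not defined here. A minimal sketch
# matching the `val_loss, val_acc, time_taken` unpacking above; it reuses the
# (acc, loss) contract of `run_iter` and assumes `time` and `numpy as np` are
# imported. All of this is an assumption, not the repo's actual code.
def evaluate(opts, model, loader, criterion, device):
    model.eval()
    start = time.time()
    accs, losses = [], []
    with torch.no_grad():
        for data in loader:
            acc, loss = run_iter(opts, data, model, criterion, device)
            accs.append(acc)
            losses.append(loss.item())
    model.train()
    return np.mean(losses), np.mean(accs), time.time() - start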
        num_workers=8)
    target_loader = DataLoader(target, batch_size=batch_size, shuffle=True, num_workers=8)
    source_loader = DataLoader(source, batch_size=batch_size, shuffle=True, num_workers=8)
    return target_loader, source_loader, test_loader, net, EPOCHS, init_lr


if __name__ == '__main__':
    # create the Logger
    log = Log(f'logs/{setting}', method_name)

    # make the dataset
    target_loader, source_loader, test_loader, net, EPOCHS, init_lr = get_setting()
    if args.epochs is not None:
        EPOCHS = args.epochs

    if args.so:
        loader_length = 'source'
        dl_len = len(source_loader)
        total_steps = EPOCHS * dl_len
        print(f"Num of Batches ({loader_length}) is {dl_len}")
        # the original line was truncated after `init_lr`; the remaining
        # arguments are assumed to mirror the NODA(...) call in the sibling script
        method = SourceOnly(net, init_lr, total_steps, device,
                            num_classes=n_classes)
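# NOTE: the script continues past this point. A sketch of the source-only
# training loop that would typically follow; `method.train_step` and
# `log.log` are hypothetical APIs, since the real method/logger interfaces
# are not shown in this snippet (`cycle` is from itertools):
#
#     for step, (data, labels) in zip(range(total_steps), cycle(source_loader)):
#         stats = method.train_step(data.to(device), labels.to(device))
#         log.log(stats)
#     val_loss, val_acc = valid(method, valid_loader=test_loader)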
def train_rank(opts):
    device = torch.device("cuda" if use_cuda else "cpu")

    if opts.constraint in ('DemoParity', 'DispTreat', 'DispImpact'):
        func = lambda x: rank_lp_func(opts.constraint, x)
    else:
        func = rank_collate_func

    glove_loader = GloveLoader(os.path.join(opts.data_dir, 'glove', opts.glove_emb_file))
    train_dataset = RottenTomatoesRankingDataset(opts.data_dir, 'train', glove_loader, opts.maxlen, opts.div_by)
    train_loader = DataLoader(train_dataset, batch_size=opts.bsize, sampler=RankSampler(train_dataset),
                              collate_fn=func, num_workers=opts.nworkers)
    valid_dataset = RottenTomatoesRankingDataset(opts.data_dir, 'val', glove_loader, opts.maxlen, opts.div_by)
    valid_loader = DataLoader(valid_dataset, batch_size=opts.bsize, sampler=RankSampler(valid_dataset),
                              collate_fn=func, num_workers=opts.nworkers)
    model = RankNet(opts.hidden_size, opts.dropout_p, glove_loader, opts.enc_arch,
                    num_genres=len(train_dataset.genres), pretrained_base=opts.pretrained_base,
                    loss_type=opts.loss_type)

    if opts.optim == 'adam':
        # the pretrained encoder gets a 10x smaller learning rate than the rank layer
        optimizer = torch.optim.Adam([
            {'params': model.encoder.parameters(), 'lr': opts.lr / 10.0},
            {'params': model.rank_layer.parameters()}],
            lr=opts.lr, weight_decay=opts.wd)
    else:
        raise NotImplementedError("Unknown optim type")

    start_n_iter = 0
    # for choosing the best model
    best_val_ndcg = 0.0

    model_path = os.path.join(opts.save_path, 'model_latest.net')
    if opts.resume and os.path.exists(model_path):
        # restore training from the saved state
        print('====> Resuming training from previous checkpoint')
        save_state = torch.load(model_path, map_location='cpu')
        model.load_state_dict(save_state['state_dict'])
        start_n_iter = save_state['n_iter']
        best_val_ndcg = save_state['best_val_ndcg']
        opts = save_state['opts']
        opts.start_epoch = save_state['epoch'] + 1

    model = model.to(device)

    # for logging
    logger = TensorboardXLogger(opts.start_epoch, opts.log_iter, opts.log_dir)
    logger.set(['NDCG', opts.metric, 'loss'])
    logger.n_iter = start_n_iter

    for epoch in range(opts.start_epoch, opts.epochs):
        model.train()
        logger.step()

        for batch_idx, data in enumerate(train_loader):
            ndcg, fscore, loss = run_iter(opts, data, model)

            # optimizer step
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), opts.max_norm)
            optimizer.step()

            logger.update(ndcg, fscore, loss)

        val_loss, val_ndcg, val_fscore, time_taken = evaluate(opts, model, valid_loader)
        # log the validation losses
        logger.log_valid(time_taken, val_ndcg, val_fscore, val_loss)
        print('')

        # save the best model to disk
        if val_ndcg >= best_val_ndcg:
            best_val_ndcg = val_ndcg
            save_state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'n_iter': logger.n_iter,
                'opts': opts,
                'val_ndcg': val_ndcg,
                'best_val_ndcg': best_val_ndcg
            }
            model_path = os.path.join(opts.save_path, 'model_best.net')
            torch.save(save_state, model_path)

        # always save the latest model
        save_state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'n_iter': logger.n_iter,
            'opts': opts,
            'val_ndcg': val_ndcg,
            'best_val_ndcg': best_val_ndcg
        }
        model_path = os.path.join(opts.save_path, 'model_latest.net')
        torch.save(save_state, model_path)
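# NOTE: the NDCG metric logged above is computed elsewhere in the repo. For
# reference, a standard NDCG@k over numpy arrays; the cutoff k and the
# linear-gain convention below are assumptions, not necessarily what this
# repo uses:
def ndcg_at_k(scores, relevance, k=10):
    # rank items by predicted score, then discount gains by log2(rank + 1)
    order = np.argsort(scores)[::-1][:k]
    dcg = np.sum(relevance[order] / np.log2(np.arange(2, len(order) + 2)))
    # ideal DCG: the same discounting applied to a perfect ranking
    ideal = np.sort(relevance)[::-1][:k]
    idcg = np.sum(ideal / np.log2(np.arange(2, len(ideal) + 2)))
    return dcg / idcg if idcg > 0 else 0.0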
def train(opts):
    device = torch.device("cuda" if use_cuda else "cpu")

    glove_loader = GloveLoader(
        os.path.join(opts.data_dir, opts.corpus, 'glove/', opts.glove_emb_file))

    if opts.corpus in ['msvd', 'msvd_vgg']:
        VDDataset = MSVideoDescriptionDataset
    elif opts.corpus == 'msrvtt':
        VDDataset = MSRVideoToTextDataset
    else:
        raise NotImplementedError('Unknown dataset')

    train_loader = DataLoader(
        VDDataset(opts.data_dir, opts.corpus, 'train', glove_loader, opts.num_frames, opts.max_len),
        batch_size=opts.bsize, shuffle=True, num_workers=opts.nworkers, collate_fn=collate_fn)
    valid_loader = DataLoader(
        VDDataset(opts.data_dir, opts.corpus, 'val', glove_loader, opts.num_frames, opts.max_len),
        batch_size=opts.bsize, shuffle=False, num_workers=opts.nworkers, collate_fn=collate_fn)

    if opts.arch == 's2vt':
        model = S2VTModel(glove_loader, opts.dropout_p, opts.hidden_size,
                          opts.vid_feat_size, opts.max_len)
    elif opts.arch == 's2vt-att':
        model = S2VTAttModel(glove_loader, opts.dropout_p, opts.hidden_size,
                             opts.vid_feat_size, opts.max_len)
    elif opts.arch == 'transformer':
        # the last two arguments are the number of layers and the number of attention heads
        model = Transformer(glove_loader, opts.dropout_p, opts.hidden_size,
                            opts.vid_feat_size, opts.max_len, 6, 8)
    else:
        raise NotImplementedError('Unknown model architecture')

    if opts.optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=opts.lr, weight_decay=opts.wd)
    else:
        raise NotImplementedError("Unknown optim type")

    if opts.schedule_sample:
        sample_probs = inverse_sigmoid(opts.epochs)
    else:
        sample_probs = np.ones(opts.epochs)

    criterion = nn.CrossEntropyLoss(reduction='none')

    # only METEOR is kept; the misspelled 'EmbeddingAverageCosineSimilairty'
    # is the actual key used by the nlg-eval package
    metrics_to_omit = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4',
                       'ROUGE_L', 'CIDEr', 'SkipThoughtCS',
                       'EmbeddingAverageCosineSimilairty', 'VectorExtremaCosineSimilarity',
                       'GreedyMatchingScore']
    nlg_eval = NLGEval(metrics_to_omit=metrics_to_omit)

    start_n_iter = 0
    # for choosing the best model
    best_val_meteor_score = 0.0

    model_path = os.path.join(opts.save_path, 'model_latest.net')
    if opts.resume and os.path.exists(model_path):
        # restore training from the saved state
        print('====> Resuming training from previous checkpoint')
        save_state = torch.load(model_path, map_location='cpu')
        model.load_state_dict(save_state['state_dict'])
        start_n_iter = save_state['n_iter']
        best_val_meteor_score = save_state['best_val_meteor_score']
        opts = save_state['opts']
        opts.start_epoch = save_state['epoch'] + 1

    model = model.to(device)

    # for logging
    logger = TensorboardXLogger(opts.start_epoch, opts.log_iter, opts.log_dir)
    logger.set(['acc', 'loss'])
    logger.n_iter = start_n_iter

    for epoch in range(opts.start_epoch, opts.epochs):
        model.train()
        model.teacher_force_prob = sample_probs[epoch]
        logger.step()
        sampler = StreamSampler(opts.n_sample_sent)

        for batch_idx, data in enumerate(train_loader):
            acc, loss, pred = run_iter(opts, data, model, criterion, return_pred=True)

            # collect a random sample of (hypothesis, reference, video key) triples
            hyps = glove_loader.get_sents_from_indexes(pred.data.cpu().numpy())
            for hyp, ref, vk in zip(hyps, data['refs'], data['vid_key']):
                ref = random.choice(ref)
                sampler.add((hyp, ref, vk))

            # optimizer step
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), opts.max_norm)
            optimizer.step()

            logger.update(acc, loss)

        meteor_eval_func = lambda pred, refs: calc_meteor_score(pred, refs, nlg_eval)
        val_loss, val_acc, val_meteor_score, sample_sent, time_taken = evaluate(
            opts, model, valid_loader, criterion, glove_loader, meteor_eval_func)

        print('')
        print('********************************** TRAIN **********************************')
        train_sample_sent = sampler.get()
        print_sample_sents(train_sample_sent)
        print('***************************************************************************')
        print('')
        print('*********************************** VAL ***********************************')

        # log the validation losses
        logger.log_valid(time_taken, val_acc, val_loss)
        logger.writer.add_scalar('val/METEOR', val_meteor_score, logger.n_iter)
        print('Validation METEOR score: {:.5f}'.format(val_meteor_score))
        print_sample_sents(sample_sent)
        print('')

        # save the best model to disk
        if val_meteor_score >= best_val_meteor_score:
            best_val_meteor_score = val_meteor_score
            save_state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'n_iter': logger.n_iter,
                'opts': opts,
                'val_meteor_score': val_meteor_score,
                'best_val_meteor_score': best_val_meteor_score
            }
            model_path = os.path.join(opts.save_path, 'model_best.net')
            torch.save(save_state, model_path)

        # always save the latest model
        save_state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'n_iter': logger.n_iter,
            'opts': opts,
            'val_meteor_score': val_meteor_score,
            'best_val_meteor_score': best_val_meteor_score
        }
        model_path = os.path.join(opts.save_path, 'model_latest.net')
        torch.save(save_state, model_path)
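# NOTE: two helpers used above are not defined in this file. First,
# `inverse_sigmoid(opts.epochs)`: for scheduled sampling this is usually the
# inverse-sigmoid decay of Bengio et al. (2015), where the teacher-forcing
# probability starts near 1 and decays toward 0; the rate constant k=10.0
# below is an assumption:
def inverse_sigmoid(num_epochs, k=10.0):
    epochs = np.arange(num_epochs)
    return k / (k + np.exp(epochs / k))

# Second, `StreamSampler(n)` keeps a uniform random sample of n items from a
# stream of training sentences; standard reservoir sampling (Algorithm R)
# fits the add()/get() usage above, but the internals here are an assumption:
class StreamSampler:
    def __init__(self, n):
        self.n = n
        self.seen = 0
        self.reservoir = []

    def add(self, item):
        self.seen += 1
        if len(self.reservoir) < self.n:
            self.reservoir.append(item)
        else:
            # replace a random slot with probability n / seen
            j = random.randrange(self.seen)
            if j < self.n:
                self.reservoir[j] = item

    def get(self):
        return self.reservoir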
        num_workers=8)
    target_loader = DataLoader(target, batch_size=batch_size, shuffle=True, num_workers=8)
    source_loader = DataLoader(source, batch_size=batch_size, shuffle=True, num_workers=8)
    return target_loader, source_loader, test_loader


if __name__ == '__main__':
    # create the Logger
    log = Log(f'logs/{setting}', method_name)

    # make the dataset
    target_loader, source_loader, test_loader = get_setting()
    if args.epochs is not None:
        EPOCHS = args.epochs

    loader_length = 'min'
    dl_len = min(len(source_loader), len(target_loader))
    print(f"Num of Batches ({loader_length}) is {dl_len}")
    total_steps = EPOCHS * dl_len

    method = NODA(net, init_lr, total_steps, device, num_classes=n_classes)

    print("Running a validation pass before training to check everything is OK...")
    val_loss, val_acc = valid(method, valid_loader=test_loader)
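# NOTE: `valid` is called above but not shown. A minimal sketch matching the
# `val_loss, val_acc` unpacking; treating `method.net` as the underlying
# classifier and using the global `device` are assumptions about interfaces
# this snippet does not reveal:
def valid(method, valid_loader):
    method.net.eval()
    criterion = nn.CrossEntropyLoss()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for data, labels in valid_loader:
            data, labels = data.to(device), labels.to(device)
            logits = method.net(data)
            total_loss += criterion(logits, labels).item() * labels.size(0)
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    method.net.train()
    return total_loss / total, correct / total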