hc_idxs, uc_idxs = [], []
total_uc_num = 0  # total number of uncertain (uc) samples used by ASM
use_asm = True
for epoch in range(1, args.epochs + 1):
    # adjust the classifier optimizer's lr (= meta_lr)
    adjust_lr(args.lr, optimizer_a, epoch, writer)

    # train on (imb_train_data)
    writer.add_scalar('ASM/batches', len(label_loader), global_step=epoch)
    train_base(label_loader, model, criterion, optimizer_a, epoch,
               args.print_freq, writer)

    # evaluate on validation set
    prec1 = validate(test_loader, model, criterion, epoch, args.print_freq, writer)

    # remember best prec@1 and save checkpoint
    if prec1 > best_prec1:
        best_prec1, best_epoch = prec1, epoch
        save_model(
            os.path.join(model_save_dir, 'rs32_epoch_{}.pth'.format(epoch)),
            model, epoch, best_prec1)

    # Running ASM only after each precision improvement would be nice, but it is
    # too idealistic; what we actually want is to introduce active samples to
    # drive the precision improvement.

    # begin asm stage
    if use_asm and epoch >= args.init_epochs and len(
            unlabel_dataset) > args.uncertain_samples_size:
        # 1. detect on unlabeled images
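# adjust_lr(...) is not defined in this excerpt. Below is a minimal sketch of a
# step-decay version with the same (base_lr, optimizer, epoch, writer) call
# signature; the milestone epochs and decay factor are illustrative assumptions,
# not the project's actual schedule.
def adjust_lr(base_lr, optimizer, epoch, writer, milestones=(80, 120), gamma=0.1):
    lr = base_lr
    for milestone in milestones:
        if epoch >= milestone:
            lr *= gamma
    # apply the decayed rate to every parameter group and log it to TensorBoard
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    writer.add_scalar('train/lr', lr, global_step=epoch)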
train_acc_scores = []
val_acc_scores = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss, f1_train, training_accuracy = train_with_scheduler(
        model, train_loader, optimizer, criterion, sgdr_partial, train_data, device)
    # train_epoch_loss, training_accuracy = train(
    #     model, train_loader, optimizer, criterion, train_data, device
    # )
    valid_epoch_loss, f1_val, val_accuracy = validate(model, valid_loader,
                                                      criterion, valid_data, device)
    train_loss.append(train_epoch_loss)
    valid_loss.append(valid_epoch_loss)
    train_f1_scores.append(f1_train)
    val_f1_scores.append(f1_val)
    train_acc_scores.append(training_accuracy)
    val_acc_scores.append(val_accuracy)
    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Training Acc: {training_accuracy:.4f}")
    print(f"Training f1: {f1_train:.4f}")
    print(f"Val Loss: {valid_epoch_loss:.4f}")
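# sgdr_partial is passed to train_with_scheduler(...) but never constructed in this
# excerpt. SGDR usually refers to cosine annealing with warm restarts; a plausible
# setup, assuming a standard PyTorch scheduler (the T_0/T_mult/eta_min values are
# illustrative, not the original configuration):
import torch

sgdr_partial = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=10, T_mult=2, eta_min=1e-6)
# Inside train_with_scheduler the scheduler would typically be stepped once per batch,
# e.g. sgdr_partial.step(epoch + batch_idx / len(train_loader)).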
)
test_meters = [
    TensorboardLogger(
        log_dir=test_tb_log_dir,
        delimiter=" ",
    ) for test_tb_log_dir in test_tb_log_dirs
]

if cfg.EVALUATE:
    for task_name, testloader, test_meter in zip(task_names, testloaders, test_meters):
        logging.info("Evaluating dataset: {}".format(task_name))
        validate(testloader,
                 net,
                 criterion_eval,
                 cfg,
                 test_meter,
                 global_step=0,
                 device=device,
                 local_rank=get_rank())

############## training code #############################
if not cfg.EVALUATE:
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.AMP.ENABLED)
    # start from epoch 0 or last checkpoint epoch
    start_epoch = checkpointer.epoch
    for epoch in range(start_epoch, cfg.OPTIM.EPOCHS):
        # wait for all processes before every epoch
        synchronize()
        logging.info("PROGRESS: {}%".format(
            round(100 * epoch / cfg.OPTIM.EPOCHS, 4)))
        global_step = epoch * len(trainloader)
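# The GradScaler created above implies a mixed-precision inner loop. A generic sketch
# of one AMP training step follows; it is not the repository's actual train function,
# and the optimizer / training criterion / (images, targets) batch layout are assumed.
for step, (images, targets) in enumerate(trainloader):
    images, targets = images.to(device), targets.to(device)
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=cfg.AMP.ENABLED):
        outputs = net(images)
        loss = criterion(outputs, targets)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)          # unscale gradients, then step the optimizer
    scaler.update()                 # adjust the scale factor for the next iteration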
model = config_file.model
model = model.to(device)

# Optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

# Log metrics with wandb
wandb.watch(model, log="all")

# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(config.TRAIN_EPOCHS):
    engine.train(epoch, tokenizer, model, device, training_loader, optimizer)

# Validation loop: generate predictions, collect them together with the actuals in a
# dataframe, and save the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = engine.validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({
        'Generated Text': predictions,
        'Actual Text': actuals
    })
    final_df.to_csv(
        '/home/hasan/Desktop/Code to keep on Github/Text Summarization/predictions.csv')
    print('Output Files generated for review')
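# engine.train(...) is defined elsewhere in this project. A hedged sketch of what one
# training step for a seq2seq summarization model (e.g. a Hugging Face T5-style model)
# commonly looks like, including the wandb logging that wandb.watch() above suggests.
# The batch field names ('source_ids', 'source_mask', 'target_ids') are assumptions
# about the dataset format, not the project's confirmed schema.
def train_step(batch, tokenizer, model, device, optimizer):
    ids = batch['source_ids'].to(device)
    mask = batch['source_mask'].to(device)
    labels = batch['target_ids'].to(device)
    labels[labels == tokenizer.pad_token_id] = -100  # ignore padding in the loss

    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    wandb.log({'train/loss': loss.item()})
    return loss.item()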
# initialize the transform
transform = transform()

# prepare the training and validation data loaders
train_data, valid_data = prepare_dataset(root_path='../input/catsNdogs/')
trainset = LFWDataset(train_data, transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
validset = LFWDataset(valid_data, transform=transform)
validloader = DataLoader(validset, batch_size=batch_size)

train_loss = []
valid_loss = []
grid_images = []  # one reconstructed-image grid per epoch, used for the .gif below
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss = train(model, trainloader, trainset, device, optimizer, criterion)
    valid_epoch_loss, recon_images = validate(model, validloader, validset, device, criterion)
    train_loss.append(train_epoch_loss)
    valid_loss.append(valid_epoch_loss)
    # save the reconstructed images from the validation loop
    save_reconstructed_images(recon_images, epoch + 1)
    # convert the reconstructed images to PyTorch image grid format
    image_grid = make_grid(recon_images.detach().cpu())
    grid_images.append(image_grid)
    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Val Loss: {valid_epoch_loss:.4f}")

# save the reconstructions as a .gif file
image_to_vid(grid_images)
# save the loss plots to disk
save_loss_plot(train_loss, valid_loss)
print('TRAINING COMPLETE')
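# image_to_vid(...) and save_reconstructed_images(...) come from the tutorial's utility
# module, which is not shown here. A minimal sketch of image_to_vid, assuming imageio is
# available and that each entry of grid_images is a CxHxW float tensor in [0, 1]; the
# output path is an assumption.
import imageio
import numpy as np

def image_to_vid(images, out_path='../outputs/generated_images.gif'):
    # convert each CHW float grid to an HWC uint8 frame and write an animated GIF
    frames = [(img.permute(1, 2, 0).numpy() * 255).astype(np.uint8) for img in images]
    imageio.mimsave(out_path, frames)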
def main():
    # Hyper Parameters setting
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/mnt/data/linkaiyi/scan/data/f30k_precomp',
                        help='path to datasets')
    parser.add_argument('--path_opt', default='option/FusionNoattn_baseline.yaml',
                        type=str, help='path to a yaml options file')
    parser.add_argument('--data_name', default='flickr30k_splits',
                        help='{coco,f30k}_splits')
    parser.add_argument('--logger_name', default='./log_2',
                        help='Path to save Tensorboard log.')
    parser.add_argument('--vocab_path',
                        default='/home/linkaiyi/fusion_wangtan/Fusion_flickr/Fusion_10.28/vocab',
                        help='Path to saved vocabulary json files.')
    parser.add_argument('--model_name',
                        default='/mnt/data/linkaiyi/mscoco/fusion/Fusion_flic/runs/checkpoint',
                        help='Path to save the model.')
    parser.add_argument('--num_epochs', default=120, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--workers', default=2, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--lr_update', default=20, type=int,
                        help='Number of epochs to update the learning rate.')
    opt = parser.parse_args()

    if os.path.isdir(opt.logger_name):
        if click.confirm('Logs directory already exists in {}. Erase?'.format(opt.logger_name),
                         default=False):
            os.system('rm -r ' + opt.logger_name)
    tb_logger.configure(opt.logger_name, flush_secs=5)
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    #########################################################################################
    # Create options
    #########################################################################################
    options = {'logs': {}, 'coco': {}, 'model': {'seq2vec': {}}, 'optim': {}}
    if opt.path_opt is not None:
        with open(opt.path_opt, 'r') as handle:
            options_yaml = yaml.safe_load(handle)
        options = utils.update_values(options, options_yaml)

    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    vocab_word = sorted(vocab.word2idx.items(), key=lambda x: x[1], reverse=False)
    vocab_word = [tup[0] for tup in vocab_word]
    opt.vocab_size = len(vocab)

    # Create dataset, model, criterion and optimizer
    train_loader, val_loader = data.get_loaders(opt.data_path, vocab, opt.batch_size,
                                                opt.workers, opt)
    model = models.factory(options['model'], vocab_word, cuda=True, data_parallel=False)
    criterion = nn.CrossEntropyLoss(weight=torch.Tensor([1, 128])).cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=options['optim']['lr'])
    print('Model has {} parameters'.format(utils.params_count(model)))

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            engine.validate(val_loader, model, criterion, optimizer, opt.batch_size)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    else:
        start_epoch = 0

    # Train the Model
    best_rsum = 0
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, options, optimizer, epoch)

        # train for one epoch
        engine.train(train_loader, model, criterion, optimizer, epoch, print_freq=10)

        # evaluate on validation set
        rsum = engine.validate(val_loader, model, criterion, optimizer, opt.batch_size)

        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        if not os.path.exists(opt.model_name):
            os.mkdir(opt.model_name)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': 'baseline',
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'options': options,
                'Eiters': model.Eiters,
            },
            is_best,
            filename='checkpoint_{}{}.pth.tar'.format(epoch, best_rsum),
            prefix=opt.model_name + '/')
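# save_checkpoint(...) is assumed to follow the common "write latest, copy best"
# pattern; this is a sketch under that assumption, not the project's actual
# implementation.
import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', prefix=''):
    torch.save(state, prefix + filename)  # always keep the latest checkpoint
    if is_best:
        # keep a separate copy of the best-scoring model so it is never overwritten
        shutil.copyfile(prefix + filename, prefix + 'model_best.pth.tar')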
def valid_write(path, image):
    cv2.imwrite(path, engine.validate(image))
# train data loader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# validation data loader
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)

# starting the training loop and validation
train_loss = []
valid_loss = []
for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    train_epoch_loss = train(model, train_loader, optimizer, criterion, train_data, device)
    valid_epoch_loss = validate(model, valid_loader, optimizer, criterion, valid_data, device)
    train_loss.append(train_epoch_loss)
    valid_loss.append(valid_epoch_loss)
    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Val Loss: {valid_epoch_loss:.4f}")

torch.save(
    {
        'epoch': epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': criterion,
    }, '../outputs/model.pth')
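# A hedged example of how the checkpoint written above could be restored later; the
# model and optimizer must be re-created with the same architecture and settings
# before loading their state dicts.
checkpoint = torch.load('../outputs/model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
last_epoch = checkpoint['epoch']
criterion = checkpoint['loss']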
def main_worker(save_dir, args):
    # basic setup
    cudnn.benchmark = True

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = f"runs/{datetime.datetime.now().strftime('%m-%d-%H-%M-%S')}"

    if args.local_rank == 0:
        logger = SummaryWriter(log_dir)
    else:
        logger = None

    deepspeed.init_distributed(dist_backend='nccl')
    torch.cuda.set_device(args.local_rank)

    model = SetVAE(args)
    parameters = list(model.parameters())  # materialize so the count below does not exhaust the generator
    n_parameters = sum(p.numel() for p in parameters if p.requires_grad)
    print(f'number of params: {n_parameters}')
    try:
        n_gen_parameters = sum(p.numel() for p in model.init_set.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.pre_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.post_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.output.parameters() if p.requires_grad)
        print(f'number of generator params: {n_gen_parameters}')
    except AttributeError:
        pass

    optimizer, criterion = model.make_optimizer(args)

    # initialize datasets and loaders
    train_dataset, val_dataset, train_loader, val_loader = get_datasets(args)

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=args.epochs // 2,
                                                    gamma=0.1)
    elif args.scheduler == 'linear':
        def lambda_rule(ep):
            lr_w = min(1., ep / args.warmup_epochs) if (args.warmup_epochs > 0) else 1.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(0.5 * args.epochs)
            return lr_l * lr_w

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif args.scheduler == 'cosine':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)
    else:
        # fake scheduler: keep the learning rate constant
        def lambda_rule(ep):
            return 1.0

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)

    # extract collate_fn
    if args.distributed:
        collate_fn = deepcopy(train_loader.collate_fn)
        model, optimizer, train_loader, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=parameters,
            training_data=train_dataset,
            collate_fn=collate_fn,
            lr_scheduler=scheduler)

    # resume checkpoints
    start_epoch = 0
    if args.resume_checkpoint is None and Path(
            Path(save_dir) / 'checkpoint-latest.pt').exists():
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Resumed from: ' + args.resume_checkpoint)
    if args.resume_checkpoint is not None:
        if args.distributed:
            if args.resume_optimizer:
                model.module, model.optimizer, model.lr_scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    model.optimizer,
                    scheduler=model.lr_scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model.module, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    optimizer=None,
                    strict=(not args.resume_non_strict))
        else:
            if args.resume_optimizer:
                model, optimizer, scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer,
                    scheduler=scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer=None,
                    strict=(not args.resume_non_strict))

    # save dataset statistics
    if args.local_rank == 0:
        train_dataset.save_statistics(save_dir)
        val_dataset.save_statistics(save_dir)

    # main training loop
    avg_meters = {
        'kl_avg_meter': AverageValueMeter(),
        'l2_avg_meter': AverageValueMeter()
    }

    assert args.distributed
    epoch = start_epoch
    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.local_rank == 0:
            # evaluate on the validation set
            if epoch % args.val_freq == 0 and epoch != 0:
                model.eval()
                with torch.no_grad():
                    val_res = validate(model.module, args, val_loader, epoch, logger, save_dir)
                for k, v in val_res.items():
                    v = v.cpu().detach().item()
                    send_slack(f'{k}:{v}, Epoch {epoch - 1}')
                    if logger is not None and v is not None:
                        logger.add_scalar(f'val_sample/{k}', v, epoch - 1)

        # train for one epoch
        train_one_epoch(epoch, model, criterion, optimizer, args, train_loader,
                        avg_meters, logger)

        # only on the HEAD process
        if args.local_rank == 0:
            # save checkpoints
            if (epoch + 1) % args.save_freq == 0:
                if args.eval:
                    validate_reconstruct_l2(epoch, val_loader, model, criterion, args, logger)
                save(model.module, model.optimizer, model.lr_scheduler, epoch + 1,
                     Path(save_dir) / f'checkpoint-{epoch}.pt')
                save(model.module, model.optimizer, model.lr_scheduler, epoch + 1,
                     Path(save_dir) / 'checkpoint-latest.pt')

            # save visualizations
            if (epoch + 1) % args.viz_freq == 0:
                with torch.no_grad():
                    visualize(model.module, args, val_loader, epoch, logger)

        # adjust the learning rate
        model.lr_scheduler.step()
        if logger is not None and args.local_rank == 0:
            logger.add_scalar('train lr', model.lr_scheduler.get_last_lr()[0], epoch)

    # final evaluation after the last epoch
    model.eval()
    if args.local_rank == 0:
        with torch.no_grad():
            val_res = validate(model.module, args, val_loader, epoch, logger, save_dir)
        for k, v in val_res.items():
            v = v.cpu().detach().item()
            send_slack(f'{k}:{v}, Epoch {epoch}')
            if logger is not None and v is not None:
                logger.add_scalar(f'val_sample/{k}', v, epoch)
        if logger is not None:
            logger.flush()
            logger.close()
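# save(...) and resume(...) are checkpoint helpers that are not shown in this excerpt.
# A minimal sketch of a matching pair, assuming the checkpoint is a plain dict holding
# the model/optimizer/scheduler state dicts plus the epoch counter:
import torch

def save(model, optimizer, scheduler, epoch, path):
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict() if optimizer is not None else None,
        'scheduler': scheduler.state_dict() if scheduler is not None else None,
        'epoch': epoch,
    }, path)

def resume(path, model, optimizer=None, scheduler=None, strict=True):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model'], strict=strict)
    if optimizer is not None and ckpt.get('optimizer') is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
    if scheduler is not None and ckpt.get('scheduler') is not None:
        scheduler.load_state_dict(ckpt['scheduler'])
    return model, optimizer, scheduler, ckpt.get('epoch', 0)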
                         num_workers=8)

# Initialize the network, loss and optimizer
if args.modality == 'PAN':
    model = modelDef.panNet()
elif args.modality == 'MS':
    model = modelDef.msNet()
elif args.modality == 'HS1':
    model = modelDef.HS1(dataset.data.shape[1])
elif args.modality == 'HS2':
    model = modelDef.HS2(dataset.data.shape[1])
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.learningRate)

# Training and Inference
print('Training (' + args.modality + ') Network...')
bestAcc = 0
for epoch in tqdm(range(args.nEpochs)):
    engine.train(model, trainLoader, optimizer, criterion, device, plotter, epoch)
    accuracy, classWiseAcc = engine.validate(model, valLoader, criterion, device, plotter, epoch)
    if accuracy > bestAcc:
        utils.save_model({'stateDict': model.state_dict()}, args.name,
                         args.modality + 'net_instance_' + str(args.instance) + '.pth.tar')
        bestAcc = accuracy
print('Best Accuracy: ' + str(bestAcc))
# print('Classwise Accuracy: ' + str(classWiseAcc))
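# utils.save_model(...) is not shown in this excerpt. A plausible minimal version,
# assuming it just creates a per-experiment directory and serializes the given state
# dict; the 'checkpoints' root directory is an assumption, not the project's layout.
import os
import torch

def save_model(state, experiment_name, filename):
    save_dir = os.path.join('checkpoints', experiment_name)
    os.makedirs(save_dir, exist_ok=True)
    torch.save(state, os.path.join(save_dir, filename))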