def train():
    torch.cuda.set_device(0)
    iteration = 0
    model = WaveRNN(HPARAMS)
    model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=HPARAMS.lr)

    if ARGS.checkpoint:
        if os.path.basename(ARGS.checkpoint).startswith('ema_model'):
            ema_checkpoint = ARGS.checkpoint
        else:
            ema_checkpoint = 'ema_model_' + os.path.basename(ARGS.checkpoint)
            ema_checkpoint = os.path.join(os.path.dirname(ARGS.checkpoint), ema_checkpoint)

        # Initialise EMA from the ema checkpoint.
        logging.info('Initialising ema model {}'.format(ema_checkpoint))
        ema_model = WaveRNN(HPARAMS).cuda()
        ema_base_model, _ = load_checkpoint(ema_checkpoint, ema_model)
        ema = init_ema(ema_base_model, HPARAMS.ema_rate)

        # Initialise vanilla model
        logging.info('Loading checkpoint {}'.format(ARGS.checkpoint))
        model, iteration, optimizer = load_checkpoint(ARGS.checkpoint, model, optimizer)
    else:
        # Initialise EMA from scratch.
        ema = init_ema(model, HPARAMS.ema_rate)

    criterion = nn.NLLLoss(reduction='sum').cuda()

    train_loader = get_loader(ARGS.data, 'train', HPARAMS)
    test_loader = get_loader(ARGS.data, 'valid', HPARAMS)
    whole_loader = get_loader(ARGS.data, 'valid', HPARAMS, whole=True)

    model = nn.DataParallel(model)

    epoch_offset = max(0, int(iteration / len(train_loader)))
    for _ in range(epoch_offset, ARGS.epochs):
        iteration = train_step(
            train_loader, test_loader, whole_loader,
            model, optimizer, criterion, iteration, ema=ema
        )

        averaged_model = clone_as_averaged_model(model, ema)
        save_checkpoint(
            {
                'state_dict': model.module.state_dict(),
                'iteration': iteration,
                'dataset': ARGS.data,
                'optimizer': optimizer.state_dict(),
            },
            iteration,
            'checkpoints/{}/lastmodel.pth'.format(ARGS.expName),
            ARGS.expName,
        )
        save_checkpoint(
            {
                'state_dict': averaged_model.state_dict(),
                'iteration': iteration,
                'dataset': ARGS.data,
                'optimizer': optimizer.state_dict(),
            },
            iteration,
            'checkpoints/{}/ema_model_lastmodel.pth'.format(ARGS.expName),
            ARGS.expName,
        )
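# --- Hedged sketch: EMA helpers (not the repository's actual implementation) ---
# `init_ema` and `clone_as_averaged_model` are called above but are not defined in this
# excerpt. A minimal version consistent with how they are used might look like the
# following; the class name `ExponentialMovingAverage` and its methods are illustrative
# assumptions only.
import copy


class ExponentialMovingAverage:
    """Keeps shadow parameters updated as: shadow = rate * shadow + (1 - rate) * param."""

    def __init__(self, rate):
        self.rate = rate
        self.shadow = {}

    def register(self, name, value):
        self.shadow[name] = value.clone()

    def update(self, name, value):
        self.shadow[name] = self.rate * self.shadow[name] + (1.0 - self.rate) * value


def init_ema(model, rate):
    # Register every trainable parameter with the EMA tracker.
    ema = ExponentialMovingAverage(rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    return ema


def clone_as_averaged_model(model, ema):
    # `model` may be wrapped in nn.DataParallel, hence the `.module` access.
    base = model.module if hasattr(model, 'module') else model
    averaged = copy.deepcopy(base)
    for name, param in averaged.named_parameters():
        if name in ema.shadow:
            param.data.copy_(ema.shadow[name])
    return averaged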
def test_inference_forward_parity():
    hparams = create_hparams()
    model = WaveRNN(hparams, debug=True).cuda()
    model.train()

    data_path = '../data/short_sens/'
    whole_segments = get_loader(data_path, 'valid', hparams, whole=True)

    for i, (x, m, _) in enumerate(whole_segments):
        x, m = x.cuda(), m.cuda()
        forward_output, f_context, f_x = model.train_mode_generate(x, m)
        inference_output, i_cont_dict, i_x = model.inference(m, gt=x)
        assert abs(i_x - f_x).mean() < 1e-6
def visualize_attn(args):
    """ Visualization of learned attention map. """
    config = CONFIGS[args.model_type]
    num_classes = 10 if args.dataset == "cifar10" else 100

    model = VisionTransformer(config, args.img_size, norm_type=args.norm_type,
                              zero_head=True, num_classes=num_classes, vis=True)
    ckpt_file = os.path.join(args.output_dir, args.name + "_checkpoint.bin")
    ckpt = torch.load(ckpt_file)  # use single card for visualize attn map
    model.load_state_dict(ckpt)
    model.to(args.device)
    model.eval()

    _, test_loader = get_loader(args)
    sample_idx = 0
    layer_ids = [0, 3, 6, 9]
    head_id = 0

    with torch.no_grad():
        for step, batch in enumerate(test_loader):
            batch = tuple(t.to(args.device) for t in batch)
            x, y = batch
            select_x = x[sample_idx].unsqueeze(0)
            output, attn_weights = model(select_x)
            # attn_weights is List[(1, number_of_head, len_h, len_h)]
            for layer_id in layer_ids:
                vis_attn(args, attn_weights[layer_id].squeeze(0)[head_id], layer_id=layer_id)
            break  # visualize the first sample in the first batch

    print("done.")
    exit(0)
def train(args, model, device, acc_calculator):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Prepare dataset
    train_loader, test_loader = get_loader(args)

    # Prepare optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    loss_fct = BatchAllLoss(margin=0.1)
    while True:
        model.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch[:2])
            x, y = batch
            embeds = model(x)
            loss = loss_fct(embeds, y)
            losses.update(loss.item())

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Step the optimizer only once every `gradient_accumulation_steps` batches.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val))
                if args.local_rank in [-1, 0]:
                    writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step)
                    writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step)
                if global_step % args.eval_every == 0 and args.local_rank in [-1, 0]:
                    accuracy = valid(args, model, writer, test_loader, global_step, device, acc_calculator)
                    if best_acc < accuracy:
                        save_model(args, model)
                        best_acc = accuracy
                    model.train()

                if global_step % t_total == 0:
                    break
        losses.reset()
        if global_step % t_total == 0:
            break

    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
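# --- Hedged sketch: AverageMeter (utility used above but not shown in this excerpt) ---
# The training loops rely on an `AverageMeter` with .update(), .reset(), and .val.
# A minimal implementation consistent with that usage might be:
class AverageMeter:
    """Tracks the most recent value and the running average of a scalar metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count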
def train(args, model):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Prepare dataset
    train_loader, test_loader = get_loader(args)

    # Prepare optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        model, optimizer = amp.initialize(models=model,
                                          optimizers=optimizer,
                                          opt_level=args.fp16_opt_level)
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20

    # Distributed training
    if args.local_rank != -1:
        model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    while True:
        model.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            x, y = batch
            loss = model(x, y)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                losses.update(loss.item() * args.gradient_accumulation_steps)
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val))
                if args.local_rank in [-1, 0]:
                    writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step)
                    writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step)
                if global_step % args.eval_every == 0 and args.local_rank in [-1, 0]:
                    accuracy = valid(args, model, writer, test_loader, global_step)
                    if best_acc < accuracy:
                        save_model(args, model)
                        best_acc = accuracy
                    model.train()

                if global_step % t_total == 0:
                    break
        losses.reset()
        if global_step % t_total == 0:
            break

    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
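# --- Hedged sketch: warmup + cosine LR schedule (imported elsewhere, not shown here) ---
# `WarmupCosineSchedule` / `WarmupLinearSchedule` are constructed above with
# (optimizer, warmup_steps, t_total). A common LambdaLR-based implementation matching
# that signature is sketched below; it is illustrative, not necessarily the exact class
# used by this repository.
import math

from torch.optim.lr_scheduler import LambdaLR


class WarmupCosineSchedule(LambdaLR):
    """Linear warmup for `warmup_steps`, then cosine decay to 0 over the remaining steps."""

    def __init__(self, optimizer, warmup_steps, t_total, cycles=0.5, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1.0, self.warmup_steps))
        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))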
def train(args, model):
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))
        # writer = SummaryWriter(log_dir=os.path.join("logs", 'transformer'))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    train_loader, test_loader = get_loader(args)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(args)
    losses = AverageMeter()
    global_step, best_acc, best_losses = 0, 0, np.inf
    while True:
        model.train()
        # criterion = nn.CrossEntropyLoss()
        criterion = FocalLoss()
        # criterion = MyCrossEntropyLoss()
        # criterion = MyMseLoss(args)
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            imgs, intensity, labels = batch
            outputs = model(imgs, intensity, labels)  # output[0].size = (B, N, C)
            outputs = outputs.view(-1, args.class_number)
            # outputs = F.softmax(outputs, dim=-1)
            labels = labels.view(-1)
            loss = criterion(outputs, labels)
            loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                losses.update(loss.item() * args.gradient_accumulation_steps)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, loss.item()))
                if args.local_rank in [-1, 0]:
                    writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step)
                    writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step)
                if global_step % args.eval_every == 0:
                    accuracy, eval_losses = valid(args, model, writer, test_loader, global_step)
                    if eval_losses <= best_losses:
                        save_model(args, model)
                        best_losses = eval_losses
                    model.train()

                if global_step % t_total == 0:
                    break
        losses.reset()
        if global_step % t_total == 0:
            break

    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
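# --- Hedged sketch: FocalLoss (instantiated above but not defined in this excerpt) ---
# A standard multi-class focal loss (Lin et al., 2017) with the same
# criterion(logits, targets) call signature could look like this; it is not necessarily
# the repository's exact implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class FocalLoss(nn.Module):
    """Cross entropy scaled by (1 - p_t)^gamma to focus training on hard examples."""

    def __init__(self, gamma=2.0, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        log_probs = F.log_softmax(logits, dim=-1)                        # (N, C)
        log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)    # (N,)
        pt = log_pt.exp()
        loss = -((1.0 - pt) ** self.gamma) * log_pt
        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss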
def launch_routine(model):
    data_path = ARGS.data
    if 'valid/mel' in ARGS.data:
        data_path = data_path.replace('valid/mel', '')
    whole_segments = get_loader(data_path, 'valid', HPARAMS, whole=True)
    generate_routine(model, whole_segments, ARGS.out_dir, HPARAMS)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
# logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" %
#                (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

# Set seed
set_seed(args)

mse0, mse1, mse2, mse3 = 0, 0, 0, 0
mae0, mae1, mae2, mae3 = 0, 0, 0, 0
pre = []
lab = []
acc = 0.0
l2 = [10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25, 27, 28, 30, 32, 33, 35, 38,
      40, 42, 43, 45, 48, 50, 51, 52, 53, 55, 57, 58, 60, 62, 65, 68, 70, 72]
l3 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
all_preds, all_label = [], []

train_loader, test_loader = get_loader(args)
epoch_iterator = tqdm(test_loader,
                      desc="Validating... (loss=X.X)",
                      bar_format="{l_bar}{r_bar}",
                      dynamic_ncols=True,
                      disable=args.local_rank not in [-1, 0])
# criterion = nn.CrossEntropyLoss(ignore_index=0)
# f = open('./pred_result.txt', 'a')
test_num = 0
model.eval()
eval_losses = AverageMeter()
for step, batch in enumerate(epoch_iterator):
    batch = tuple(t.to(args.device) for t in batch)
    imgs, intensity, labels = batch
    bs, src_len = labels.size()
    test_target = torch.ones((bs, src_len - 1), dtype=labels.dtype).to(labels.device)
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--name", required=True,
                        help="Name of this run. Used for monitoring.")
    parser.add_argument("--dataset", choices=["cifar10", "cifar100"], default="cifar10",
                        help="Which downstream task.")
    parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16",
                                                 "ViT-L_32", "ViT-H_14", "R50-ViT-B_16"],
                        default="ViT-B_16",
                        help="Which variant to use.")
    parser.add_argument("--test_mode", action="store_true")
    parser.add_argument("--mixup", action="store_true")
    parser.add_argument("--mixup_layer", type=int, default=0)
    parser.add_argument("--mixup_alpha", type=float, default=1.0)
    parser.add_argument("--pretrained_dir", type=str, default="checkpoint/ViT-B_16.npz",
                        help="Where to search for pretrained ViT models.")
    parser.add_argument("--output_dir", default="output", type=str,
                        help="The output directory where checkpoints will be written.")
    parser.add_argument("--img_size", default=224, type=int,
                        help="Resolution size")
    parser.add_argument("--train_batch_size", default=512, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--eval_every", default=100, type=int,
                        help="Run prediction on validation set every so many steps. "
                             "Will always run one evaluation at the end of training.")
    parser.add_argument("--learning_rate", default=3e-2, type=float,
                        help="The initial learning rate for SGD.")
    parser.add_argument("--weight_decay", default=0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--num_steps", default=10000, type=int,
                        help="Total number of training steps to perform.")
    parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine",
                        help="How to decay the learning rate.")
    parser.add_argument("--warmup_steps", default=500, type=int,
                        help="Steps of training to perform learning rate warmup for.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O2',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=60))
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" %
                   (args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    # Set seed
    set_seed(args)

    # Model & Tokenizer Setup
    args, model = setup(args)

    if not args.test_mode:
        # Training
        train(args, model)
    else:
        _, test_loader = get_loader(args)
        global_step = -1
        accuracy = valid(args, model, None, test_loader, global_step)
        print("Test accuracy: ", accuracy)
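# Example invocation (illustrative only; the entry-script name and flag values are
# assumptions, not taken from the repository):
#   python train.py --name cifar10_run --dataset cifar10 --model_type ViT-B_16 \
#       --pretrained_dir checkpoint/ViT-B_16.npz --train_batch_size 512 --num_steps 10000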