def main(): parser = argparse.ArgumentParser(description="Compute BLEU.") parser.add_argument('ckpt', type=str, help="Checkpoint to restore.") parser.add_argument('--dir', type=str, default="./wmt14", help="Directory of dataset.") parser.add_argument('--split', default='test', type=str, help="Specify which split of data to evaluate.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") parser.add_argument('--beams', default=1, type=int, help="Beam Search width.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model # Restore checkpoint info = torch.load(args.ckpt) cfg = info['cfg'] # Build model bpe_model = yttm.BPE(model=cfg['bpe']) model = build_model.Seq2Seq(bpe_model.vocab_size(), bpe_model.vocab_size(), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers'], use_bn=cfg['model']['use_bn']) model.load_state_dict(info['weights']) model.eval() model = model.cuda() # Create dataset if args.beams == 1: batch_size = cfg['train']['batch_size'] else: batch_size = 1 loader = data.load(args.dir, split=args.split, batch_size=batch_size, bpe_model=bpe_model) # Evaluate _, bleu = utils.eval_dataset(loader, model, bpe_model, args.beams) print("BLEU on %s set = %.4f" % (args.split, error))
def main(): parser = argparse.ArgumentParser(description="Compute error rate.") parser.add_argument('ckpt', type=str, help="Checkpoint to restore.") parser.add_argument('--split', default='test', type=str, help="Specify which split of data to evaluate.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") parser.add_argument('--beams', default=1, type=int, help="Beam Search width.") parser.add_argument('--workers', default=0, type=int, help="How many subprocesses to use for data loading.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model # Restore checkpoint info = torch.load(args.ckpt) cfg = info['cfg'] # Create dataset if args.beams == 1: batch_size = cfg['train']['batch_size'] else: batch_size = 1 loader = data.load(split=args.split, batch_size=batch_size, workers=args.workers) # Build model tokenizer = torch.load('tokenizer.pth') model = build_model.Seq2Seq(len(tokenizer.vocab), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers'], use_bn=cfg['model']['use_bn']) model.load_state_dict(info['weights']) model.eval() model = model.cuda() # Evaluate _, error = eval_utils.eval_dataset(loader, model, args.beams) print("Error rate on %s set = %.4f" % (args.split, error))
def main(): parser = argparse.ArgumentParser(description="Compute error rate.") parser.add_argument('ckpt', type=str, help="Checkpoint to restore.") parser.add_argument('--split', default='test', type=str, help="Specify which split of data to evaluate.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model # Restore checkpoint info = torch.load(args.ckpt) print("Dev. error rate of checkpoint: %.4f @epoch: %d" % (info['dev_error'], info['epoch'])) cfg = info['cfg'] # Create dataset loader = data.load(split=args.split, batch_size=cfg['train']['batch_size']) # Build model tokenizer = torch.load('tokenizer.pth') model = build_model.Seq2Seq(len(tokenizer.vocab), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers']) model.load_state_dict(info['weights']) model.eval() model = model.cuda() # Evaluate error = eval_utils.get_error(loader, model) print("Error rate on %s set = %.4f" % (args.split, error))
def main(): parser = argparse.ArgumentParser(description="Train the model.") parser.add_argument('cfg', type=str, help="Specify which experiment config file to use.") parser.add_argument('--dir', type=str, default="./wmt14", help="Directory of dataset.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") parser.add_argument('--workers', default=0, type=int, help="How many subprocesses to use for data loading.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model with open(args.cfg) as f: cfg = yaml.load(f, Loader=yaml.FullLoader) save_path = os.path.splitext(args.cfg)[0] if not os.path.exists(save_path): os.mkdir(save_path) # Build model bpe_model = yttm.BPE(model=cfg['bpe']) model = build_model.Seq2Seq(bpe_model.vocab_size(), bpe_model.vocab_size(), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers'], drop_p=cfg['model']['drop_p'], use_bn=cfg['model']['use_bn']) model = model.cuda() # Create dataset train_loader = data.load(args.dir, split='train', batch_size=cfg['train']['batch_size'], bpe_model=bpe_model, workers=args.workers) dev_loader = data.load(args.dir, split='dev', batch_size=cfg['train']['batch_size'], bpe_model=bpe_model) # Training criteria optimizer = torch.optim.Adam(model.parameters(), lr=cfg['train']['init_lr']) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', factor=cfg['train']['decay_factor'], patience=cfg['train']['patience'], threshold=0.01, min_lr=1e-6) assert cfg['train']['metric'] in ['loss', 'bleu'] # Restore checkpoints if os.path.exists(os.path.join(save_path, 'last.pth')): info = torch.load(os.path.join(save_path, 'last.pth')) epoch = info['epoch'] model.load_state_dict(info['weights']) optimizer.load_state_dict(info['optimizer']) scheduler.load_state_dict(info['scheduler']) else: epoch = 0 if os.path.exists(os.path.join(save_path, 'best.pth')): info = torch.load(os.path.join(save_path, 'best.pth')) best_epoch = info['epoch'] best_bleu = info['dev_bleu'] else: best_epoch = 0 best_bleu = 0 while (1): print("---") epoch += 1 print("Epoch: %d" % (epoch)) # Show learning rate lr = get_lr(optimizer) print("Learning rate: %f" % lr) # Training loop model.train() train_loss = [] train_tqdm = tqdm(train_loader, desc="Training") for (xs, ys) in train_tqdm: loss = model(xs.cuda(), ys.cuda()) train_loss.append(loss.item()) train_tqdm.set_postfix(loss="%.3f" % np.mean(train_loss)) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5.) # Gradient clipping optimizer.step() # Validation loop model.eval() dev_loss, dev_bleu = utils.eval_dataset(dev_loader, model, bpe_model) print("Dev. loss: %.3f," % dev_loss, end=' ') print("dev. BLEU: %.4f" % dev_bleu) if dev_bleu > best_bleu: best_bleu = dev_bleu best_epoch = epoch # Save best model save_checkpoint("best.pth", save_path, best_epoch, best_bleu, cfg, model, optimizer, scheduler) print("Best dev. 
BLEU: %.4f @epoch: %d" % (best_bleu, best_epoch)) # Update learning rate scheduler if cfg['train']['metric'] == 'loss': scheduler.step(dev_loss) else: scheduler.step(1 - dev_bleu) # Save checkpoint save_checkpoint("last.pth", save_path, epoch, dev_bleu, cfg, model, optimizer, scheduler) # Logging datetime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) msg = "%s,%d,%f,%f,%f,%f" % (datetime, epoch, lr, np.mean(train_loss), dev_loss, dev_bleu) log_history(save_path, msg)
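# get_lr, save_checkpoint and log_history are shared helpers that the
# training scripts assume; minimal sketches follow. The checkpoint keys match
# what the evaluation scripts above read back ('epoch', 'dev_bleu', 'cfg',
# 'weights', ...); the history file name is an assumption. Note that the
# speech-recognition trainer further below passes a plain state_dict instead
# of model/optimizer/scheduler objects.
import os

import torch


def get_lr(optimizer):
    # All parameter groups share a single learning rate in these scripts.
    return optimizer.param_groups[0]['lr']


def save_checkpoint(filename, save_path, epoch, dev_bleu, cfg, model,
                    optimizer, scheduler):
    torch.save(
        {
            'epoch': epoch,
            'dev_bleu': dev_bleu,
            'cfg': cfg,
            'weights': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, os.path.join(save_path, filename))


def log_history(save_path, msg):
    # Append one comma-separated row per epoch to the training log.
    with open(os.path.join(save_path, 'history.csv'), 'a') as f:
        f.write(msg + '\n')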
def main(): parser = argparse.ArgumentParser( description= "Train the model on DEVELOPMENT set to make sure it can overfit.") parser.add_argument('cfg', type=str, help="Specify which experiment config file to use.") parser.add_argument('--dir', type=str, default="./wmt14", help="Directory of dataset.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model with open(args.cfg) as f: cfg = yaml.load(f, Loader=yaml.FullLoader) # Build model bpe_model = yttm.BPE(model=cfg['bpe']) model = build_model.Seq2Seq(bpe_model.vocab_size(), bpe_model.vocab_size(), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers'], drop_p=cfg['model']['drop_p'], use_bn=cfg['model']['use_bn']) model = model.cuda() # Create dataset dev_loader = data.load(args.dir, split='dev', batch_size=32, bpe_model=bpe_model) # Training criteria optimizer = torch.optim.Adam(model.parameters(), lr=cfg['train']['init_lr']) epoch = 0 best_epoch = 0 best_bleu = 0 while (1): print("---") epoch += 1 print("Epoch: %d" % (epoch)) # Training loop model.train() train_loss = [] train_tqdm = tqdm(dev_loader, desc="Training") for (xs, ys) in train_tqdm: loss = model(xs.cuda(), ys.cuda()) train_loss.append(loss.item()) train_tqdm.set_postfix(loss="%.3f" % np.mean(train_loss)) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5.) # Gradient clipping optimizer.step() # Validation loop model.eval() dev_loss, dev_bleu = utils.eval_dataset(dev_loader, model, bpe_model) print("Dev. loss: %.3f," % dev_loss, end=' ') print("dev. BLEU: %.4f" % dev_bleu) if dev_bleu > best_bleu: best_bleu = dev_bleu best_epoch = epoch print("Best dev. BLEU: %.4f @epoch: %d" % (best_bleu, best_epoch))
def main(): parser = argparse.ArgumentParser(description="Train the model.") parser.add_argument('cfg', type=str, help="Specify which experiment config file to use.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") parser.add_argument('--workers', default=0, type=int, help="How many subprocesses to use for data loading.") parser.add_argument( '--ckpt_freq', default=10, type=int, help="Frequency (number of epochs) to save checkpoints.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model with open(args.cfg) as f: cfg = yaml.load(f, Loader=yaml.FullLoader) if not cfg['logdir']: save_path = os.path.splitext(args.cfg)[0] if not os.path.exists(save_path): os.mkdir(save_path) # Create dataset train_loader = data.load(split='train', batch_size=cfg['train']['batch_size'], workers=args.workers) dev_loader = data.load(split='dev', batch_size=cfg['train']['batch_size']) # Build model tokenizer = torch.load('tokenizer.pth') model = build_model.Seq2Seq(len(tokenizer.vocab), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers'], drop_p=cfg['model']['drop_p']) model = model.cuda() # Training criteria optimizer = torch.optim.Adam(model.parameters(), lr=cfg['train']['init_lr']) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', factor=cfg['train']['decay_factor'], patience=cfg['train']['patience'], min_lr=1e-6) best_epoch = 0 best_error = float('inf') for epoch in range(cfg['train']['epochs'] + 1): print("---") # Show learning rate lr = get_lr(optimizer) print("Learning rate: %f" % lr) # Training loop model.train() train_loss = 0 n_tokens = 0 for step, (xs, xlens, ys) in enumerate(train_loader): loss = model(xs.cuda(), xlens, ys.cuda()) train_loss += loss.item() * (ys[:, 1:] > 0).long().sum() n_tokens += (ys[:, 1:] > 0).long().sum() optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5.) # Gradient clipping optimizer.step() if not step % 10: print(time.strftime("%H:%M:%S", time.localtime()), end=' ') print("epoch: %d, step: %d, loss: %.3f" % (epoch, step, loss.item())) train_loss = train_loss / n_tokens # Validation loop model.eval() # Compute dev loss dev_loss = 0 n_tokens = 0 with torch.no_grad(): for (xs, xlens, ys) in dev_loader: dev_loss += model(xs.cuda(), xlens, ys.cuda()).item() * ( ys[:, 1:] > 0).long().sum() n_tokens += (ys[:, 1:] > 0).long().sum() dev_loss = dev_loss / n_tokens # Compute dev error rate error = eval_utils.get_error(dev_loader, model) print("Dev. loss: %.3f," % dev_loss, end=' ') print("dev. error rate: %.4f" % error) if error < best_error: best_error = error best_epoch = epoch # Save best model save_checkpoint("best.pth", save_path, best_epoch, best_error, cfg, model.state_dict()) print("Best dev. error rate: %.4f @epoch: %d" % (best_error, best_epoch)) scheduler.step(error) # Save checkpoint if not epoch % args.ckpt_freq or epoch == cfg['train']['epochs']: save_checkpoint("checkpoint_%05d.pth" % epoch, save_path, epoch, error, cfg, model.state_dict()) # Logging datetime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) msg = "%s,%d,%f,%f,%f,%f" % (datetime, epoch, lr, train_loss, dev_loss, error) log_history(save_path, msg)
def main(): parser = argparse.ArgumentParser( description= "Test on random audio from dataset and visualize the attention matrix." ) parser.add_argument('ckpt', type=str, help="Checkpoint to restore.") parser.add_argument('--split', default='test', type=str, help="Specify which split of data to evaluate.") parser.add_argument( '--gpu_id', default=0, type=int, help="CUDA visible GPU ID. Currently only support single GPU.") parser.add_argument('--beams', default=1, type=int, help="Beam Search width.") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) assert torch.cuda.is_available() import data import build_model # Restore checkpoint info = torch.load(args.ckpt) cfg = info['cfg'] # Create dataset loader = data.load(split=args.split, batch_size=1) # Build model tokenizer = torch.load('tokenizer.pth') model = build_model.Seq2Seq(len(tokenizer.vocab), hidden_size=cfg['model']['hidden_size'], encoder_layers=cfg['model']['encoder_layers'], decoder_layers=cfg['model']['decoder_layers'], use_bn=cfg['model']['use_bn']) model.load_state_dict(info['weights']) model.eval() model = model.cuda() # Inference with torch.no_grad(): for (x, xlens, y) in loader: predictions, attentions = model(x.cuda(), xlens, beam_width=args.beams) predictions, attentions = predictions[0], attentions[0] predictions = tokenizer.decode(predictions) attentions = attentions[:len(predictions.split())].cpu().numpy( ) # (target_length, source_length) ground_truth = tokenizer.decode(y[0]) print("Predict:") print(predictions) print("Ground-truth:") print(ground_truth) print() showAttention(predictions, attentions)