def setUp(self):
    """Seed RNGs, grab one training batch, and build a fresh NMT model for the tests."""
    # Fixed seeds so batch shuffling and model initialization are reproducible.
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Read the sanity-check parallel corpus and pair source/target sentences.
    src_corpus = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.es', 'src')
    tgt_corpus = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    paired_data = list(zip(src_corpus, tgt_corpus))

    # Keep only the first shuffled batch.
    for batch_src, batch_tgt in submission.batch_iter(
            paired_data, batch_size=BATCH_SIZE, shuffle=True):
        self.src_sents = batch_src
        self.tgt_sents = batch_tgt
        break

    self.vocab = Vocab.load(
        './sanity_check_en_es_data/vocab_sanity_check.json')

    # Model under test.
    self.model = submission.NMT(embed_size=EMBED_SIZE,
                                hidden_size=HIDDEN_SIZE,
                                dropout_rate=DROPOUT_RATE,
                                vocab=self.vocab)
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')

    # Gold targets are optional; they are only needed for BLEU scoring.
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'],
                     no_char_decoder=args['--no-char-decoder'])
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model,
        test_data_src,
        beam_size=int(args['--beam-size']),
        max_decoding_time_step=int(args['--max-decoding-time-step']))

    # Report corpus-level BLEU of the top hypothesis against the gold targets.
    if args['TEST_TARGET_FILE']:
        best_hyps = [candidates[0] for candidates in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, best_hyps)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # Persist one line per source sentence: its single best hypothesis.
    with open(args['OUTPUT_FILE'], 'w') as out_file:
        for _, candidates in zip(test_data_src, hypotheses):
            best = candidates[0]
            out_file.write(' '.join(best.value) + '\n')
def test_0(self):
    """1d-0-basic: Sanity check for Encode. Compares student output to that of model with dummy data."""
    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    train_data_src = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))
    # Only the first shuffled batch is needed.
    for src_sents, tgt_sents in submission.batch_iter(
            train_data, batch_size=BATCH_SIZE, shuffle=True):
        break
    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = submission.NMT(embed_size=EMBED_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           dropout_rate=DROPOUT_RATE,
                           vocab=vocab)

    # Configure for Testing: reset layer weights to the known values the
    # saved fixtures were generated with.
    reinitialize_layers(model)
    source_lengths = [len(s) for s in src_sents]
    source_padded = model.vocab.src.to_input_tensor(src_sents,
                                                    device=model.device)

    # Load expected outputs
    enc_hiddens_target = torch.load(
        './sanity_check_en_es_data/enc_hiddens.pkl')
    dec_init_state_target = torch.load(
        './sanity_check_en_es_data/dec_init_state.pkl')

    # Test
    with torch.no_grad():
        enc_hiddens_pred, dec_init_state_pred = model.encode(
            source_padded, source_lengths)
    # BUG FIX: the failure messages were previously placed OUTSIDE the
    # assertTrue(...) call — `self.assertTrue(X), "msg"` is a tuple
    # expression, so the message was never passed and never shown on
    # failure. They are now passed as assertTrue's `msg` argument
    # (matching the 1f step test elsewhere in this file).
    self.assertTrue(
        np.allclose(enc_hiddens_target.numpy(), enc_hiddens_pred.numpy()),
        "enc_hiddens is incorrect: it should be:\n {} but is:\n{}".format(
            enc_hiddens_target, enc_hiddens_pred))
    print("enc_hiddens Sanity Checks Passed!")
    self.assertTrue(
        np.allclose(dec_init_state_target[0].numpy(),
                    dec_init_state_pred[0].numpy()),
        "dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}".format(
            dec_init_state_target[0], dec_init_state_pred[0]))
    print("dec_init_state[0] Sanity Checks Passed!")
    self.assertTrue(
        np.allclose(dec_init_state_target[1].numpy(),
                    dec_init_state_pred[1].numpy()),
        "dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}".format(
            dec_init_state_target[1], dec_init_state_pred[1]))
    print("dec_init_state[1] Sanity Checks Passed!")
def vocab(args: Dict):
    """Build a source/target vocabulary from the training corpora and save it to disk.

    @param args (Dict): args from cmd line
    """
    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    src_sents = read_corpus(args['--train-src'], source='src')
    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

    # Vocabulary size and frequency cutoff come from the command line.
    built_vocab = Vocab.build(src_sents, tgt_sents,
                              int(args['--size']), int(args['--freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' %
          (len(built_vocab.src), len(built_vocab.tgt)))

    built_vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])
def setup():
    """Load one large sanity-check batch and the vocabulary.

    Returns a (src_sents, tgt_sents, vocab) triple built from the first
    shuffled batch of the sanity-check corpus.
    """
    src_corpus = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.es', 'src')
    tgt_corpus = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    paired_data = list(zip(src_corpus, tgt_corpus))

    # Take only the first batch from the shuffled iterator; the loop
    # variables retain that batch after the break.
    for src_sents, tgt_sents in submission.batch_iter(
            paired_data, batch_size=LARGE_BATCH_SIZE, shuffle=True):
        break

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')
    return src_sents, tgt_sents, vocab
def bleu(args: Dict[str, str]):
    """Compute the corpus-level BLEU score of decoded output against gold references.

    @param args (Dict): args for file path details
    @return: BLEU score scaled by 100
    """
    test_data_out = submission.read_corpus(args['TEST_OUTPUT_FILE'], source='tgt')
    test_data_gold = submission.read_corpus(args['TEST_GOLD_FILE'], source='tgt')

    # Truncate both sides to the shorter length so hypothesis/reference
    # pairs stay aligned.
    min_len = min(len(test_data_out), len(test_data_gold))
    references = [[ref] for ref in test_data_gold[:min_len]]
    hypotheses = list(test_data_out[:min_len])

    bleu_score = corpus_bleu(references, hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)
    return bleu_score * 100
def decode(args: Dict[str, str]):
    """ Performs decoding on the autograder test set
    Make sure to run this code before submitting the code to the autograder
    @param args (Dict): args from cmd line
    """
    test_data_src = read_corpus(args['SOURCE_FILE'], source='src')

    model = NMT.load(args['MODEL_PATH'])
    if args['CUDA']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model,
        test_data_src,
        beam_size=int(args['BEAM_SIZE']),
        max_decoding_time_step=int(args['MAX_DECODING_TIME_STEP']))

    # Write one line per source sentence: its top-scoring hypothesis.
    with open(args['OUTPUT_FILE'], 'w') as out_file:
        for _, candidates in zip(test_data_src, hypotheses):
            best = candidates[0]
            out_file.write(' '.join(best.value) + '\n')
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    # Read the parallel training and dev corpora.
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    # Training hyperparameters from the command line.
    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab, no_char_decoder=args['--no-char-decoder'])
    model.train()

    # Optionally initialize every parameter uniformly in [-u, +u].
    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # NOTE(review): vocab_mask zeroes the '<pad>' entry but is not used
    # anywhere in this function — presumably vestigial or consumed by code
    # outside this view; confirm before removing.
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    # Bookkeeping: "report_*" accumulators reset every `log_every` iters,
    # "cum_*" accumulators reset every `valid_niter` iters.
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            # The model returns per-example log-likelihoods; negate to get losses.
            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                        report_loss / report_examples,
                                                                                        math.exp(report_loss / report_tgt_words),
                                                                                        cum_examples,
                                                                                        report_tgt_words / (time.time() - train_time),
                                                                                        time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                             cum_loss / cum_examples,
                                                                                             np.exp(cum_loss / cum_tgt_words),
                                                                                             cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                # Negated so that "larger metric is better" holds below.
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    # New best model: reset patience and checkpoint.
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        # Patience exhausted: count a trial; stop entirely after
                        # --max-num-trial trials.
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                # NOTE(review): max-epoch is only checked after a validation
                # step, so training can overshoot --max-epoch slightly.
                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
        # Tail of Vocab.load (its `def` line lies above this chunk): rebuild
        # a Vocab from the serialized source/target word2id dictionaries.
        # NOTE(review): the file handle from open() is never closed — flagged
        # only; code left untouched here.
        entry = json.load(open(file_path, 'r'))
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(
            self.src), len(self.tgt))


if __name__ == '__main__':
    # Script entry point: build a vocabulary from the corpora named on the
    # command line (docopt parses the module docstring) and save it.
    args = docopt(__doc__)

    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    src_sents = read_corpus(args['--train-src'], source='src')
    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']),
                        int(args['--freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' %
          (len(vocab.src), len(vocab.tgt)))

    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])
def test_0(self):
    """1f-0-basic: Sanity check for Step. Compares student output to that of model with dummy data."""
    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)
    # Load training data & vocabulary
    train_data_src = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))
    for src_sents, tgt_sents in submission.batch_iter(
            train_data, batch_size=BATCH_SIZE, shuffle=True):
        self.src_sents = src_sents
        self.tgt_sents = tgt_sents
        # Only the first shuffled batch is needed.
        break
    self.vocab = Vocab.load(
        './sanity_check_en_es_data/vocab_sanity_check.json')
    # Create NMT Model
    self.model = submission.NMT(embed_size=EMBED_SIZE,
                                hidden_size=HIDDEN_SIZE,
                                dropout_rate=DROPOUT_RATE,
                                vocab=self.vocab)
    # Reset layer weights to the known values the saved fixtures below were
    # generated with, so outputs are directly comparable.
    reinitialize_layers(self.model)

    # Inputs
    Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl')
    dec_init_state = torch.load(
        './sanity_check_en_es_data/dec_init_state.pkl')
    enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl')
    enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl')
    enc_hiddens_proj = torch.load(
        './sanity_check_en_es_data/enc_hiddens_proj.pkl')
    # Output
    dec_state_target = torch.load(
        './sanity_check_en_es_data/dec_state.pkl')
    o_t_target = torch.load('./sanity_check_en_es_data/o_t.pkl')
    e_t_target = torch.load('./sanity_check_en_es_data/e_t.pkl')
    # Run Tests: one decoder step, gradients disabled.
    with torch.no_grad():
        dec_state_pred, o_t_pred, e_t_pred = self.model.step(
            Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj,
            enc_masks)
    self.assertTrue(
        np.allclose(dec_state_target[0].numpy(), dec_state_pred[0].numpy()),
        "decoder_state[0] should be:\n {} but is:\n{}".format(
            dec_state_target[0], dec_state_pred[0]))
    print("dec_state[0] Sanity Checks Passed!")
    self.assertTrue(
        np.allclose(dec_state_target[1].numpy(), dec_state_pred[1].numpy()),
        "decoder_state[1] should be:\n {} but is:\n{}".format(
            dec_state_target[1], dec_state_pred[1]))
    print("dec_state[1] Sanity Checks Passed!")
    self.assertTrue(
        np.allclose(o_t_target.numpy(), o_t_pred.numpy()),
        "combined_output should be:\n {} but is:\n{}".format(
            o_t_target, o_t_pred))
    print("combined_output Sanity Checks Passed!")
    self.assertTrue(
        np.allclose(e_t_target.numpy(), e_t_pred.numpy()),
        "e_t should be:\n {} but is:\n{}".format(e_t_target, e_t_pred))