def decode(test_src_path,
           test_tgt_path=None,
           model_path='model.bin',
           beam_size=5,
           max_decoding=70,
           device='cpu',
           output_path='output.txt'):
    """ Performs decoding on a test set and saves the best-scoring results.
    If the gold-standard target sentences are given, the function also
    computes the corpus-level BLEU score.
    Params:
        test_src_path (str): Path to the test source file
        test_tgt_path (str): Path to the test target file (optional). Default=None
        model_path (str): Path to the model file generated after training. Default='model.bin'
        beam_size (int): Beam size (# of hypotheses to hold for a translation at every step). Default=5
        max_decoding (int): Maximum sentence length that beam search can produce. Default=70
        device (str): Device to perform the computation on. Default='cpu'
        output_path (str): Path for the output file with the translation results. Default='output.txt'
    """
    print(f'load test source sentences from [{test_src_path}]', file=sys.stderr)
    test_data_src = read_corpus(test_src_path, corpus_type='src')

    if test_tgt_path is not None:
        print(f'load test target sentences from [{test_tgt_path}]', file=sys.stderr)
        test_data_tgt = read_corpus(test_tgt_path, corpus_type='tgt')

    print(f'load model from {model_path}', file=sys.stderr)
    model = NMT.load(model_path)
    model = model.to(torch.device(device))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding)

    if test_tgt_path is not None:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(output_path, 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
""" # train the spm model spm.SentencePieceTrainer.train(input=file_path, model_prefix=corpus_type, vocab_size=vocab_size) # create an instance; this saves .model and .vocab files sp = spm.SentencePieceProcessor() # loads tgt.model or src.model sp.load('{}.model'.format(corpus_type)) sp_list = [sp.id_to_piece(piece_id) for piece_id in range(sp.get_piece_size())] return sp_list if __name__ == '__main__': args = docopt(__doc__) if args["--tok"] == 'nltk': print(f'read in source sentences: {args["--src"]}...') src_sents = read_corpus(args['--src'], corpus_type='src') print(f'read in target sentences: {args["--tgt"]}...') tgt_sents = read_corpus(args['--tgt'], corpus_type='tgt') print('generating vocab...') vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--cutoff'])) print(f'Vocab generated: {vocab}') jfile = f'{args["--f"]}_{args["--size"]}.json' elif args["--tok"] == 'spm': print(f'read in source sentences: {args["--src"]}...') src_sents = get_vocab_list(args['--src'], corpus_type='src', vocab_size=args["--src-size"]) print(f'read in target sentences: {args["--tgt"]}...')
def decode(test_src_path,
           test_tgt_path=None,
           model_path='model.bin',
           tokenizer='nltk',
           spm_model_src='./spm/src.model',
           spm_model_tgt='./spm/tgt.model',
           beam_size=5,
           max_decoding=70,
           device='cpu',
           output_path='output.txt'):
    """ Performs decoding on a test set and saves the best-scoring results.
    If the gold-standard target sentences are given, the function also
    computes the corpus-level BLEU score.
    Params:
        test_src_path (str): Path to the test source file
        test_tgt_path (str): Path to the test target file (optional). Default=None
        model_path (str): Path to the model file generated after training. Default='model.bin'
        tokenizer (str): Tokenizer used (nltk or spm). Default='nltk'
        spm_model_src (str): Path to the source spm model. Default='./spm/src.model'
        spm_model_tgt (str): Path to the target spm model. Default='./spm/tgt.model'
        beam_size (int): Beam size (# of hypotheses to hold for a translation at every step). Default=5
        max_decoding (int): Maximum sentence length that beam search can produce. Default=70
        device (str): Device to perform the computation on. Default='cpu'
        output_path (str): Path for the output file with the translation results. Default='output.txt'
    """
    print(f'load test source sentences from [{test_src_path}]', file=sys.stderr)
    if tokenizer == 'nltk':
        test_data_src = read_corpus(test_src_path, corpus_type='src')
    elif tokenizer == 'spm':
        test_data_src = read_corpus_spm(test_src_path,
                                        corpus_type='src',
                                        model_path=spm_model_src)
    else:
        raise ValueError(f'Unrecognised tokenizer [{tokenizer}]; should be nltk or spm')

    if test_tgt_path is not None:
        print(f'load test target sentences from [{test_tgt_path}]', file=sys.stderr)
        if tokenizer == 'nltk':
            test_data_tgt = read_corpus(test_tgt_path, corpus_type='tgt')
        else:  # spm
            test_data_tgt = read_corpus_spm(test_tgt_path,
                                            corpus_type='tgt',
                                            model_path=spm_model_tgt)

    print(f'load model from {model_path}', file=sys.stderr)
    model = NMT.load(model_path)
    model = model.to(torch.device(device))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding)

    if test_tgt_path is not None:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(output_path, 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            if tokenizer == 'nltk':
                hyp_sent = ' '.join(top_hyp.value)
            else:  # spm: merge pieces and restore the word boundaries marked by '▁'
                hyp_sent = ''.join(top_hyp.value).replace('▁', ' ')
            f.write(hyp_sent + '\n')
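
# A minimal usage sketch for decode(), assuming German-to-English test files at
# the hypothetical paths below; nothing here runs on import, it only documents
# how the function is meant to be called.
def _example_decode():
    """Hedged example: decode a test set with an spm-tokenized model and score BLEU."""
    decode('data/test.de',                   # hypothetical source file
           test_tgt_path='data/test.en',     # optional; enables corpus-level BLEU
           model_path='model.bin',
           tokenizer='spm',
           spm_model_src='./spm/src.model',
           spm_model_tgt='./spm/tgt.model',
           beam_size=5,
           max_decoding=70,
           device='cuda:0' if torch.cuda.is_available() else 'cpu',
           output_path='output.txt')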
def load_data_train(train_src_path,
                    train_tgt_path,
                    dev_src_path,
                    dev_tgt_path,
                    tokenizer='nltk',
                    spm_model_src='./spm/src.model',
                    spm_model_tgt='./spm/tgt.model',
                    create_vocab=False,
                    vocab_path=None,
                    vocab_size=50000,
                    vocab_cutoff=2,
                    subset=1.0,
                    random_subset=False):
    """ Load the dataset used by the NMT model.
    Params:
        train_src_path (str): Path to the source sentences for training
        train_tgt_path (str): Path to the target sentences for training
        dev_src_path (str): Path to the source sentences for dev
        dev_tgt_path (str): Path to the target sentences for dev
        tokenizer (str): Tokenizer used (nltk or spm). Default='nltk'
        spm_model_src (str): Path to the source spm model. Default='./spm/src.model'
        spm_model_tgt (str): Path to the target spm model. Default='./spm/tgt.model'
        create_vocab (bool): If True, the vocab is built from the training data
            (nltk tokenizer only); otherwise it is loaded from vocab_path. Default=False
        vocab_path (str): Path to the json file with the Vocab. Default=None
        vocab_size (int): Size of the vocabulary for both source and target languages. Default=50000
        vocab_cutoff (int): Drop words that occur fewer than vocab_cutoff times. Default=2
        subset (float): Fraction of the train and dev sets to load, with 0 < subset <= 1. Default=1.0
        random_subset (bool): If True, the subset is sampled randomly; otherwise the
            first elements of the data are taken. Default=False
    Return:
        train_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for training
        dev_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for dev
        vocab (Vocab): Vocab object for source and target
    """
    # read all data
    if tokenizer == 'nltk':
        train_data_src = read_corpus(train_src_path, corpus_type='src')
        train_data_tgt = read_corpus(train_tgt_path, corpus_type='tgt')
        dev_data_src = read_corpus(dev_src_path, corpus_type='src')
        dev_data_tgt = read_corpus(dev_tgt_path, corpus_type='tgt')
    elif tokenizer == 'spm':
        train_data_src = read_corpus_spm(train_src_path, corpus_type='src', model_path=spm_model_src)
        train_data_tgt = read_corpus_spm(train_tgt_path, corpus_type='tgt', model_path=spm_model_tgt)
        dev_data_src = read_corpus_spm(dev_src_path, corpus_type='src', model_path=spm_model_src)
        dev_data_tgt = read_corpus_spm(dev_tgt_path, corpus_type='tgt', model_path=spm_model_tgt)
    else:
        raise ValueError(f'Incorrect value [{tokenizer}] for tokenizer; should be nltk or spm')

    if subset == 1:
        train_data = list(zip(train_data_src, train_data_tgt))
        dev_data = list(zip(dev_data_src, dev_data_tgt))
    elif 0 < subset < 1:
        num_train = int(subset * len(train_data_src))
        num_dev = int(subset * len(dev_data_src))
        if random_subset:
            train = list(zip(train_data_src, train_data_tgt))
            dev = list(zip(dev_data_src, dev_data_tgt))
            random.shuffle(train)
            random.shuffle(dev)
            train_data_src, train_data_tgt = zip(*train)
            dev_data_src, dev_data_tgt = zip(*dev)
        train_data = list(zip(train_data_src[:num_train], train_data_tgt[:num_train]))
        dev_data = list(zip(dev_data_src[:num_dev], dev_data_tgt[:num_dev]))
    else:
        raise ValueError(f'Incorrect value [{subset}] for subset; should be 0 < subset <= 1')

    if create_vocab and tokenizer == 'nltk':
        src_sents, tgt_sents = zip(*train_data)
        vocab = Vocab.build(src_sents, tgt_sents, vocab_size, vocab_cutoff)
    elif vocab_path is not None:
        vocab = Vocab.load(vocab_path)
    else:
        raise ValueError(
            f'Incorrect combination of tokenizer [{tokenizer}], '
            f'create_vocab [{create_vocab}] and vocab_path [{vocab_path}]')

    return train_data, dev_data, vocab
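
# A minimal usage sketch for load_data_train(), assuming hypothetical data paths
# and a pre-built vocab file; nothing here runs on import.
def _example_load_data_train():
    """Hedged example: load 10% of a German-English corpus with an existing vocab."""
    train_data, dev_data, vocab = load_data_train(
        'data/train.de', 'data/train.en',   # hypothetical training files
        'data/dev.de', 'data/dev.en',       # hypothetical dev files
        tokenizer='nltk',
        create_vocab=False,
        vocab_path='vocab_50000.json',      # hypothetical pre-built vocab file
        subset=0.1,                         # keep 10% of each split
        random_subset=True)
    return train_data, dev_data, vocab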
def trainOld(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], corpus_type='src')
    train_data_tgt = read_corpus(args['--train-tgt'], corpus_type='tgt')

    dev_data_src = read_corpus(args['--dev-src'], corpus_type='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], corpus_type='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                      % (epoch, train_iter,
                         report_loss / report_examples,
                         math.exp(report_loss / report_tgt_words),
                         cum_examples,
                         report_tgt_words / (time.time() - train_time),
                         time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                      % (epoch, train_iter,
                         cum_loss / cum_examples,
                         np.exp(cum_loss / cum_tgt_words),
                         cum_examples),
                      file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu; dev batch size can be a bit larger
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizer's state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr,
                              file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path,
                                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
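
# A hedged sketch of how trainOld() might be driven without docopt: the args
# dict below mirrors the cmd-line flags the function reads, with made-up values
# (paths and hyperparameters are hypothetical placeholders). Values are strings
# because docopt yields strings and trainOld() casts them with int()/float().
def _example_train_old():
    """Hypothetical example: call trainOld() with a hand-built args dict."""
    args = {
        '--train-src': 'data/train.de', '--train-tgt': 'data/train.en',
        '--dev-src': 'data/dev.de', '--dev-tgt': 'data/dev.en',
        '--vocab': 'vocab.json', '--save-to': 'model.bin',
        '--batch-size': '32', '--clip-grad': '5.0', '--valid-niter': '2000',
        '--log-every': '10', '--embed-size': '256', '--hidden-size': '256',
        '--dropout': '0.3', '--uniform-init': '0.1', '--lr': '0.001',
        '--lr-decay': '0.5', '--patience': '5', '--max-num-trial': '5',
        '--max-epoch': '30', '--cuda': False,
    }
    trainOld(args)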
def load_data_train(train_src_path,
                    train_tgt_path,
                    dev_src_path,
                    dev_tgt_path,
                    create_vocab=False,
                    vocab_path=None,
                    vocab_size=50000,
                    vocab_cutoff=2,
                    subset=1.0,
                    random_subset=False):
    """ Load the dataset used by the NMT model.
    Params:
        train_src_path (str): Path to the source sentences for training
        train_tgt_path (str): Path to the target sentences for training
        dev_src_path (str): Path to the source sentences for dev
        dev_tgt_path (str): Path to the target sentences for dev
        create_vocab (bool): If True, the vocab is built from the training data;
            otherwise it is loaded from vocab_path. Default=False
        vocab_path (str): Path to the json file with the Vocab. Default=None
        vocab_size (int): Size of the vocabulary for both source and target languages. Default=50000
        vocab_cutoff (int): Drop words that occur fewer than vocab_cutoff times. Default=2
        subset (float): Fraction of the train and dev sets to load, with 0 < subset <= 1. Default=1.0
        random_subset (bool): If True, the subset is sampled randomly; otherwise the
            first elements of the data are taken. Default=False
    Return:
        train_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for training
        dev_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for dev
        vocab (Vocab): Vocab object for source and target
    """
    # read all data
    train_data_src = read_corpus(train_src_path, corpus_type='src')
    train_data_tgt = read_corpus(train_tgt_path, corpus_type='tgt')

    dev_data_src = read_corpus(dev_src_path, corpus_type='src')
    dev_data_tgt = read_corpus(dev_tgt_path, corpus_type='tgt')

    if subset == 1:
        train_data = list(zip(train_data_src, train_data_tgt))
        dev_data = list(zip(dev_data_src, dev_data_tgt))
    elif 0 < subset < 1:
        num_train = int(subset * len(train_data_src))
        num_dev = int(subset * len(dev_data_src))
        if random_subset:
            train = list(zip(train_data_src, train_data_tgt))
            dev = list(zip(dev_data_src, dev_data_tgt))
            random.shuffle(train)
            random.shuffle(dev)
            train_data_src, train_data_tgt = zip(*train)
            dev_data_src, dev_data_tgt = zip(*dev)
        train_data = list(zip(train_data_src[:num_train], train_data_tgt[:num_train]))
        dev_data = list(zip(dev_data_src[:num_dev], dev_data_tgt[:num_dev]))
    else:
        raise ValueError(f'Incorrect value [{subset}] for subset; should be 0 < subset <= 1')

    if create_vocab:
        src_sents, tgt_sents = zip(*train_data)
        vocab = Vocab.build(src_sents, tgt_sents, vocab_size, vocab_cutoff)
    elif vocab_path is not None:
        vocab = Vocab.load(vocab_path)
    else:
        raise ValueError('vocab_path is None and create_vocab is False')

    return train_data, dev_data, vocab