def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size,
                                             max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training:
        model.train(was_training)

    return hypotheses
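# Usage sketch (added; not part of the original sources). Assumes the NMT class and
# Hypothesis objects used above, where each Hypothesis exposes at least a `.value`
# token list and hypotheses are returned best-first. `trained_model` and `test_sents`
# are hypothetical placeholders.
def translate_top1(trained_model, test_sents):
    """Return the best-scoring token list for every source sentence."""
    all_hyps = beam_search(trained_model, test_sents,
                           beam_size=5, max_decoding_time_step=70)
    return [hyps[0].value for hyps in all_hyps]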
def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()

    # sampling
    print('begin sampling')

    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')
    else:
        test_data_tgt = None

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    beam_size = int(args['--beam-size'])
    max_decoding_time_step = int(args['--max-decoding-time-step'])
    output_file = args['OUTPUT_FILE']
    decode_with_params(model, test_data_src, test_data_tgt, beam_size,
                       max_decoding_time_step, output_file)
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
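# Sketch of the corpus-level BLEU helper called above (added; the original
# implementation is not shown in this file). It assumes references are token lists
# that may carry '<s>'/'</s>' markers and hypotheses are Hypothesis objects with a
# `.value` token list; NLTK's corpus_bleu is used as one reasonable way to score them.
from nltk.translate.bleu_score import corpus_bleu

def compute_corpus_level_bleu_score(references, hypotheses):
    """Return corpus BLEU given gold token lists and top Hypothesis objects."""
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]  # drop sentence-boundary markers
    return corpus_bleu([[ref] for ref in references],
                       [hyp.value for hyp in hypotheses])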
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    _, src_dev, _, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
    _, trg_dev, _, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    trg_vocab_size = len(trg_vocab)
    print(trg_vocab.itos[4])

    original_model = torch.load(open(options.original_model_file, 'rb'))
    nmt = NMT(original_model)  # TODO: add more arguments as necessary
    nmt.eval()

    if use_cuda:
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()
    # optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)

    total_loss = 0
    num_sents = 0
    for i, batch_i in enumerate(utils.rand.srange(len(batched_dev_src))):
        print("{0}/ {1}".format(i, len(batched_dev_src)))
        dev_src_batch = Variable(batched_dev_src[batch_i])  # of size (src_seq_len, batch_size)
        dev_trg_batch = Variable(batched_dev_trg[batch_i])  # of size (trg_seq_len, batch_size)
        dev_src_mask = Variable(batched_dev_src_mask[batch_i])
        dev_trg_mask = Variable(batched_dev_trg_mask[batch_i])
        if use_cuda:
            dev_src_batch = dev_src_batch.cuda()
            dev_trg_batch = dev_trg_batch.cuda()
            dev_src_mask = dev_src_mask.cuda()
            dev_trg_mask = dev_trg_mask.cuda()
        num_sents += 1

        sys_out_batch = nmt(dev_src_batch, dev_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size)  # TODO: add more arguments as necessary

        dev_trg_mask = dev_trg_mask.view(-1)
        dev_trg_batch = dev_trg_batch.view(-1)
        dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
        dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
        sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
        sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)

        loss = criterion(sys_out_batch, dev_trg_batch)
        # _, max = torch.max(sys_out_batch, dim=1)
        # print(sys_out_batch[dev_trg_batch])
        # print(max, dev_trg_batch)
        total_loss += loss
        # break

    print(total_loss, num_sents)
    print(total_loss / num_sents)
    print(torch.exp(total_loss / num_sents))
def test(self):
    print('*' * 20, 'start test', '*' * 20)
    self.model = NMT.load(self.hparams.model_save_path, self.device)

    sources, references, hypotheses = self.beam_search()
    bleu_score = compute_corpus_level_bleu_score(references, hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100))

    with open(self.hparams.test_res_path, 'w') as f:
        for src_sent, hypo in zip(sources, hypotheses):
            src_sent = ' '.join(src_sent)
            hypo_sent = ' '.join(hypo.value)
            f.write(src_sent + '\n' + hypo_sent + '\n\n')
    print('save test result to {}'.format(self.hparams.test_res_path))
    print('*' * 20, 'end test', '*' * 20)
def create_model(sess, args, src_vocab_size, tgt_vocab_size, src_vocab_rev, tgt_vocab_rev,
                 mode=constants.TRAIN, reuse=None, load_pretrained_model=False,
                 direction="", model_save_dir=None):
    sess.run(tf.tables_initializer())
    with tf.variable_scope(constants.NMT_VAR_SCOPE + direction, reuse=reuse):
        with tf.variable_scope("src"):
            src_emb = tf.get_variable("embedding", shape=[src_vocab_size, args.emb_dim])
        with tf.variable_scope("dst"):
            tgt_emb = tf.get_variable("embedding", shape=[tgt_vocab_size, args.emb_dim])

        model = NMT(mode, args.__dict__, src_vocab_size, tgt_vocab_size,
                    src_emb, tgt_emb, src_vocab_rev, tgt_vocab_rev, direction)

    if load_pretrained_model:
        if model_save_dir is None:
            model_save_dir = args.nmt_model_save_dir
        if direction not in model_save_dir:
            if direction[::-1] in model_save_dir:
                model_save_dir = re.sub(direction[::-1], direction, model_save_dir)
            else:
                model_save_dir = os.path.join(model_save_dir, direction)
        print(model_save_dir)
        try:
            print("Loading nmt model from", model_save_dir)
            model.saver.restore(sess, model_save_dir)
        except Exception as e:
            print("Error! Loading nmt model from", model_save_dir)
            print("Again! Loading nmt model from", tf.train.latest_checkpoint(model_save_dir))
            model.saver.restore(sess, tf.train.latest_checkpoint(model_save_dir))
    else:
        if reuse is None:
            print("Creating model with new parameters.")
            sess.run(tf.global_variables_initializer())
        else:
            print("Reuse parameters.")
    return model
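# Usage sketch (added; `sess_config`, `hparams`, the vocab sizes/reverse tables, and the
# "en2de" direction tag are placeholders, not names taken from the original project):
# build one copy of the graph with fresh parameters, then a second copy that shares the
# same variables via `reuse=True`.
with tf.Session(config=sess_config) as sess:
    train_model = create_model(sess, hparams, src_vocab_size, tgt_vocab_size,
                               src_vocab_rev, tgt_vocab_rev,
                               mode=constants.TRAIN, direction="en2de")
    shared_model = create_model(sess, hparams, src_vocab_size, tgt_vocab_size,
                                src_vocab_rev, tgt_vocab_rev,
                                mode=constants.TRAIN, reuse=True, direction="en2de")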
def beam(args):
    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.train()
    # model.eval()
    model = model.cuda()

    # loss function
    loss_fn = torch.nn.NLLLoss()

    # sampling
    print('begin beam searching')
    src_sent = ['we', 'have', 'told', 'that', '.']
    hyps = model.beam(src_sent)
    print('src_sent:', ' '.join(src_sent))
    for ids, hyp, dist in hyps:
        print('tgt_sent:', ' '.join(hyp))
        print('tgt_ids :', end=' ')
        for id in ids:
            print(id, end=', ')
        print()
        print('out_dist:', dist)

        var_ids = torch.autograd.Variable(torch.LongTensor(ids[1:]), requires_grad=False)
        loss = loss_fn(dist, var_ids)
        print('NLL loss =', loss)
        loss.backward()
def main(options):
    _, _, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    _, _, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    nmt = NMT(src_vocab_size, trg_vocab_size)
    nmt = torch.load(open(options.modelname, 'rb'))
    nmt.eval()

    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    with open('data/output_tanay.txt', 'w') as f_write:
        for i in range(len(src_test)):
            src = to_var(torch.unsqueeze(src_test[i], 1), volatile=True)
            trg = to_var(torch.unsqueeze(trg_test[i], 1), volatile=True)
            results = nmt(src, trg)
            s = ""
            for ix in results:
                idx = np.argmax(ix.data.cpu().numpy())
                if idx == 2:  # if <s>, don't write it
                    continue
                if idx == 3:  # if </s>, end the loop
                    break
                s += trg_vocab.itos[idx] + " "
            # print s.encode('utf-8')
            # if len(s): s += '\n'
            s += '\n'
            f_write.write(s.encode('utf-8'))
def main(options): src_train, src_dev, src_test, src_vocab = torch.load( open(options.data_file + "." + options.src_lang, 'rb')) trg_train, trg_dev, trg_test, trg_vocab = torch.load( open(options.data_file + "." + options.trg_lang, 'rb')) batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize( src_train, options.batch_size, src_vocab.stoi["<blank>"]) batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort( trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize( src_dev, options.batch_size, src_vocab.stoi["<blank>"]) batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort( trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) src_vocab_size = len(src_vocab) trg_vocab_size = len(trg_vocab) nmt = NMT(src_vocab_size, trg_vocab_size) # TODO: add more arguments as necessary if torch.cuda.is_available(): nmt.cuda() else: nmt.cpu() criterion = torch.nn.NLLLoss() optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate) # main training loop last_dev_avg_loss = float("inf") for epoch_i in range(options.epochs): logging.info("At {0}-th epoch.".format(epoch_i)) # srange generates a lazy sequence of shuffled range for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))): train_src_batch = to_var(batched_train_src[batch_i] ) # of size (src_seq_len, batch_size) train_trg_batch = to_var(batched_train_trg[batch_i] ) # of size (src_seq_len, batch_size) train_src_mask = to_var(batched_train_src_mask[batch_i]) train_trg_mask = to_var(batched_train_trg_mask[batch_i]) sys_out_batch = nmt( train_src_batch, train_trg_batch, training=True ) # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary train_trg_mask = train_trg_mask.view(-1) train_trg_batch = train_trg_batch.view(-1) train_trg_batch = train_trg_batch.masked_select(train_trg_mask) train_trg_mask = train_trg_mask.unsqueeze(1).expand( len(train_trg_mask), trg_vocab_size) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view( -1, trg_vocab_size) loss = criterion(sys_out_batch, train_trg_batch) # logging.debug("loss at batch {0}: {1}".format(i, loss.data[0])) optimizer.zero_grad() loss.backward() optimizer.step() # validation -- this is a crude esitmation because there might be some paddings at the end dev_loss = 0.0 for batch_i in range(len(batched_dev_src)): dev_src_batch = to_var(batched_dev_src[batch_i], volatile=True) dev_trg_batch = to_var(batched_dev_trg[batch_i], volatile=True) dev_src_mask = to_var(batched_dev_src_mask[batch_i], volatile=True) dev_trg_mask = to_var(batched_dev_trg_mask[batch_i], volatile=True) sys_out_batch = nmt( dev_src_batch, dev_trg_batch ) # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary dev_trg_mask = dev_trg_mask.view(-1) dev_trg_batch = dev_trg_batch.view(-1) dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask) dev_trg_mask = dev_trg_mask.unsqueeze(1).expand( len(dev_trg_mask), trg_vocab_size) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view( -1, trg_vocab_size) #sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size) loss = criterion(sys_out_batch, dev_trg_batch) # logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0])) dev_loss += loss 
dev_avg_loss = dev_loss / len(batched_dev_src) logging.info( "Average loss value per instance is {0} at the end of epoch {1}". format(dev_avg_loss.data[0], epoch_i)) # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop: # logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0])) # break torch.save( nmt, open( options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill) last_dev_avg_loss = dev_avg_loss
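# Added sketch (not from the original repos): a minimal, self-contained illustration of
# the mask-and-select pattern used in the train/dev loops above, written against current
# PyTorch where masks are boolean tensors rather than byte Variables. An equivalent and
# usually simpler alternative is torch.nn.NLLLoss(ignore_index=pad_id).
import torch

# Toy shapes: trg_seq_len=4, batch_size=2, vocab=5; id 0 plays the role of <blank>/padding.
log_probs = torch.randn(4, 2, 5).log_softmax(dim=-1)       # (trg_seq_len, batch, vocab)
targets = torch.tensor([[1, 2], [3, 4], [2, 0], [0, 0]])   # (trg_seq_len, batch)
mask = targets.ne(0)                                       # True at non-padding positions

flat_mask = mask.view(-1)                                  # (seq * batch,)
flat_targets = targets.view(-1).masked_select(flat_mask)   # keep only real tokens
flat_out = log_probs.view(-1, 5)
flat_out = flat_out.masked_select(flat_mask.unsqueeze(1).expand(-1, 5)).view(-1, 5)

loss = torch.nn.NLLLoss()(flat_out, flat_targets)          # averaged over real tokens only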
def main(options): use_cuda = (len(options.gpuid) >= 1) if options.gpuid: cuda.set_device(options.gpuid[0]) src_train, src_dev, src_test, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb')) trg_train, trg_dev, trg_test, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb')) batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(src_train, options.batch_size, src_vocab.stoi["<blank>"]) batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(src_dev, options.batch_size, src_vocab.stoi["<blank>"]) batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) trg_vocab_size = len(trg_vocab) src_vocab_size = len(src_vocab) word_emb_size = 300 hidden_size = 1024 nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size, src_vocab, trg_vocab, attn_model = "general", use_cuda = True) if use_cuda > 0: nmt.cuda() if options.distributed: nmt = torch.nn.DataParallel(nmt) else: nmt.cpu() criterion = torch.nn.NLLLoss() # Configure optimization lr = options.learning_rate optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), lr) # main training loop last_dev_avg_loss = float("inf") for epoch_i in range(options.epochs): logging.info("At {0}-th epoch.".format(epoch_i)) # Set training mode nmt.train() # srange generates a lazy sequence of shuffled range for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))): train_src_batch = Variable(batched_train_src[batch_i]) # of size (src_seq_len, batch_size) train_trg_batch = Variable(batched_train_trg[batch_i]) # of size (src_seq_len, batch_size) train_src_mask = Variable(batched_train_src_mask[batch_i]) train_trg_mask = Variable(batched_train_trg_mask[batch_i]) if use_cuda: train_src_batch = train_src_batch.cuda() train_trg_batch = train_trg_batch.cuda() train_src_mask = train_src_mask.cuda() train_trg_mask = train_trg_mask.cuda() sys_out_batch = nmt(train_src_batch, train_trg_batch, True) del train_src_batch train_trg_mask = train_trg_mask.view(-1) train_trg_batch = train_trg_batch.view(-1) train_trg_batch = train_trg_batch.masked_select(train_trg_mask) train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size) loss = criterion(sys_out_batch, train_trg_batch) logging.debug("loss at batch {0}: {1}".format(i, loss.data[0])) optimizer.zero_grad() loss.backward() # # gradient clipping torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0) optimizer.step() # validation -- this is a crude esitmation because there might be some paddings at the end dev_loss = 0.0 # Set validation mode nmt.eval() for batch_i in range(len(batched_dev_src)): dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True) dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True) dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True) dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True) if use_cuda: dev_src_batch = dev_src_batch.cuda() dev_trg_batch = dev_trg_batch.cuda() dev_src_mask = dev_src_mask.cuda() dev_trg_mask = dev_trg_mask.cuda() sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False) 
dev_trg_mask = dev_trg_mask.view(-1) dev_trg_batch = dev_trg_batch.view(-1) dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask) dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size) loss = criterion(sys_out_batch, dev_trg_batch) logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0])) dev_loss += loss dev_avg_loss = dev_loss / len(batched_dev_src) logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i)) # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop: # logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0])) # break torch.save(nmt, open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill) last_dev_avg_loss = dev_avg_loss
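# Added note/sketch (not from the original source): in current PyTorch the in-place
# gradient-clipping helper is `clip_grad_norm_`; the un-suffixed `clip_grad_norm` used
# above is the deprecated 0.x-era name, and `Variable` wrapping is no longer needed.
# A present-day version of the inner update step could look like this.
import torch

def clipped_update(model, optimizer, loss, max_norm=5.0):
    """One optimizer step with the total gradient norm clipped to max_norm."""
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()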
def init_model(vocab_sizes, use_cuda, model_file_name):
    nmt = NMT(vocab_sizes, use_cuda)
    model.load_state(model_file_name, nmt)
    return nmt
def main(options): original_model = torch.load(open(options.original_model_file, 'rb')) nmt = NMT(original_model) use_cuda = (len(options.gpuid) >= 1) if options.gpuid: cuda.set_device(options.gpuid[0]) src_train, src_dev, src_test, src_vocab = torch.load( open(options.data_file + "." + options.src_lang, 'rb')) trg_train, trg_dev, trg_test, trg_vocab = torch.load( open(options.data_file + "." + options.trg_lang, 'rb')) batched_test_src, batched_test_src_mask, _ = utils.tensor.advanced_batchize( src_test, 1, src_vocab.stoi["<pad>"]) batched_test_trg, batched_test_trg_mask, _ = utils.tensor.advanced_batchize( trg_test, 1, trg_vocab.stoi["<pad>"]) batched_train_src, batched_train_src_mask, _ = utils.tensor.advanced_batchize( src_train, 1, src_vocab.stoi["<pad>"]) batched_train_trg, batched_train_trg_mask, _ = utils.tensor.advanced_batchize( trg_train, 1, trg_vocab.stoi["<pad>"]) batched_dev_src, batched_dev_src_mask, _ = utils.tensor.advanced_batchize( src_dev, options.batch_size, src_vocab.stoi["<pad>"]) batched_dev_trg, batched_dev_trg_mask, _ = utils.tensor.advanced_batchize( trg_dev, options.batch_size, trg_vocab.stoi["<pad>"]) trg_vocab_size = len(trg_vocab) src_vocab_size = len(src_vocab) # for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))): # train_src_batch = Variable(batched_train_src[batch_i]) # of size (src_seq_len, batch_size) # train_trg_batch = Variable(batched_train_trg[batch_i]) # of size (src_seq_len, batch_size) # train_src_mask = Variable(batched_train_src_mask[batch_i]) # train_trg_mask = Variable(batched_train_trg_mask[batch_i]) # if use_cuda: # train_src_batch = train_src_batch.cuda() # train_trg_batch = train_trg_batch.cuda() # train_src_mask = train_src_mask.cuda() # train_trg_mask = train_trg_mask.cuda() # sys_out_batch = nmt(train_src_batch, train_trg_batch.size()[0]) # # print(sys_out_batch.size()) # _, sys_out_batch = torch.max(sys_out_batch, dim=2) # sys_out_batch = sys_out_batch.view(-1) # sent = [] # # print(sys_out_batch) # for w in sys_out_batch: # # print(w) # sent.append(trg_vocab.itos[w.data[0]]) # print(sent) # # Initialize encoder with weights parameters from original model # encoder = nn.LSTM(300, 512, bidirectional=True) # encoder.weight_ih_l0 = nn.Parameter(original_model['encoder.rnn.weight_ih_l0']) # encoder.weight_hh_l0 = nn.Parameter(original_model['encoder.rnn.weight_hh_l0']) # encoder.bias_ih_l0 = nn.Parameter(original_model['encoder.rnn.bias_ih_l0']) # encoder.bias_hh_l0 = nn.Parameter(original_model['encoder.rnn.bias_hh_l0']) # encoder.weight_ih_l0_reverse = nn.Parameter(original_model['encoder.rnn.weight_ih_l0_reverse']) # encoder.weight_hh_l0_reverse = nn.Parameter(original_model['encoder.rnn.weight_hh_l0_reverse']) # encoder.bias_ih_l0_reverse = nn.Parameter(original_model['encoder.rnn.bias_ih_l0_reverse']) # encoder.bias_hh_l0_reverse = nn.Parameter(original_model['encoder.rnn.bias_hh_l0_reverse']) # # Initialize decoder with weights parameters from original model # decoder = nn.LSTM(1324, 1024) # decoder.weight_ih_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.weight_ih']) # decoder.weight_hh_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.weight_hh']) # decoder.bias_ih_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.bias_ih']) # decoder.bias_hh_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.bias_hh']) # if use_cuda > 0: # encoder.cuda() # decoder.cuda() # else: # encoder.cpu() # decoder.cpu() # # Initialize embeddings # encoder_embedding = nn.Embedding(36616, 300) # decoder_embedding = 
nn.Embedding(23262, 300) # encoder_embedding.weight = nn.Parameter(original_model['encoder.embeddings.emb_luts.0.weight']) # decoder_embedding.weight = nn.Parameter(original_model['decoder.embeddings.emb_luts.0.weight']) # # Initialize Ws # wi = nn.Linear(1024,1024, bias=False) # wi.weight = nn.Parameter(original_model['decoder.attn.linear_in.weight']) # wo = nn.Linear(2048, 1024, bias=False) # wo.weight = nn.Parameter(original_model['decoder.attn.linear_out.weight']) # generator = nn.Linear(1024, 23262) # generator.weight = nn.Parameter(original_model['0.weight']) # generator.bias = nn.Parameter(original_model['0.bias']) criterion = torch.nn.NLLLoss() # encoder_optimizer = eval("torch.optim." + options.optimizer)(encoder.parameters(), options.learning_rate) # decoder_optimizer = eval("torch.optim." + options.optimizer)(decoder.parameters(), options.learning_rate) # soft_max = nn.Softmax() optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate) # main training loop last_dev_avg_loss = float("inf") for epoch_i in range(options.epochs): logging.info("At {0}-th epoch.".format(epoch_i)) h_t_1 = Variable(torch.ones(1024)) # srange generates a lazy sequence of shuffled range for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))): train_src_batch = Variable(batched_train_src[batch_i] ) # of size (src_seq_len, batch_size) train_trg_batch = Variable(batched_train_trg[batch_i] ) # of size (src_seq_len, batch_size) train_src_mask = Variable(batched_train_src_mask[batch_i]) train_trg_mask = Variable(batched_train_trg_mask[batch_i]) if use_cuda: train_src_batch = train_src_batch.cuda() train_trg_batch = train_trg_batch.cuda() train_src_mask = train_src_mask.cuda() train_trg_mask = train_trg_mask.cuda() # encoder_input = encoder_embedding(train_trg_batch) # sys_out_batch, (encoder_hidden_states, _) = encoder(encoder_input) # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary # h = Variable(torch.FloatTensor(sys_out_batch.size()[1], 1024).fill_(1./1024)) # c = Variable(torch.FloatTensor(sys_out_batch.size()[1], 1024).fill_(0)) # softmax = torch.nn.Softmax() # tanh = torch.nn.Tanh() # # _,w = torch.max(softmax(generator(h)), dim=1) # w = softmax(generator(h)) # result = Variable(torch.FloatTensor(sys_out_batch.size()[0], sys_out_batch.size()[1], 23262)) # for i in range(sys_out_batch.size()[0]): # wht1 = wi(h).view(1, -1, 1024).expand_as(sys_out_batch) # score = softmax(torch.sum(sys_out_batch * wht1, dim=2)).view(sys_out_batch.size()[0],sys_out_batch.size()[1],1) # st = torch.sum(score * sys_out_batch, dim=0) # ct = tanh(wo(torch.cat([st, h], dim=1))) # _, w = torch.max(w, dim=1) # input = torch.cat([decoder_embedding(w), ct], dim=1) # input = input.view(1, input.size()[0], input.size()[1]) # _,(b,c) = decoder(input, (h,c)) # h = b[0] # c = c[0] # w = softmax(generator(h)) # result[i] = w # # result.append(w) # sys_out_batch = result sys_out_batch = nmt(train_src_batch, train_trg_batch.size()[0]) # s_vector = [] # for hs in sys_out_batch: # score = hs.matmul(wi).matmul(h_t_1) # score = score.unsqueeze(0) # a_h_s = soft_max(score) # # print a_h_s, hs.squeeze(0) # s_vector.append(a_h_s.squeeze(0).dot(hs.squeeze(0))) # s_tilda = sum(s_vector) # c_t = nn.Tanh(wo.matmul(torch.cat(s_tilda, h_t_1))) # sys.exit() # train_trg_mask = train_trg_mask.view(-1) # train_trg_batch = train_trg_batch.view(-1) # train_trg_batch = train_trg_batch.masked_select(train_trg_mask) # train_trg_mask = 
train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size) # sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) # sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size) print(train_trg_mask.size()) train_trg_mask = train_trg_mask.view(-1) train_trg_batch = train_trg_batch.view(-1) train_trg_batch = train_trg_batch.masked_select(train_trg_mask) train_trg_mask = train_trg_mask.unsqueeze(1).expand( len(train_trg_mask), trg_vocab_size - 1) # print(trainin.size()) # print(train_trg_batch[:,:-1].size()) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size - 1) print(trg_vocab_size) print(train_trg_mask.size()) sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view( -1, trg_vocab_size - 1) print(sys_out_batch.size()) loss = criterion(sys_out_batch, train_trg_batch) logging.debug("loss at batch {0}: {1}".format(i, loss.data[0])) optimizer.zero_grad() loss.backward() optimizer.step() # validation -- this is a crude esitmation because there might be some paddings at the end dev_loss = 0.0 for batch_i in range(len(batched_dev_src)): dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True) dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True) dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True) dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True) if use_cuda: dev_src_batch = dev_src_batch.cuda() dev_trg_batch = dev_trg_batch.cuda() dev_src_mask = dev_src_mask.cuda() dev_trg_mask = dev_trg_mask.cuda() # encoder_input = encoder_embedding(dev_trg_batch) # sys_out_batch = encoder(encoder_input) # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary sys_out_batch = nmt(dev_src_batch, dev_trg_batch.size()[0]) dev_trg_mask = dev_trg_mask.view(-1) dev_trg_batch = dev_trg_batch.view(-1) dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask) dev_trg_mask = dev_trg_mask.unsqueeze(1).expand( len(dev_trg_mask), trg_vocab_size - 1) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size - 1) sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view( -1, trg_vocab_size - 1) loss = criterion(sys_out_batch, dev_trg_batch) logging.debug("dev loss at batch {0}: {1}".format( batch_i, loss.data[0])) dev_loss += loss dev_avg_loss = dev_loss / len(batched_dev_in) logging.info( "Average loss value per instance is {0} at the end of epoch {1}". format(dev_avg_loss.data[0], epoch_i)) if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop: logging.info( "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})" .format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0])) break torch.save( nmt.state_dict(), open( options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill) last_dev_avg_loss = dev_avg_loss
def main(options): use_cuda = (len(options.gpuid) >= 1) # if options.gpuid: # cuda.set_device(options.gpuid[0]) src_train, src_dev, src_test, src_vocab = torch.load( open(options.data_file + "." + options.src_lang, 'rb')) trg_train, trg_dev, trg_test, trg_vocab = torch.load( open(options.data_file + "." + options.trg_lang, 'rb')) batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize( src_train, options.batch_size, src_vocab.stoi["<blank>"]) batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort( trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize( src_dev, options.batch_size, src_vocab.stoi["<blank>"]) batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort( trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) print "preprocessing batched data..." processed_src = list() processed_trg = list() processed_src_mask = list() processed_trg_mask = list() for batch_i in range(len(batched_train_src)): if batched_train_src[batch_i].size( 0) <= 35 and batched_train_trg[batch_i].size(0) <= 35: processed_src.append(batched_train_src[batch_i]) processed_trg.append(batched_train_trg[batch_i]) processed_src_mask.append(batched_train_src_mask[batch_i]) processed_trg_mask.append(batched_train_trg_mask[batch_i]) batched_train_src = processed_src batched_train_trg = processed_trg batched_train_src_mask = processed_src_mask batched_train_trg_mask = processed_trg_mask processed_src = list() processed_trg = list() processed_src_mask = list() processed_trg_mask = list() for batch_i in range(len(batched_dev_src)): if batched_dev_src[batch_i].size( 0) <= 35 and batched_dev_trg[batch_i].size(0) <= 35: processed_src.append(batched_dev_src[batch_i]) processed_trg.append(batched_dev_trg[batch_i]) processed_src_mask.append(batched_dev_src_mask[batch_i]) processed_trg_mask.append(batched_dev_trg_mask[batch_i]) batched_dev_src = processed_src batched_dev_trg = processed_trg batched_dev_src_mask = processed_src_mask batched_dev_trg_mask = processed_trg_mask del processed_src, processed_trg, processed_trg_mask, processed_src_mask trg_vocab_size = len(trg_vocab) src_vocab_size = len(src_vocab) word_emb_size = 50 hidden_size = 1024 nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size, src_vocab, trg_vocab, attn_model="general", use_cuda=True) discriminator = Discriminator(src_vocab_size, trg_vocab_size, word_emb_size, src_vocab, trg_vocab, use_cuda=True) if use_cuda > 0: #nmt = torch.nn.DataParallel(nmt,device_ids=options.gpuid).cuda() nmt.cuda() #discriminator = torch.nn.DataParallel(discriminator,device_ids=options.gpuid).cuda() discriminator.cuda() else: nmt.cpu() discriminator.cpu() criterion_g = torch.nn.NLLLoss().cuda() criterion = torch.nn.CrossEntropyLoss().cuda() # Configure optimization optimizer_g = eval("torch.optim." + options.optimizer)( nmt.parameters(), options.learning_rate) optimizer_d = eval("torch.optim." 
+ options.optimizer)( discriminator.parameters(), options.learning_rate) # main training loop f1 = open("train_loss", "a") f2 = open("dev_loss", "a") last_dev_avg_loss = float("inf") for epoch_i in range(options.epochs): logging.info("At {0}-th epoch.".format(epoch_i)) # srange generates a lazy sequence of shuffled range train_loss_g = 0.0 train_loss_d = 0.0 train_loss_g_nll = 0.0 train_loss_g_ce = 0.0 train_loss_nll_batch_num = 0 train_loss_ce_batch_num = 0 for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))): if i == 1500: break # if i==5: # break train_src_batch = Variable(batched_train_src[batch_i] ) # of size (src_seq_len, batch_size) train_trg_batch = Variable(batched_train_trg[batch_i] ) # of size (src_seq_len, batch_size) train_src_mask = Variable(batched_train_src_mask[batch_i]) train_trg_mask = Variable(batched_train_trg_mask[batch_i]) if use_cuda: train_src_batch = train_src_batch.cuda() train_trg_batch = train_trg_batch.cuda() train_src_mask = train_src_mask.cuda() train_trg_mask = train_trg_mask.cuda() # train discriminator sys_out_batch = nmt(train_src_batch, train_trg_batch, True).detach() _, predict_batch = sys_out_batch.topk(1) del _ predict_batch = predict_batch.squeeze(2) real_dis_label_out = discriminator(train_src_batch, train_trg_batch, True) fake_dis_label_out = discriminator(train_src_batch, predict_batch, True) optimizer_d.zero_grad() loss_d_real = criterion( real_dis_label_out, Variable( torch.ones(options.batch_size * len(options.gpuid)).long()).cuda()) loss_d_real.backward() loss_d_fake = criterion( fake_dis_label_out, Variable( torch.zeros(options.batch_size * len(options.gpuid)).long()).cuda()) #loss_d_fake.backward(retain_graph=True) loss_d_fake.backward() loss_d = loss_d_fake.data[0] + loss_d_real.data[0] del loss_d_fake, loss_d_real logging.debug("D loss at batch {0}: {1}".format(i, loss_d)) f1.write("D train loss at batch {0}: {1}\n".format(i, loss_d)) optimizer_d.step() if use_cuda > 0: sys_out_batch = sys_out_batch.cuda() train_trg_batch = train_trg_batch.cuda() else: sys_out_batch = sys_out_batch.cpu() train_trg_batch = train_trg_batch.cpu() # train nmt sys_out_batch = nmt(train_src_batch, train_trg_batch, True) _, predict_batch = sys_out_batch.topk(1) predict_batch = predict_batch.squeeze(2) fake_dis_label_out = discriminator(train_src_batch, predict_batch, True) if random.random() > 0.5: train_trg_mask = train_trg_mask.view(-1) train_trg_batch = train_trg_batch.view(-1) train_trg_batch = train_trg_batch.masked_select(train_trg_mask) train_trg_mask = train_trg_mask.unsqueeze(1).expand( len(train_trg_mask), trg_vocab_size) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select( train_trg_mask).view(-1, trg_vocab_size) loss_g = criterion_g(sys_out_batch, train_trg_batch) train_loss_g_nll += loss_g train_loss_nll_batch_num += 1 f1.write("G train NLL loss at batch {0}: {1}\n".format( i, loss_g.data[0])) else: loss_g = criterion( fake_dis_label_out, Variable( torch.ones(options.batch_size * len(options.gpuid)).long()).cuda()) train_loss_g_ce += loss_g train_loss_ce_batch_num += 1 f1.write("G train CE loss at batch {0}: {1}\n".format( i, loss_g.data[0])) logging.debug("G loss at batch {0}: {1}".format(i, loss_g.data[0])) optimizer_g.zero_grad() loss_g.backward() # # gradient clipping torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0) optimizer_g.step() train_loss_d += loss_d train_avg_loss_g_nll = train_loss_g_nll / train_loss_nll_batch_num train_avg_loss_g_ce = train_loss_g_ce / 
train_loss_ce_batch_num train_avg_loss_d = train_loss_d / len(train_src_batch) logging.info( "G TRAIN Average NLL loss value per instance is {0} at the end of epoch {1}" .format(train_avg_loss_g_nll, epoch_i)) logging.info( "G TRAIN Average CE loss value per instance is {0} at the end of epoch {1}" .format(train_avg_loss_g_ce, epoch_i)) logging.info( "D TRAIN Average loss value per instance is {0} at the end of epoch {1}" .format(train_avg_loss_d, epoch_i)) # validation -- this is a crude esitmation because there might be some paddings at the end # dev_loss_g_nll = 0.0 # dev_loss_g_ce = 0.0 # dev_loss_d = 0.0 # for batch_i in range(len(batched_dev_src)): # dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True) # dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True) # dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True) # dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True) # if use_cuda: # dev_src_batch = dev_src_batch.cuda() # dev_trg_batch = dev_trg_batch.cuda() # dev_src_mask = dev_src_mask.cuda() # dev_trg_mask = dev_trg_mask.cuda() # sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False).detach() # _,predict_batch = sys_out_batch.topk(1) # predict_batch = predict_batch.squeeze(2) # real_dis_label_out = discriminator(dev_src_batch, dev_trg_batch, True).detach() # fake_dis_label_out = discriminator(dev_src_batch, predict_batch, True).detach() # if use_cuda > 0: # sys_out_batch = sys_out_batch.cuda() # dev_trg_batch = dev_trg_batch.cuda() # else: # sys_out_batch = sys_out_batch.cpu() # dev_trg_batch = dev_trg_batch.cpu() # dev_trg_mask = dev_trg_mask.view(-1) # dev_trg_batch = dev_trg_batch.view(-1) # dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask) # dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size) # sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) # sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size) # loss_g_nll = criterion_g(sys_out_batch, dev_trg_batch) # loss_g_ce = criterion(fake_dis_label_out, Variable(torch.ones(options.batch_size*len(options.gpuid)).long(),volatile=True).cuda()) # loss_d = criterion(real_dis_label_out, Variable(torch.ones(options.batch_size*len(options.gpuid)).long(),volatile=True).cuda()) + criterion(fake_dis_label_out, Variable(torch.zeros(options.batch_size*len(options.gpuid)).long(),volatile=True).cuda()) # logging.debug("G dev NLL loss at batch {0}: {1}".format(batch_i, loss_g_nll.data[0])) # logging.debug("G dev CE loss at batch {0}: {1}".format(batch_i, loss_g_ce.data[0])) # f2.write("G dev NLL loss at batch {0}: {1}\n".format(batch_i, loss_g_nll.data[0])) # f2.write("G dev CE loss at batch {0}: {1}\n".format(batch_i, loss_g_ce.data[0])) # logging.debug("D dev loss at batch {0}: {1}".format(batch_i, loss_d.data[0])) # f2.write("D dev loss at batch {0}: {1}\n".format(batch_i, loss_d.data[0])) # dev_loss_g_nll += loss_g_nll # dev_loss_g_ce += loss_g_ce # dev_loss_d += loss_d # dev_avg_loss_g_nll = dev_loss_g_nll / len(batched_dev_src) # dev_avg_loss_g_ce = dev_loss_g_ce / len(batched_dev_src) # dev_avg_loss_d = dev_loss_d / len(batched_dev_src) # logging.info("G DEV Average NLL loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_g_nll.cpu().data[0], epoch_i)) # logging.info("G DEV Average CE loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_g_ce.cpu().data[0], epoch_i)) # logging.info("D DEV Average loss value per instance is {0} at the end of epoch 
{1}".format(dev_avg_loss_d.data[0], epoch_i)) # # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop: # # logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0])) # # break torch.save(nmt, open( "nmt.nll_{0:.2f}.epoch_{1}".format( train_avg_loss_g_nll.cpu().data[0], epoch_i), 'wb'), pickle_module=dill) torch.save(discriminator, open( "discriminator.nll_{0:.2f}.epoch_{1}".format( train_avg_loss_d.data[0], epoch_i), 'wb'), pickle_module=dill) f1.close() f2.close()
def train(args: Dict): train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') # [(src_0, tgt_0), (src_1, tgt_1), ..., ] train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] # vocab = Vocab.load(args['--vocab']) vocab = Vocab.build(train_data_src, train_data_tgt, int(args['--vocab-size']), 1) model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=vocab) model.train() print(model) uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) # vocab_mask = torch.ones(len(vocab.tgt)) # vocab_mask[vocab.tgt['<pad>']] = 0 device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) model.save(model_save_path) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') while True: epoch += 1 for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True): train_iter += 1 optimizer.zero_grad() batch_size = len(src_sents) #################### forward pass and compute loss ######################### # example_losses = -model(src_sents, tgt_sents) # (batch_size,) example_losses = model(src_sents, tgt_sents) # [batch_size,] batch_loss = example_losses.sum() loss = batch_loss / batch_size #################### backward pass to compute gradients #################### loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) #################### update model parameters ############################### optimizer.step() #################### do some statistics #################################### batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size #################### print log ############################################# if train_iter % log_every == 0: print( 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % ( epoch, train_iter, report_loss / report_examples, # math.exp(report_loss / report_tgt_words), (report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. ##################### perform validation ################################## if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. 
loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu dev_ppl = evaluate_ppl( model, dev_data, batch_size=128) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) # hypotheses = beam_search(model, dev_data_src, # beam_size=4, # max_decoding_time_step=10) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * float( args['--lr-decay']) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) exit(0)
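# Added sketch (the real evaluate_ppl is not shown in this file): one plausible
# implementation, consistent with how the training loop above reports perplexity as
# exp(total NLL / total number of predicted target words). It assumes model(src, tgt)
# returns per-example log-likelihood scores and that batch_iter yields
# (src_sents, tgt_sents) batches, as in the surrounding code.
import numpy as np
import torch

def evaluate_ppl(model, dev_data, batch_size=32):
    """Corpus-level perplexity of `model` on (src, tgt) sentence pairs."""
    was_training = model.training
    model.eval()
    cum_loss = cum_tgt_words = 0.
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            cum_loss += -model(src_sents, tgt_sents).sum().item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omit leading <s>
    if was_training:
        model.train()
    return np.exp(cum_loss / cum_tgt_words)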
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ do_bleu = '--ignore-test-bleu' not in args or not args['--ignore-test-bleu'] train_data_src = read_corpus(args['--train-src'], source='src', dev_mode=dev_mode) train_data_tgt = read_corpus(args['--train-tgt'], source='tgt', dev_mode=dev_mode) dev_data_src = read_corpus(args['--dev-src'], source='src', dev_mode=dev_mode) dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt', dev_mode=dev_mode) if do_bleu: test_data_src = read_corpus(args['--test-src'], source='src', dev_mode=dev_mode) test_data_tgt = read_corpus(args['--test-tgt'], source='tgt', dev_mode=dev_mode) train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) max_tokens_in_sentence = int(args['--max-decoding-time-step']) train_data = clean_data(train_data, max_tokens_in_sentence) dev_data = clean_data(dev_data, max_tokens_in_sentence) train_batch_size = int(args['--batch-size']) dev_batch_size = 128 clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) bleu_niter = int(args['--bleu-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] vocab = Vocab.load(args['--vocab'], args['--word_freq']) model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=vocab) writer = SummaryWriter() # model = TransformerNMT(vocab, num_hidden_layers=3) model.train() uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): if p.dim() > 1: torch.nn.init.xavier_uniform_(p) else: p.data.uniform_(-uniform_init, uniform_init) vocab_mask = torch.ones(len(vocab.tgt)) vocab_mask[vocab.tgt['<pad>']] = 0 device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print("Sorting dataset based on difficulty...") dataset = (train_data, dev_data) ordered_dataset = load_order(args['--order-name'], dataset, vocab) # TODO: order = balance_order(order, dataset) (train_data, dev_data) = ordered_dataset visualize_scoring_examples = False if visualize_scoring_examples: visualize_scoring(ordered_dataset, vocab) n_iters = math.ceil(len(train_data) / train_batch_size) print("n_iters per epoch is {}: ({} / {})".format(n_iters, len(train_data), train_batch_size)) max_epoch = int(args['--max-epoch']) max_iters = max_epoch * n_iters print('begin Maximum Likelihood training') print('Using order function: {}'.format(args['--order-name'])) print('Using pacing function: {}'.format(args['--pacing-name'])) while True: epoch += 1 for _ in range(n_iters): # Get pacing data according to train_iter current_train_data, current_dev_data = pacing_data( train_data, dev_data, time=train_iter, warmup_iters=int(args["--warmup-iters"]), method=args['--pacing-name'], tb=writer) # Uniformly sample batches from the paced dataset src_sents, tgt_sents = get_pacing_batch( current_train_data, batch_size=train_batch_size, shuffle=True) train_iter += 1 # ERROR START optimizer.zero_grad() batch_size = len(src_sents) example_losses = 
-model(src_sents, tgt_sents) # (batch_size,) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val: int = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size if train_iter % log_every == 0: print( 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) writer.add_scalar('Loss/train', report_loss / report_examples, train_iter) writer.add_scalar('ppl/train', math.exp(report_loss / report_tgt_words), train_iter) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # evaluate BLEU if train_iter % bleu_niter == 0 and do_bleu: bleu = decode_with_params( model, test_data_src, test_data_tgt, int(args['--beam-size']), int(args['--max-decoding-time-step'])) writer.add_scalar('bleu/test', bleu, train_iter) # perform validation if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu # dev batch size can be a bit larger dev_ppl = evaluate_ppl(model, current_dev_data, batch_size=dev_batch_size) valid_metric = -dev_ppl writer.add_scalar('ppl/valid', dev_ppl, train_iter) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * \ float(args['--lr-decay']) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch >= int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) exit(0)
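# Added sketch (hypothetical; the project's actual order/pacing functions are not shown
# here): a minimal "linear" pacing function of the kind pacing_data() above might
# dispatch to, exposing a growing prefix of the difficulty-ordered training set until
# warmup_iters has elapsed, after which the full set is used.
def linear_pacing(ordered_train_data, time, warmup_iters, start_frac=0.1):
    """Return the currently available slice of the easy-to-hard ordered data."""
    frac = min(1.0, start_frac + (1.0 - start_frac) * time / warmup_iters)
    n = max(1, int(frac * len(ordered_train_data)))
    return ordered_train_data[:n]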
def decode(args):
    option, values = load_model(args.model)
    # option, values = load_average_model(args.model)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    unk_sym = option["unk"]
    eos_sym = option["eos"]
    source_word2vec, target_word2vec = option["word2vecs"]

    count = 0

    doption = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize
    }

    # create graph
    model = NMT(option["num_layers"], option["num_heads"],
                option["attention_dropout"], option["residual_dropout"],
                option["relu_dropout"], option["embedding"], option["hidden"],
                option["filter"], len(isvocab), len(itvocab),
                source_word2vec, target_word2vec)
    model.option = option

    input_file = open(args.corpus, 'r')
    output_file = open(args.translation, 'w')

    with tf.Session(config=config):
        tf.global_variables_initializer().run()
        set_variables(tf.trainable_variables(), values)

        line = input_file.readline()
        while line:
            line_list = line.split()
            data = [line]
            seq, _, seq_len = convert_data(data, svocab, unk_sym, eos_sym)
            t1 = time.time()
            tlist = beamsearch(model, seq, seq_len, **doption)
            t2 = time.time()

            if len(tlist) == 0:
                sys.stdout.write("\n")
                score = -10000.0
            else:
                best, score = tlist[0]
                output_file.write(" ".join(best[:-1]))
                output_file.write("\n")

            count = count + 1
            sys.stderr.write(str(count) + " ")
            sys.stderr.write(str(score) + " " + str(t2 - t1) + "\n")
            line = input_file.readline()

    output_file.close()
    input_file.close()
def train(args): option = default_option() # predefined model names pathname, basename = os.path.split(args.model) modelname = get_filename(basename) autoname = os.path.join(pathname, modelname + ".autosave.pkl") bestname = os.path.join(pathname, modelname + ".best.pkl") # load models if os.path.exists(args.model): opt, params = load_model(args.model) override(option, opt) init = False else: init = True params = None override(option, args_to_dict(args)) print_option(option) # load references if option["references"]: references = load_references(option["references"]) else: references = None # input corpus batch = option["batch"] sortk = option["sort"] or 1 shuffle = option["seed"] if option["shuffle"] else None reader = TextReader(option["corpus"], shuffle) processor = [data_length, data_length] stream = TextIterator(reader, [batch, batch * sortk], processor, option["limit"], option["sort"]) if shuffle and option["indices"] is not None: reader.set_indices(option["indices"]) if args.reset: option["count"] = [0, 0] option["epoch"] = 0 option["cost"] = 0.0 skip_stream(reader, option["count"][1]) # beamsearch option search_opt = { "beamsize": option["beamsize"], "normalize": option["normalize"], "maxlen": option["maxlen"], "minlen": option["minlen"] } # misc svocabs, tvocabs = option["vocabulary"] svocab, isvocab = svocabs tvocab, itvocab = tvocabs unk = option["unk"] eos = option["eos"] source_word2vec, target_word2vec = option["word2vecs"] scale = option["scale"] # set seed np.random.seed(option["seed"]) tf.set_random_seed(option["seed"]) initializer = tf.random_uniform_initializer(-scale, scale) model = NMT(option["num_layers"], option["num_heads"], option["attention_dropout"], option["residual_dropout"], option["relu_dropout"], option["embedding"], option["hidden"], option["filter"], len(isvocab), len(itvocab), source_word2vec, target_word2vec, initializer=initializer) model.option = option # create optimizer optim = Optimizer(model, algorithm=option["optimizer"], norm=True, constraint=("norm", option["norm"])) # create session config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config): tf.global_variables_initializer().run() print "parameters:", count_parameters(tf.trainable_variables()) if not init: set_variables(tf.trainable_variables(), params) def lr_decay_fn(*args, **kwargs): global_step = kwargs["global_step"] step = kwargs["step"] epoch = kwargs["epoch"] option["alpha"] = option["alpha"] * option["decay"] msg = "G/E/S: %d/%d/%d alpha: %f" print(msg % (global_step, epoch, step, option["alpha"])) def train_step_fn(data, **variables): alpha = option["alpha"] global_step = variables["global_step"] step = variables["step"] epoch = variables["epoch"] xdata, _, xlen = convert_data(data[0], svocab, unk, eos) ydata, _, ylen = convert_data(data[1], tvocab, unk, eos) t1 = time.time() cost, norm = optim.optimize(xdata, xlen, ydata, ylen) alpha = (1 / float(option["embedding"])**0.5) * min( 1 / float(global_step)**0.5, global_step / float(option["warmup"])**1.5) optim.update(alpha=alpha) t2 = time.time() #cost = cost * len(ylen) / sum(ylen) msg = "G/E/S: %d/%d/%d cost: %f norm: %f time: %f" print(msg % (global_step, epoch, step, cost, norm, t2 - t1)) return cost / math.log(2) def sample_fn(*args, **kwargs): data = args[0] batch = len(data[0]) ind = np.random.randint(0, batch) sdata = data[0][ind] tdata = data[1][ind] xdata, _, xlen = convert_data(data[0], svocab, unk, eos) xdata = xdata[ind:ind + 1, :] xlen = xlen[ind:ind + 1] hls = beamsearch(model, xdata, 
xlen, **search_opt) best, score = hls[0] print("> " + sdata) print("> " + tdata) print("> " + " ".join(best[:-1])) def cost_summary(*args, **kwargs): cost = kwargs["local_cost"] global_cost = kwargs["global_cost"] step = kwargs["local_step"] global_step = kwargs["global_step"] ac, gac = cost / step, global_cost / global_step print("averaged cost: %f/%f" % (ac, gac)) def stop_fn(*args, **kwargs): if option["maxepoch"] < kwargs["epoch"]: raise StopIteration def save_fn(*args, **kwargs): save_model(model, autoname, reader, option, **kwargs) def validate_fn(*args, **kwargs): if option["validation"] and references: validate_model(model, option["validation"], references, search_opt, bestname, reader, option, **kwargs) # global/epoch lr_decay_hook = ops.train_loop.hook(option["stop"], 1, lr_decay_fn) # local save_hook = ops.train_loop.hook(0, option["freq"], save_fn) e_save_hook = ops.train_loop.hook(0, 2, save_fn) # local sample_hook = ops.train_loop.hook(0, option["sfreq"], sample_fn) # global/local/epoch validate_hook = ops.train_loop.hook(0, option["vfreq"], validate_fn) e_validate_hook = ops.train_loop.hook(0, 1, validate_fn) # epoch cost_summary_hook = ops.train_loop.hook(0, 1, cost_summary) # global/epoch stop_hook = ops.train_loop.hook(0, 1, stop_fn) global_level_hooks = [] local_level_hooks = [save_hook, sample_hook, validate_hook] epoch_level_hooks = [ lr_decay_hook, cost_summary_hook, e_save_hook, e_validate_hook, stop_hook ] ops.train_loop.train_loop(stream, train_step_fn, option, global_level_hooks, local_level_hooks, epoch_level_hooks) stream.close()
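
# The per-step alpha computed in train_step_fn above is the standard Transformer
# ("Noam") warm-up schedule: lr = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5).
# A small standalone sketch of the same rule (argument names are illustrative):
def transformer_lr(step, d_model=512, warmup=4000):
    """Linear warm-up for `warmup` steps, then inverse-square-root decay."""
    step = max(step, 1)  # guard against step == 0 on the first update
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# e.g. transformer_lr(1) is tiny, the rate peaks around step == warmup,
# then decays slowly as step**-0.5.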
                  en_val, de_val, epoch=epoch, file_id=file_id, file_nums=num_splits)
            file_id += 1


#model=main()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', default='train', help="train or infer?")
    args = parser.parse_args()

    X_word2idx, X_idx2word, Y_word2idx, Y_idx2word = load_idx()
    en_val, de_val = load_val()
    model = NMT(X_word2idx=X_word2idx,
                X_idx2word=X_idx2word,
                Y_word2idx=Y_word2idx,
                Y_idx2word=Y_idx2word)

    if args.mode == "train":  # python train_skip.py --mode train
        print("start training the model...")
        train()
    elif args.mode == "test":  # python train_skip.py --mode test
        print("start inferring...")
        en_val = load_val_large()
        with open("translation_result", 'w') as f:
            for en in en_val:
                output = model.infer(en)
                f.write(output + "\n")
class Trainer:
    """
    Trainer: fits the model on the training set.

    Args:
        _hparams (Namespace): user-specified hyperparameters; the defaults live in
            config.py and can also be overridden on the command line.
    """

    def __init__(self, _hparams):
        self.hparams = _hparams
        set_seed(_hparams.fixed_seed)
        self.train_loader = get_dataloader(_hparams.train_src_path,
                                           _hparams.train_dst_path,
                                           _hparams.batch_size,
                                           _hparams.num_workers)
        self.src_vocab, self.dst_vocab = load_vocab(_hparams.train_src_pkl,
                                                    _hparams.train_dst_pkl)
        self.device = torch.device(_hparams.device)
        self.model = NMT(_hparams.embed_size, _hparams.hidden_size,
                         self.src_vocab, self.dst_vocab, self.device,
                         _hparams.dropout_rate).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=_hparams.lr)

    def train(self):
        print('*' * 20, 'train', '*' * 20)
        hist_valid_scores = []
        patience = 0
        num_trial = 0

        for epoch in range(int(self.hparams.max_epochs)):
            self.model.train()
            epoch_loss_val = 0
            epoch_steps = len(self.train_loader)
            for step, data_pairs in tqdm(enumerate(self.train_loader)):
                sents = [(dp.src, dp.dst) for dp in data_pairs]
                src_sents, tgt_sents = zip(*sents)

                self.optimizer.zero_grad()

                batch_size = len(src_sents)
                example_losses = -self.model(src_sents, tgt_sents)
                batch_loss = example_losses.sum()
                train_loss = batch_loss / batch_size
                epoch_loss_val += train_loss.item()

                train_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.hparams.clip_gradient)
                self.optimizer.step()

            epoch_loss_val /= epoch_steps
            print('epoch: {}, epoch_loss_val: {}'.format(epoch, epoch_loss_val))

            # perform validation
            if epoch % self.hparams.valid_niter == 0:
                print('*' * 20, 'validate', '*' * 20)
                dev_ppl = evaluate_ppl(self.model, self.hparams.val_src_path,
                                       self.hparams.val_dst_path,
                                       self.hparams.batch_val_size,
                                       self.hparams.num_workers)
                valid_metric = -dev_ppl
                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to {}'.format(self.hparams.model_save_path))
                    self.model.save(self.hparams.model_save_path)
                    torch.save(self.optimizer.state_dict(), self.hparams.optimizer_save_path)
                elif patience < self.hparams.patience:
                    patience += 1
                    print('hit patience %d' % patience)

                    if patience == self.hparams.patience:
                        num_trial += 1
                        print('hit #{} trial'.format(num_trial))
                        if num_trial == self.hparams.max_num_trial:
                            print('early stop!')
                            exit(0)

                        # Compatibility shim: Adam adapts its step sizes on its own, while other
                        # optimizers need the manual lr decay below.  Note that torch optimizers
                        # expose `param_groups` (plural), so this hasattr check is always False
                        # and the decay/restore branch is effectively disabled here.
                        if hasattr(self.optimizer, 'param_group'):
                            # decay lr, and restore from previously best checkpoint
                            lr = self.optimizer.param_groups[0]['lr'] * self.hparams.lr_decay
                            print('load previously best model and decay learning rate to %f' % lr)

                            params = torch.load(self.hparams.model_save_path,
                                                map_location=lambda storage, loc: storage)
                            self.model.load_state_dict(params['state_dict'])
                            self.model = self.model.to(self.device)

                            print('restore parameters of the optimizers')
                            self.optimizer.load_state_dict(torch.load(self.hparams.optimizer_save_path))

                            # set new lr
                            for param_group in self.optimizer.param_groups:
                                param_group['lr'] = lr

                        # reset patience
                        patience = 0
                print('*' * 20, 'end validate', '*' * 20)

        print('*' * 20, 'end train', '*' * 20)
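
# Minimal usage sketch for Trainer (illustrative only). The attribute names mirror
# what __init__/train() read above; the real defaults and paths live in config.py
# and may differ.
from argparse import Namespace

hparams = Namespace(
    fixed_seed=42,
    train_src_path="data/train.src", train_dst_path="data/train.dst",      # assumed paths
    train_src_pkl="data/src_vocab.pkl", train_dst_pkl="data/dst_vocab.pkl",
    val_src_path="data/val.src", val_dst_path="data/val.dst",
    batch_size=64, batch_val_size=64, num_workers=2,
    device="cuda:0", embed_size=256, hidden_size=512, dropout_rate=0.2,
    lr=1e-3, lr_decay=0.5, clip_gradient=5.0,
    max_epochs=20, valid_niter=1, patience=5, max_num_trial=3,
    model_save_path="ckpt/model.bin", optimizer_save_path="ckpt/optim.bin",
)

trainer = Trainer(hparams)
trainer.train()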
def init_model(vocab_sizes, use_cuda): nmt = NMT(vocab_sizes, use_cuda) model.load_state("intermediate_ds_new", nmt) # TODO Remove this #model.load_state(os.path.join('data', 'model.param'), nmt, generator) # TODO Uncomment this return nmt
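
# init_model relies on a `model` utility module (model.save / model.load_state)
# whose implementation is not shown here. A conventional state_dict round-trip
# would look roughly like this (a sketch under that assumption, not the
# project's actual helpers):
import torch

def save_state(path, nmt):
    # persist only the parameters, not the whole module object
    torch.save(nmt.state_dict(), path)

def load_state(path, nmt):
    # load onto CPU first; move to GPU afterwards if needed
    nmt.load_state_dict(torch.load(path, map_location="cpu"))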
else:
    fname = './utils/data100000.pkl'

train_x, train_y, dev_x, dev_y, test_x, test_y, \
    train_dict, test_dict, w2v_train, w2v_test = Load_data(fname)
num2word = reverse(test_dict)
dim_in = len(train_dict)
dim_out = len(test_dict)
print('dataset load done.')

# load pretrained word2vec embeddings (only when use_w2v is set)
if use_w2v:
    with open('./utils/w2vtrain.pkl', 'rb') as f:
        w2v_train = pickle.load(f)
    with open('./utils/w2vtest.pkl', 'rb') as f:
        w2v_test = pickle.load(f)
    model = NMT(dim_in, dim_out, w2v_train=w2v_train, w2v_test=w2v_test,
                ues_attention=attention, ratio=teach_force).to(device)
else:
    model = NMT(dim_in, dim_out, ues_attention=attention, ratio=teach_force).to(device)

optimizer = Adam(model.parameters(), lr=LR, weight_decay=1e-4)
weight = Calc_P(train_y, dim_out)
Loss_fn = MyLoss(weight).to(device)
timer = Timer(epoch_size)
init_output_log(save_dir)
print('model load done.')
def main(options): use_cuda = (len(options.gpuid) >= 1) if options.gpuid: cuda.set_device(options.gpuid[0]) src_train, src_dev, src_test, src_vocab = torch.load( open(options.data_file + "." + options.src_lang, 'rb')) trg_train, trg_dev, trg_test, trg_vocab = torch.load( open(options.data_file + "." + options.trg_lang, 'rb')) """src_train = get_lm_input(src_train) src_dev = get_lm_input(src_dev) src_test = get_lm_input(src_test) trg_train = get_lm_output(trg_train) trg_dev = get_lm_output(trg_dev) trg_test = get_lm_output(trg_test)""" batched_train_src, batched_train_src_mask, sort_index = tensor.advanced_batchize( src_train, options.batch_size, src_vocab.stoi["<blank>"]) batched_train_trg, batched_train_trg_mask = tensor.advanced_batchize_no_sort( trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) batched_dev_src, batched_dev_src_mask, sort_index = tensor.advanced_batchize( src_dev, options.batch_size, src_vocab.stoi["<blank>"]) batched_dev_trg, batched_dev_trg_mask = tensor.advanced_batchize_no_sort( trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index) (src_vocab_size, trg_vocab_size) = len(src_vocab), len(trg_vocab) nmt = NMT((src_vocab_size, trg_vocab_size), use_cuda) # TODO: add more arguments as necessary #nmt = init_model((src_vocab_size, trg_vocab_size), use_cuda) if use_cuda > 0: nmt.cuda() else: nmt.cpu() criterion = torch.nn.NLLLoss() optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate) # main training loop last_dev_avg_loss = float("inf") for epoch_i in range(options.epochs): logging.info("At {0}-th epoch.".format(epoch_i)) # srange generates a lazy sequence of shuffled range for i, batch_i in enumerate(rand.srange(len(batched_train_src))): #if random.random() > 0.5: if False: #i > 1500: # TODO REMOVE THIS !!!!!!!!!!!!!!!!!!!! #model.save("intermediate_ds_new", nmt); break if i % 200 == 0: model.save("intermediate_ds", nmt) pass train_src_batch = Variable(batched_train_src[batch_i] ) # of size (src_seq_len, batch_size) train_trg_batch = Variable(batched_train_trg[batch_i] ) # of size (src_seq_len, batch_size) train_src_mask = Variable(batched_train_src_mask[batch_i]) train_trg_mask = Variable(batched_train_trg_mask[batch_i]) if use_cuda: train_src_batch = train_src_batch.cuda() train_trg_batch = train_trg_batch.cuda() train_src_mask = train_src_mask.cuda() train_trg_mask = train_trg_mask.cuda() sys_out_batch, translated_sentence_wd_index = nmt( train_src_batch, train_trg_batch ) # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary train_trg_mask = train_trg_mask.view(-1) train_trg_batch = train_trg_batch.view(-1) train_trg_batch = train_trg_batch.masked_select(train_trg_mask) train_trg_mask = train_trg_mask.unsqueeze(1).expand( len(train_trg_mask), trg_vocab_size) sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view( -1, trg_vocab_size) loss = criterion(sys_out_batch, train_trg_batch) logging.debug("loss at batch {0}: {1}".format(i, loss.data[0])) optimizer.zero_grad() loss.backward() optimizer.step() # validation -- this is a crude esitmation because there might be some paddings at the end dev_loss = 0.0 for batch_i in range(len(batched_dev_src)): #if random.random() > 0.5: if False: #i > 1500: # TODO REMOVE THIS !!!!!!!!!!!!!!!!!!!! 
#model.save("intermediate_ds_new", nmt); break dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True) dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True) dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True) dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True) if use_cuda: dev_src_batch = dev_src_batch.cuda() dev_trg_batch = dev_trg_batch.cuda() dev_src_mask = dev_src_mask.cuda() dev_trg_mask = dev_trg_mask.cuda() sys_out_batch, translated_sentence_wd_index = nmt( dev_src_batch ) # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary actual_trg_max_len = dev_trg_mask.data.shape[0] dev_trg_mask = dev_trg_mask.view(-1) dev_trg_batch = dev_trg_batch.view(-1) dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask) dev_trg_mask = dev_trg_mask.unsqueeze(1).expand( len(dev_trg_mask), trg_vocab_size) # TODO Remove this!!!!!! predicted_trg_max_len = sys_out_batch.data.shape[0] if actual_trg_max_len > predicted_trg_max_len: sys_out_batch = torch.cat( (sys_out_batch, torch.ones((actual_trg_max_len - predicted_trg_max_len, options.batch_size, trg_vocab_size)))) else: sys_out_batch = sys_out_batch[0:actual_trg_max_len] # TODO Remove this ^^^ !!!!!! sys_out_batch = sys_out_batch.view(-1, trg_vocab_size) sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view( -1, trg_vocab_size) loss = criterion(sys_out_batch, dev_trg_batch) logging.debug("dev loss at batch {0}: {1}".format( batch_i, loss.data[0])) dev_loss += loss #if True: break dev_avg_loss = dev_loss / len(batched_dev_src) logging.info( "Average loss value per instance is {0} at the end of epoch {1}". format(dev_avg_loss.data[0], epoch_i)) if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop: logging.info( "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})" .format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0])) break torch.save( nmt, open( options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill) last_dev_avg_loss = dev_avg_loss import datetime current_dt_tm = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") model.save("%s_%s" % (options.model_file, current_dt_tm), nmt)
def train(mode, checkpoint_path): # Data data_train = IWSLT15EnViDataSet(en_path="../data/train-en-vi/train.en", vi_path="../data/train-en-vi/train.vi") data_loader = DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, drop_last=False) if mode == EN2VI: src_vocab_size, tgt_vocab_size = data_train.en_vocab_size, data_train.vi_vocab_size else: src_vocab_size, tgt_vocab_size = data_train.vi_vocab_size, data_train.en_vocab_size print("Loading data done!") # Model & Optimizer model = NMT(mode=mode, src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size) model.to(device) criterion = MaskedPaddingCrossEntropyLoss().to(device) optimizer = Adam(model.parameters()) prev_epoch = 0 if checkpoint_path.exists(): # Resume training model, optimizer, prev_epoch = load_checkpoint(model, optimizer, checkpoint_path) print(f"Resume training from {prev_epoch} epochs!") else: model.apply(xavier_init_weights) print("Training from start!") model.train() for epoch in range(N_EPOCHS - prev_epoch): print(f"\nEpoch: {epoch+prev_epoch+1}") for b, (en_tokens, en_valid_len, vi_tokens, vi_valid_len) in enumerate(data_loader): en_tokens, vi_tokens = en_tokens.to(device), vi_tokens.to(device) en_valid_len, vi_valid_len = en_valid_len.to( device), vi_valid_len.to(device) en_padding_masks = mask_padding(en_tokens, en_valid_len, device) vi_padding_masks = mask_padding(vi_tokens, vi_valid_len, device) if mode == EN2VI: src, tgt = en_tokens, vi_tokens tgt_valid_len = vi_valid_len src_masks, tgt_masks = en_padding_masks, vi_padding_masks else: src, tgt = vi_tokens, en_tokens tgt_valid_len = en_valid_len src_masks, tgt_masks = vi_padding_masks, en_padding_masks optimizer.zero_grad() # Encoder's forward pass: encoder_state = model.encoder(src, src_masks) # Decoder's forward pass decoder_X = torch.tensor([[DEFAULT_SOS_INDEX] * tgt.shape[0]], device=device).reshape(-1, 1) decoder_state = encoder_state loss = torch.tensor(0, device=device, dtype=torch.float) for i in range(1, tgt.shape[1]): decoder_state, logit_pred = model.decoder( decoder_X, decoder_state) loss += criterion(pred=logit_pred[:, 0, :], label=tgt[:, i], device=device).sum() # Teacher forcing decoder_X = tgt[:, i].reshape(-1, 1) loss.backward() clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() if b % 50 == 0: seq_loss = loss / (MAX_LENGTH - 1) print(f"\tBatch {b}; Loss: {seq_loss:.2f}; " f"Mean Token Loss: {seq_loss/tgt_valid_len.sum():.4f}") ## Free up GPU memory del src, tgt, en_valid_len, vi_valid_len, decoder_state, logit_pred, loss torch.cuda.empty_cache() save_checkpoint(mode, src_vocab_size, tgt_vocab_size, model, optimizer, data_train.tokenizer_en, data_train.tokenizer_vi, prev_epoch + epoch + 1, checkpoint_path) for en in ens: vi = translate_en2vi(en_sentence=en, length=MAX_LENGTH, model=model, tokenizer_en=data_train.tokenizer_en, tokenizer_vi=data_train.tokenizer_vi, device=device) print("en:", en, "=> vi:", vi)
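
# translate_en2vi is called above but its body is not shown. A greedy-decoding
# sketch that mirrors the teacher-forcing decoder loop used during training;
# it reuses torch, mask_padding and DEFAULT_SOS_INDEX from this script, while
# DEFAULT_EOS_INDEX is an assumed end-of-sequence constant.
def greedy_translate(model, src_tokens, src_valid_len, max_length, device):
    """Decode one source sequence (shape (1, src_len)) token by token."""
    model.eval()
    outputs = []
    with torch.no_grad():
        src_masks = mask_padding(src_tokens, src_valid_len, device)
        decoder_state = model.encoder(src_tokens, src_masks)
        decoder_X = torch.tensor([[DEFAULT_SOS_INDEX]], device=device)
        for _ in range(max_length):
            decoder_state, logit_pred = model.decoder(decoder_X, decoder_state)
            next_token = logit_pred[:, 0, :].argmax(dim=-1, keepdim=True)
            if next_token.item() == DEFAULT_EOS_INDEX:  # assumed EOS constant
                break
            outputs.append(next_token.item())
            decoder_X = next_token  # feed the prediction back in place of teacher forcing
    return outputs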
def main(options):
    original_model = torch.load(open(options.original_model_file, 'rb'))
    nmt = NMT(original_model)
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_test_src, batched_test_src_mask, _ = utils.tensor.advanced_batchize(
        src_test, 24, src_vocab.stoi["<pad>"])
    batched_test_trg, batched_test_trg_mask, _ = utils.tensor.advanced_batchize(
        trg_test, 24, trg_vocab.stoi["<pad>"])

    # These were missing in the original snippet and would raise NameError:
    trg_vocab_size = len(trg_vocab)
    criterion = torch.nn.NLLLoss()  # assumed, matching the training script above
    total_loss = 0
    total_sent = 0

    # nmt(Variable(torch.from_numpy(np.array([[46, 68], [470, 72], [30, 4]]))), Variable(torch.from_numpy(np.array([[1], [1]]))))
    # sys.exit(0)
    # print(torch.min(src_test))
    # print(torch.max(trg_test))
    for i, batch_i in enumerate(utils.rand.srange(len(batched_test_src))):
        print(i)
        test_src_batch = Variable(
            batched_test_src[batch_i],
            volatile=True)  # of size (src_seq_len, batch_size)
        test_trg_batch = Variable(
            batched_test_trg[batch_i],
            volatile=True)  # of size (trg_seq_len, batch_size)
        test_src_mask = Variable(batched_test_src_mask[batch_i], volatile=True)
        test_trg_mask = Variable(batched_test_trg_mask[batch_i], volatile=True)
        total_sent += test_src_batch.size()[1]  # dim 1 is the batch (sentence) dimension
        if use_cuda:
            test_src_batch = test_src_batch.cuda()
            test_trg_batch = test_trg_batch.cuda()
            test_src_mask = test_src_mask.cuda()
            test_trg_mask = test_trg_mask.cuda()

        sys_out_batch = nmt(test_src_batch, test_trg_batch.size()[0])

        test_trg_mask = test_trg_mask.view(-1)
        test_trg_batch = test_trg_batch.view(-1)
        test_trg_batch = test_trg_batch.masked_select(test_trg_mask)
        test_trg_mask = test_trg_mask.unsqueeze(1).expand(
            len(test_trg_mask), trg_vocab_size - 1)
        sys_out_batch = sys_out_batch.view(-1, trg_vocab_size - 1)
        sys_out_batch = sys_out_batch.masked_select(test_trg_mask).view(
            -1, trg_vocab_size - 1)
        loss = criterion(sys_out_batch, test_trg_batch)
        logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
        total_loss += loss
        # break
        # _, sys_out_batch = torch.max(sys_out_batch, dim=2)
        # sys_out_batch = sys_out_batch.view(-1)
        # sent = []
        # for w in sys_out_batch:
        #     sent.append(trg_vocab.itos[w.data[0]])
        # print(' '.join(sent).encode('utf-8').strip())

    print(total_loss, total_sent)
    print(total_loss / total_sent)
    print(torch.exp(total_loss / total_sent))
    sys.exit(0)
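
# Note: the exponent printed above normalizes the accumulated loss by a sentence
# count, whereas corpus-level perplexity is conventionally the exponential of the
# average negative log-likelihood per *target token*. A hedged standalone sketch
# of that convention:
import math

def corpus_perplexity(sum_token_nll, num_target_tokens):
    """exp of the average per-token negative log-likelihood."""
    return math.exp(sum_token_nll / max(num_target_tokens, 1))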