def __init__(self, vocab_size: int, embedding_size: int, n_hidden: int, sos_token: int = 0, eos_token: int = 1, mask_token: int = 2, max_output_length: int = 100, rnn_cell: str = 'lstm') -> None:
    """Build a decoder-only generation model and its masked NLL loss.

    The decoder is moved to GPU when CUDA is available; the optimizer is
    left unset until training is configured.
    """
    self.decoder = DecoderRNN(vocab_size, max_output_length, embedding_size,
                              n_layers=n_hidden, rnn_cell=rnn_cell,
                              use_attention=False, bidirectional=False,
                              eos_id=eos_token, sos_id=sos_token)
    if torch.cuda.is_available():
        self.decoder.cuda()
    # Keep the full configuration around for later inspection/use.
    self.rnn_cell = rnn_cell
    self.n_hidden = n_hidden
    self.embedding_size = embedding_size
    self.SOS_token = sos_token
    self.EOS_token = eos_token
    self.mask_token = mask_token
    self.max_output_length = max_output_length
    # Uniform per-token weights; the mask token is excluded by the loss.
    token_weights = torch.ones(vocab_size)
    if torch.cuda.is_available():
        token_weights = token_weights.cuda()
    self.loss = NLLLoss(weight=token_weights, mask=mask_token)
    self.optimizer = None
def test_dropout_WITH_PROB_ZERO(self):
    """With dropout probability 0, two identical forward passes must agree."""
    rnn = DecoderRNN(self.dataset.output_vocab, 50, 16, dropout_p=0)
    for param in rnn.parameters():
        param.data.uniform_(-1, 1)
    batch = [[1, 2, 3], [1, 2], [1]]
    output1, _, _ = rnn(batch)
    output2, _, _ = rnn(batch)
    # BUG FIX: assertEqual on sequences of tensors relies on tensor `==`,
    # whose truth value is ambiguous; compare each step with torch.equal
    # (same pattern as the sibling zero-dropout test).
    self.assertEqual(len(output1), len(output2))
    for step1, step2 in zip(output1, output2):
        self.assertTrue(torch.equal(step1.data, step2.data))
def test_dropout_WITH_PROB_ZERO(self):
    """Zero dropout must make repeated decoder runs deterministic."""
    decoder = DecoderRNN(self.vocab_size, 50, 16, 0, 1, dropout_p=0)
    for weight in decoder.parameters():
        weight.data.uniform_(-1, 1)
    first, _, _ = decoder()
    second, _, _ = decoder()
    # Every per-step distribution must match exactly.
    for prob1, prob2 in zip(first, second):
        self.assertTrue(torch.equal(prob1.data, prob2.data))
def test_input_dropout_WITH_NON_ZERO_PROB(self):
    """Non-zero input dropout should perturb outputs within 50 tries."""
    decoder = DecoderRNN(self.vocab_size, 50, 16, 0, 1, input_dropout_p=0.5)
    for weight in decoder.parameters():
        weight.data.uniform_(-1, 1)
    all_equal = True
    for _ in range(50):
        first, _, _ = decoder()
        second, _, _ = decoder()
        if not torch.equal(first[0].data, second[0].data):
            all_equal = False
            break
    self.assertFalse(all_equal)
def test_dropout_WITH_NON_ZERO_PROB(self):
    """Non-zero dropout should perturb the outputs within 50 tries."""
    rnn = DecoderRNN(self.dataset.output_vocab, 50, 16, dropout_p=0.5)
    for param in rnn.parameters():
        param.data.uniform_(-1, 1)
    batch = [[1, 2, 3], [1, 2], [1]]
    equal = True
    for _ in range(50):
        output1, _, _ = rnn(batch)
        output2, _, _ = rnn(batch)
        # BUG FIX: `output1[0] != output2[0]` produces an element-wise tensor
        # whose truth value is ambiguous; use torch.equal for an exact
        # comparison (same idiom as the input-dropout test).
        if not torch.equal(output1[0].data, output2[0].data):
            equal = False
            break
    self.assertFalse(equal)
def main():
    """Load the vocabulary and embeddings, build encoder/decoder, and train."""
    with open(f'{EMBEDDING_DIR}/vocab.pkl', 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)
    print("Number of words in data set: %d" % len(vocabulary))
    embedding_matrix, vocab_to_index = map_vocab_to_embedding(vocabulary)
    hidden_size = 600
    encoder = EncoderRNN(embedding_matrix, hidden_size)
    decoder = DecoderRNN(embedding_matrix, hidden_size)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    # Training pairs were pickled alongside the vocabulary.
    with open(os.path.join(EMBEDDING_DIR, "train.pkl"), 'rb') as train_file:
        train_data = pickle.load(train_file)
    n_iters = 2000
    train(train_data, vocab_to_index, vocabulary, encoder, decoder, n_iters)
def build_model(tgt_field, max_len=50, hidden_size=100, bidirectional=False):
    """Assemble a 2D-CNN encoder + attentive LSTM decoder seq2seq model."""
    print("building model...")
    vocab: torchtext.vocab.Vocab = tgt_field.vocab
    print("vocab: ", vocab.stoi)
    encoder = EncoderCNN2D()
    # A bidirectional encoder doubles the decoder's hidden size.
    decoder_hidden = hidden_size * 2 if bidirectional else hidden_size
    decoder = DecoderRNN(vocab_size=len(vocab), max_len=max_len,
                         hidden_size=decoder_hidden, dropout_p=0.2,
                         use_attention=True, bidirectional=bidirectional,
                         eos_id=tgt_field.eos_id, sos_id=tgt_field.sos_id,
                         rnn_cell='lstm')
    model_obj = Seq2seq(encoder, decoder)
    # Small uniform init on every weight.
    for param in model_obj.parameters():
        param.data.uniform_(-0.08, 0.08)
    return model_obj
def initialize_model(opt, src, tgt, train):
    """Build vocabularies from `train` and construct the seq2seq model.

    Returns (seq2seq, input_vocab, output_vocab).
    """
    # build vocabulary
    src.build_vocab(train, max_size=opt.src_vocab)
    tgt.build_vocab(train, max_size=opt.tgt_vocab)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    # A bidirectional encoder doubles the decoder's hidden size.
    hidden_size = opt.hidden_size
    decoder_hidden_size = hidden_size * 2 if opt.bidirectional else hidden_size
    encoder = EncoderRNN(len(src.vocab), opt.max_len, hidden_size,
                         opt.embedding_size,
                         dropout_p=opt.dropout_p_encoder,
                         n_layers=opt.n_layers,
                         bidirectional=opt.bidirectional,
                         rnn_cell=opt.rnn_cell,
                         variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab), opt.max_len, decoder_hidden_size,
                         dropout_p=opt.dropout_p_decoder,
                         n_layers=opt.n_layers,
                         attention_method=opt.attention_method,
                         full_focus=opt.full_focus,
                         bidirectional=opt.bidirectional,
                         rnn_cell=opt.rnn_cell,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    seq2seq.to(device)
    return seq2seq, input_vocab, output_vocab
def __init__(self, data_path, model_save_path, model_load_path, hidden_size=32, max_vocab=4000, device='cuda'):
    """Load train/eval TSV datasets, build vocabularies and the seq2seq model."""
    self.src = SourceField()
    self.tgt = TargetField()
    self.max_length = 90
    self.data_path = data_path
    self.model_save_path = model_save_path
    self.model_load_path = model_load_path

    def len_filter(example):
        # Keep only pairs where both sides fit within max_length.
        return (len(example.src) <= self.max_length
                and len(example.tgt) <= self.max_length)

    self.trainset = torchtext.data.TabularDataset(
        path=os.path.join(self.data_path, 'train'), format='tsv',
        fields=[('src', self.src), ('tgt', self.tgt)],
        filter_pred=len_filter)
    self.devset = torchtext.data.TabularDataset(
        path=os.path.join(self.data_path, 'eval'), format='tsv',
        fields=[('src', self.src), ('tgt', self.tgt)],
        filter_pred=len_filter)
    self.src.build_vocab(self.trainset, max_size=max_vocab)
    self.tgt.build_vocab(self.trainset, max_size=max_vocab)
    # Perplexity loss that masks out padding.
    weight = torch.ones(len(self.tgt.vocab))
    pad = self.tgt.vocab.stoi[self.tgt.pad_token]
    self.loss = Perplexity(weight, pad)
    self.loss.cuda()
    self.optimizer = None
    self.hidden_size = hidden_size
    self.bidirectional = True
    encoder = EncoderRNN(len(self.src.vocab), self.max_length, self.hidden_size,
                         bidirectional=self.bidirectional,
                         variable_lengths=True)
    decoder = DecoderRNN(len(self.tgt.vocab), self.max_length,
                         self.hidden_size * 2 if self.bidirectional else self.hidden_size,
                         dropout_p=0.2, use_attention=True,
                         bidirectional=self.bidirectional,
                         eos_id=self.tgt.eos_id, sos_id=self.tgt.sos_id)
    self.device = device
    self.seq2seq = Seq2seq(encoder, decoder).cuda()
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
def build_model(src, tgt, hidden_size, mini_batch_size, bidirectional, dropout, attention, init_value):
    """Construct an LSTM seq2seq model and its supervised trainer.

    Hyper-parameters are recorded on the EXPERIMENT tracker as a side effect.
    """
    EXPERIMENT.param("Hidden", hidden_size)
    EXPERIMENT.param("Bidirectional", bidirectional)
    EXPERIMENT.param("Dropout", dropout)
    EXPERIMENT.param("Attention", attention)
    EXPERIMENT.param("Mini-batch", mini_batch_size)
    # Perplexity loss that masks out padding.
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    encoder = EncoderRNN(len(src.vocab), MAX_LEN, hidden_size,
                         rnn_cell="lstm", bidirectional=bidirectional,
                         dropout_p=dropout, variable_lengths=False)
    decoder = DecoderRNN(
        len(tgt.vocab), MAX_LEN,
        hidden_size,  # * 2 if bidirectional else hidden_size,
        rnn_cell="lstm", use_attention=attention,
        eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    using_cuda = False
    if torch.cuda.is_available():
        using_cuda = True
        encoder.cuda()
        decoder.cuda()
        seq2seq.cuda()
        loss.cuda()
    EXPERIMENT.param("CUDA", using_cuda)
    # Small uniform init on every weight.
    for param in seq2seq.parameters():
        param.data.uniform_(-init_value, init_value)
    trainer = SupervisedTrainer(loss=loss, batch_size=mini_batch_size,
                                checkpoint_every=5000, random_seed=42,
                                print_every=1000)
    return seq2seq, trainer
def test_k_1(self):
    """
    When k=1, the output of topk decoder should be the same as a normal decoder.
    """
    batch_size = 1
    eos = 1
    for _ in range(10):  # Repeat the randomized test multiple times
        decoder = DecoderRNN(self.vocab_size, 50, 16, 0, eos)
        for param in decoder.parameters():
            param.data.uniform_(-1, 1)
        topk_decoder = TopKDecoder(decoder, 1)
        output, _, other = decoder(None)
        output_topk, _, other_topk = topk_decoder(None)
        self.assertEqual(len(output), len(output_topk))
        finished = [False] * batch_size
        seq_scores = [0] * batch_size
        for t_step in range(len(output)):
            score, _ = output[t_step].topk(1)
            symbols = other['sequence'][t_step]
            for b in range(batch_size):
                seq_scores[b] += score[b].data[0]
                symbol = symbols[b].data[0]
                if not finished[b] and symbol == eos:
                    # Sequence just ended: length and score must agree.
                    finished[b] = True
                    self.assertEqual(other_topk['length'][b], t_step + 1)
                    self.assertTrue(
                        np.isclose(seq_scores[b], other_topk['score'][b][0]))
                if not finished[b]:
                    symbol_topk = other_topk['topk_sequence'][t_step][b].data[0][0]
                    self.assertEqual(symbol, symbol_topk)
                    self.assertTrue(
                        torch.equal(output[t_step].data, output_topk[t_step].data))
            if sum(finished) == batch_size:
                break
def __init__(self, args):
    """GRU decoder acting as the speaker of the IPComm channel.

    Vocabulary, message length, and SOS/EOS ids are fixed; only the hidden
    size comes from `args.comm_embed_dim`.
    """
    super(IPComm_speaker, self).__init__()
    self.vocab_size = 10
    self.max_len = 5
    self.hidden_size = args.comm_embed_dim
    self.eos_id = 1
    self.sos_id = 0
    self.speaker = DecoderRNN(self.vocab_size, self.max_len, self.hidden_size,
                              eos_id=self.eos_id, sos_id=self.sos_id,
                              rnn_cell='gru')
def setUpClass(self):
    """Build a tiny LSTM seq2seq over the eng-fra sample data."""
    self.test_wd = os.getcwd()
    self.dataset = Dataset(path=os.path.join(self.test_wd, 'tests/data/eng-fra.txt'),
                           src_max_len=50, tgt_max_len=50,
                           src_max_vocab=50000, tgt_max_vocab=50000)
    self.encoder = EncoderRNN(self.dataset.input_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.decoder = DecoderRNN(self.dataset.output_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.seq2seq = Seq2seq(self.encoder, self.decoder)
    if torch.cuda.is_available():
        self.seq2seq.cuda()
    # Second model sharing the same encoder/decoder instances.
    self.mock_seq2seq = Seq2seq(self.encoder, self.decoder)
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
def setUpClass(self):
    """Create a Predictor backed by a small LSTM seq2seq model."""
    test_path = os.path.dirname(os.path.realpath(__file__))
    src = SourceField()
    trg = TargetField()
    dataset = torchtext.data.TabularDataset(
        path=os.path.join(test_path, 'data/eng-fra.txt'),
        format='tsv',
        fields=[('src', src), ('trg', trg)],
    )
    src.build_vocab(dataset)
    trg.build_vocab(dataset)
    encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
    decoder = DecoderRNN(len(trg.vocab), 10, 10, trg.sos_id, trg.eos_id,
                         rnn_cell='lstm')
    seq2seq = Seq2seq(encoder, decoder)
    self.predictor = Predictor(seq2seq, src.vocab, trg.vocab)
def __init__(
        self,
        vocabulary_size,
        embedding_size,
        hidden_state_size,
        start_label,
        end_label,
        pad_label,
        slk_parser,
        MAX_LENGTH=500,
        dropout_p=0.1,
        n_layer=3,
):
    """Bidirectional LSTM encoder + attentive LSTM decoder with copy and
    grammar-mask output heads."""
    super().__init__()
    self.embedding = nn.Embedding(vocabulary_size, embedding_size)
    self.sample = False
    self.dropout_p = dropout_p
    # Encoder is bidirectional, so each direction gets half the hidden size.
    self.encoder = EncoderRNN(vocab_size=vocabulary_size, max_len=MAX_LENGTH,
                              input_size=embedding_size,
                              hidden_size=hidden_state_size // 2,
                              n_layers=n_layer, bidirectional=True,
                              rnn_cell='lstm',
                              input_dropout_p=self.dropout_p,
                              dropout_p=self.dropout_p,
                              variable_lengths=False,
                              embedding=None, update_embedding=True)
    self.decoder = DecoderRNN(vocab_size=vocabulary_size, max_len=MAX_LENGTH,
                              hidden_size=hidden_state_size,
                              sos_id=start_label, eos_id=end_label,
                              n_layers=n_layer, rnn_cell='lstm',
                              bidirectional=False,
                              input_dropout_p=self.dropout_p,
                              dropout_p=self.dropout_p,
                              use_attention=True)
    # Scalar head deciding copy-vs-generate, plus grammar-masked vocab output.
    self.is_copy_output = nn.Linear(hidden_state_size, 1)
    self.grammar_mask_output = MaskOutput(hidden_state_size, vocabulary_size)
    self.decoder_start = torch.ones(1, 1) * start_label
    self.pad_label = pad_label
    self.MAX_LENGTH = MAX_LENGTH
    self.num_layers = n_layer
def setUp(self):
    """Build a small LSTM seq2seq over the eng-fra sample TSV."""
    test_path = os.path.dirname(os.path.realpath(__file__))
    src = SourceField()
    tgt = TargetField()
    self.dataset = torchtext.data.TabularDataset(
        path=os.path.join(test_path, 'data/eng-fra.txt'),
        format='tsv',
        fields=[('src', src), ('tgt', tgt)],
    )
    src.build_vocab(self.dataset)
    tgt.build_vocab(self.dataset)
    encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
    decoder = DecoderRNN(len(tgt.vocab), 10, 10, tgt.sos_id, tgt.eos_id,
                         rnn_cell='lstm')
    self.seq2seq = Seq2seq(encoder, decoder)
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
def setUpClass(self):
    """Build two seq2seq models, save one checkpoint and load it into the other."""
    self.test_wd = os.getcwd()
    self.dataset = Dataset(path=os.path.join(self.test_wd, 'tests/data/eng-fra.txt'),
                           src_max_len=50, tgt_max_len=50,
                           src_max_vocab=50000, tgt_max_vocab=50000)
    self.encoder = EncoderRNN(self.dataset.input_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.decoder = DecoderRNN(self.dataset.output_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.seq2seq = Seq2seq(self.encoder, self.decoder)
    self.mock_seq2seq = Seq2seq(self.encoder, self.decoder)
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
    # Round-trip the weights through a checkpoint directory.
    if not os.path.exists(os.path.join(self.test_wd, 'checkpoints')):
        os.mkdir(os.path.join(self.test_wd, 'checkpoints'))
    self.seq2seq.save(os.path.join(self.test_wd, 'checkpoints'))
    self.mock_seq2seq.load(os.path.join(self.test_wd, 'checkpoints'))
def initialize_model(
    train,
    input_vocab,
    output_vocab,
    max_len=10,
    hidden_size=256,
    dropout_p=0.5,
    bidirectional=True,
    n_beam=5,
):
    """Build an attentive seq2seq model plus Adam optimizer and StepLR schedule.

    Returns (seq2seq, optimizer, scheduler).
    """
    encoder = EncoderRNN(
        len(input_vocab), max_len, hidden_size,
        bidirectional=bidirectional,
        variable_lengths=True,
    )
    # A bidirectional encoder doubles the decoder's hidden size.
    decoder = DecoderRNN(
        len(output_vocab), max_len,
        hidden_size * (2 if bidirectional else 1),
        dropout_p=dropout_p, use_attention=True,
        bidirectional=bidirectional,
        eos_id=train.tgt_field.eos_id,
        sos_id=train.tgt_field.sos_id,
    )
    # decoder = TopKDecoder(decoder ,n_beam)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq = seq2seq.cuda()
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    scheduler = StepLR(optimizer.optimizer, 1)
    optimizer.set_scheduler(scheduler)
    return seq2seq, optimizer, scheduler
def __init__(self, zh_max_len, zh_hidden, dec_layers, input_dropout_p, dropout_p, beam_size, zh_embedding_size):
    """Chinese-side LSTM decoder wrapped by a beam-search TopKDecoder."""
    super(Dec, self).__init__()
    self.dec_rnn = DecoderRNN(vocab_size=len(transform.zh_voc),
                              max_len=zh_max_len,
                              embedding_size=zh_embedding_size,
                              hidden_size=zh_hidden,
                              sos_id=transform.zh_go_id,
                              eos_id=transform.zh_eos_id,
                              n_layers=dec_layers,
                              rnn_cell='lstm',
                              bidirectional=True,
                              input_dropout_p=input_dropout_p,
                              dropout_p=dropout_p,
                              use_attention=True)
    # Beam search over the plain decoder.
    self.beam_dec = TopKDecoder(self.dec_rnn, beam_size)
bidirectional = opt.word_bidirect encoder = EncoderRNN(vocab_size=len(src.vocab), max_len=max_len, word_dim=opt.word_dim, hidden_size=hidden_size, input_dropout_p=opt.input_dropout, bidirectional=bidirectional, n_layers=1, rnn_cell='gru', variable_lengths=True) decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=max_len, hidden_size=hidden_size * 2 if bidirectional else 1, dropout_p=opt.dropout, use_attention=True, bidirectional=bidirectional, n_layers=1, rnn_cell='gru', eos_id=tgt.eos_id, sos_id=tgt.sos_id) seq2seq = Seq2seq(encoder, decoder) for param in seq2seq.parameters(): param.data.uniform_(-0.08, 0.08) print(param.data[0:3]) _, _, norm_val = encoder.vectors_stats() encoder.init_vectors(src.vocab.vectors) # encoder.scale_vectors(0.08) encoder.normalize_vectors(norm_val) encoder.vectors_stats() for param in seq2seq.parameters():
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
    loss.cuda()

seq2seq = None
optimizer = None
if not opt.resume:
    # Initialize model
    # hidden_size=128
    hidden_size = 300
    bidirectional = True
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         bidirectional=bidirectional, variable_lengths=True)
    # BUG FIX: the non-bidirectional branch used a hidden size of `1`;
    # it must fall back to `hidden_size` (latent here, since
    # bidirectional is hard-coded True).
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
        print(param.data)
    encoder.vectors_stats()
    # encoder.init_vectors(src.vocab.vectors)
    if torch.cuda.is_available():
        seq2seq.cuda()
    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
def run_training(opt, default_data_dir, num_epochs=100):
    """Train (or resume) a seq2seq model, then run an interactive prediction loop."""
    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                         opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir,
                                       Checkpoint.CHECKPOINT_DIR_NAME,
                                       opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2seq = checkpoint.model
        input_vocab = checkpoint.input_vocab
        output_vocab = checkpoint.output_vocab
    else:
        # Prepare dataset
        src = SourceField()
        tgt = TargetField()
        max_len = 50
        data_file = os.path.join(default_data_dir, opt.train_path, 'data.txt')
        logging.info("Starting new Training session on %s", data_file)

        def len_filter(example):
            # Keep non-empty pairs that fit within max_len.
            return (len(example.src) <= max_len) and (len(example.tgt) <= max_len) \
                and (len(example.src) > 0) and (len(example.tgt) > 0)

        train = torchtext.data.TabularDataset(
            path=data_file, format='json',
            fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
            filter_pred=len_filter
        )
        dev = None
        if opt.no_dev is False:
            dev_data_file = os.path.join(default_data_dir, opt.train_path,
                                         'dev-data.txt')
            dev = torchtext.data.TabularDataset(
                path=dev_data_file, format='json',
                fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
                filter_pred=len_filter
            )
        src.build_vocab(train, max_size=50000)
        tgt.build_vocab(train, max_size=50000)
        input_vocab = src.vocab
        output_vocab = tgt.vocab
        # NOTE: If the source field name and the target field name
        # are different from 'src' and 'tgt' respectively, they have
        # to be set explicitly before any training or inference
        # seq2seq.src_field_name = 'src'
        # seq2seq.tgt_field_name = 'tgt'

        # Prepare loss (padding is masked out)
        weight = torch.ones(len(tgt.vocab))
        pad = tgt.vocab.stoi[tgt.pad_token]
        loss = Perplexity(weight, pad)
        if torch.cuda.is_available():
            logging.info("Yayyy We got CUDA!!!")
            loss.cuda()
        else:
            logging.info("No cuda available device found running on cpu")

        seq2seq = None
        optimizer = None
        if not opt.resume:
            hidden_size = 128
            decoder_hidden_size = hidden_size * 2
            logging.info("EncoderRNN Hidden Size: %s", hidden_size)
            logging.info("DecoderRNN Hidden Size: %s", decoder_hidden_size)
            bidirectional = True
            encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                                 bidirectional=bidirectional,
                                 rnn_cell='lstm', variable_lengths=True)
            decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size,
                                 dropout_p=0, use_attention=True,
                                 bidirectional=bidirectional,
                                 rnn_cell='lstm',
                                 eos_id=tgt.eos_id, sos_id=tgt.sos_id)
            seq2seq = Seq2seq(encoder, decoder)
            if torch.cuda.is_available():
                seq2seq.cuda()
            for param in seq2seq.parameters():
                param.data.uniform_(-0.08, 0.08)
            # Optimizer and learning rate scheduler can be customized by
            # explicitly constructing the objects and pass to the trainer.
            optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()),
                                  max_grad_norm=5)
            scheduler = StepLR(optimizer.optimizer, 1)
            optimizer.set_scheduler(scheduler)

        # train
        num_epochs = num_epochs
        batch_size = 32
        checkpoint_every = num_epochs / 10
        print_every = num_epochs / 100
        properties = dict(batch_size=batch_size,
                          checkpoint_every=checkpoint_every,
                          print_every=print_every, expt_dir=opt.expt_dir,
                          num_epochs=num_epochs,
                          teacher_forcing_ratio=0.5,
                          resume=opt.resume)
        logging.info("Starting training with the following Properties %s",
                     json.dumps(properties, indent=2))
        # BUG FIX: the trainer was constructed with batch_size=num_epochs,
        # contradicting the logged properties above; use the configured batch_size.
        t = SupervisedTrainer(loss=loss, batch_size=batch_size,
                              checkpoint_every=checkpoint_every,
                              print_every=print_every, expt_dir=opt.expt_dir)
        seq2seq = t.train(seq2seq, train, num_epochs=num_epochs,
                          dev_data=dev, optimizer=optimizer,
                          teacher_forcing_ratio=0.5, resume=opt.resume)
        evaluator = Evaluator(loss=loss, batch_size=batch_size)
        if opt.no_dev is False:
            dev_loss, accuracy = evaluator.evaluate(seq2seq, dev)
            logging.info("Dev Loss: %s", dev_loss)
            # BUG FIX: accuracy was previously logged as dev_loss.
            logging.info("Accuracy: %s", accuracy)

    beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4))
    predictor = Predictor(beam_search, input_vocab, output_vocab)

    while True:
        try:
            # NOTE(review): raw_input is Python 2 only — confirm the target
            # interpreter, or switch to input() for Python 3.
            seq_str = raw_input("Type in a source sequence:")
            seq = seq_str.strip().split()
            results = predictor.predict_n(seq, n=3)
            for i, res in enumerate(results):
                # BUG FIX: print() does not interpolate %-style args on its
                # own; the original printed a tuple.
                print('option %s: %s\n' % (i + 1, res))
        except KeyboardInterrupt:
            logging.info("Bye Bye")
            exit(0)
if bidirectional: hidden_size = hidden_size * 2 if config['use_vecs']: # aug_size = len(train_vecs[0][0]) aug_size = vectors.vector_size else: # aug_size = 0 aug_size = feat_hidden_size # pdb.set_trace() decoder = DecoderRNN(len(tgt.vocab), max_len, feat_hidden_size, hidden_size=hidden_size, aug_size=aug_size, dropout_p=float(config['dropout']), input_dropout_p=float(config['dropout']), use_attention=True, bidirectional=bidirectional, rnn_cell='LSTM', eos_id=tgt.eos_id, sos_id=tgt.sos_id, n_layers=config['num layers']) # if torch.cuda.is_available(): # encoder.cuda() # decoder.cuda() # topk_decoder = TopKDecoder(decoder, 3) seq2seq = Seq2seq(encoder, decoder) # seq2seq = Seq2seq(encoder, topk_decoder) if torch.cuda.is_available(): # pdb.set_trace() # seq2seq.to(DEVICE)
# Initialize model encoder = EncoderRNN(len(src_vocab.vocab), opt.max_src_length, embedding_size=opt.embedding_size, rnn_cell=opt.rnn_cell, n_layers=opt.n_hidden_layer, hidden_size=opt.hidden_size, bidirectional=opt.bidirectional, variable_lengths=False) decoder = DecoderRNN(len(tgt_vocab.vocab), opt.max_tgt_length, embedding_size=opt.embedding_size, rnn_cell=opt.rnn_cell, n_layers=opt.n_hidden_layer, hidden_size=opt.hidden_size * 2 if opt.bidirectional else opt.hidden_size, bidirectional=opt.bidirectional, dropout_p=0.2, use_attention=opt.use_attn, eos_id=tgt_vocab.word2idx[tgt_vocab.eos_token], sos_id=tgt_vocab.word2idx[tgt_vocab.sos_token]) seq2seq = Seq2seq(encoder, decoder) seq2seq.to(device) if opt.resume and not opt.load_checkpoint: last_checkpoint = get_last_checkpoint(opt.model_dir) if last_checkpoint: opt.load_checkpoint = os.path.join(opt.model_dir, last_checkpoint) opt.skip_steps = int(last_checkpoint.strip('.pt').split('/')[-1]) if opt.load_checkpoint:
hidden_size = 128 bidirectional = True encoder = EncoderRNN( len(src.vocab), max_len, hidden_size, bidirectional=bidirectional, rnn_cell="lstm", variable_lengths=True, ) decoder = DecoderRNN( len(tgt.vocab), max_len, hidden_size * 2, dropout_p=0.2, use_attention=True, bidirectional=bidirectional, rnn_cell="lstm", eos_id=tgt.eos_id, sos_id=tgt.sos_id, ) seq2seq = Seq2seq(encoder, decoder) if torch.cuda.is_available(): seq2seq.cuda() for param in seq2seq.parameters(): param.data.uniform_(-0.08, 0.08) # train t = SupervisedTrainer( loss=loss,
# Perplexity loss that masks out padding.
weight = torch.ones(len(tgt_vocab.vocab))
loss = Perplexity(weight, pad_id)
loss.to(device)

# Initialize model
encoder = EncoderRNN(len(src_vocab.vocab), opt.max_src_length,
                     hidden_size=opt.hidden_size,
                     bidirectional=opt.bidirectional,
                     variable_lengths=False)
decoder = DecoderRNN(len(tgt_vocab.vocab), opt.max_tgt_length,
                     hidden_size=opt.hidden_size * 2 if opt.bidirectional else opt.hidden_size,
                     dropout_p=0.2,
                     use_attention=opt.use_attn,
                     bidirectional=opt.bidirectional,
                     eos_id=tgt_vocab.word2idx[tgt_vocab.eos_token],
                     sos_id=tgt_vocab.word2idx[tgt_vocab.sos_token])
seq2seq = Seq2seq(encoder, decoder)
seq2seq.to(device)
if opt.resume and not opt.load_checkpoint:
    last_checkpoint = get_last_checkpoint(opt.model_dir)
    if last_checkpoint:
        opt.load_checkpoint = os.path.join(opt.model_dir, last_checkpoint)
        # NOTE(review): strip('.pt') strips characters, not the suffix;
        # fine for numeric step names only.
        opt.skip_steps = int(last_checkpoint.strip('.pt').split('/')[-1])
if opt.load_checkpoint:
    seq2seq.load_state_dict(torch.load(opt.load_checkpoint))
dataset = Dataset(opt.train_path, src_max_len=50, tgt_max_len=50)
input_vocab = dataset.input_vocab
output_vocab = dataset.output_vocab
# Dev set reuses the training vocabularies.
dev_set = Dataset(opt.dev_path, src_max_len=50, tgt_max_len=50,
                  src_vocab=input_vocab, tgt_vocab=output_vocab)

# Prepare model
hidden_size = 128
encoder = EncoderRNN(input_vocab, dataset.src_max_len, hidden_size)
decoder = DecoderRNN(output_vocab, dataset.tgt_max_len, hidden_size,
                     dropout_p=0.2, use_attention=True)
seq2seq = Seq2seq(encoder, decoder)
if opt.resume:
    print("resuming training")
    latest_checkpoint = Checkpoint.get_latest_checkpoint(opt.expt_dir)
    seq2seq.load(latest_checkpoint)
else:
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

# Prepare loss
weight = torch.ones(output_vocab.get_vocab_size())
mask = output_vocab.MASK_token_id
def train():
    """Train a jieba-tokenised Chinese seq2seq model from CSV data."""
    src = SourceField(sequential=True, tokenize=lambda x: [i for i in jieba.lcut(x)])
    tgt = TargetField(sequential=True, tokenize=lambda x: [i for i in jieba.lcut(x)])
    max_len = 50

    def len_filter(example):
        # Keep only pairs where both sides fit within max_len.
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path, format='csv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path, format='csv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss (padding is masked out)
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        hidden_size = 128
        bidirectional = True
        encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                             bidirectional=bidirectional,
                             variable_lengths=True)
        decoder = DecoderRNN(len(tgt.vocab), max_len,
                             hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2, use_attention=True,
                             bidirectional=bidirectional,
                             eos_id=tgt.eos_id, sos_id=tgt.sos_id)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()
        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)
    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss, batch_size=32, checkpoint_every=50,
                      print_every=10, expt_dir=opt.expt_dir)
seq2seq = t.train(seq2seq, train,
                  num_epochs=6, dev_data=dev,
                  optimizer=optimizer,
                  teacher_forcing_ratio=0.5,
                  resume=opt.resume)
predictor = Predictor(seq2seq, input_vocab, output_vocab)
if not opt.resume:
    # Initialize model
    hidden_size = params['hidden_size']
    bidirectional = True
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         bidirectional=bidirectional,
                         variable_lengths=True,
                         n_layers=params['n_layers'],
                         rnn_cell=params['rnn_cell'])
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2, use_attention=True,
                         bidirectional=bidirectional,
                         rnn_cell=params['rnn_cell'],
                         n_layers=params['n_layers'],
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq.cuda()
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
    # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()),
def test_k_greater_than_1(self):
    """
    Implement beam search manually and compare results from topk decoder.
    """
    max_len = 50
    beam_size = 3
    batch_size = 1
    hidden_size = 8
    sos = 0
    eos = 1
    for _ in range(10):
        decoder = DecoderRNN(self.vocab_size, max_len, hidden_size, sos, eos)
        for param in decoder.parameters():
            param.data.uniform_(-1, 1)
        topk_decoder = TopKDecoder(decoder, beam_size)
        encoder_hidden = torch.autograd.Variable(
            torch.randn(1, batch_size, hidden_size))
        _, hidden_topk, other_topk = topk_decoder(
            None, encoder_hidden=encoder_hidden)

        # Queue state:
        # 1. time step
        # 2. symbol
        # 3. hidden state
        # 4. accumulated log likelihood
        # 5. beam number
        batch_queue = [[(-1, sos, encoder_hidden[:, b, :].unsqueeze(1), 0, None)]
                       for b in range(batch_size)]
        time_batch_queue = [batch_queue]
        batch_finished_seqs = [list() for _ in range(batch_size)]
        for t in range(max_len):
            new_batch_queue = []
            for b in range(batch_size):
                new_queue = []
                for k in range(min(len(time_batch_queue[t][b]), beam_size)):
                    _, inputs, hidden, seq_score, _ = time_batch_queue[t][b][k]
                    if inputs == eos:
                        # Beam finished: park it and skip expansion.
                        batch_finished_seqs[b].append(time_batch_queue[t][b][k])
                        continue
                    inputs = torch.autograd.Variable(
                        torch.LongTensor([[inputs]]))
                    context, hidden, attn = decoder.forward_step(
                        inputs, hidden, None)
                    decoder_outputs, symbols = decoder.decoder(
                        context, attn, None, None)
                    decoder_outputs = decoder_outputs.log()
                    topk_score, topk = decoder_outputs[0].data.topk(beam_size)
                    for score, sym in zip(topk_score.tolist()[0],
                                          topk.tolist()[0]):
                        new_queue.append((t, sym, hidden, score + seq_score, k))
                # Keep the best beam_size expansions by accumulated score.
                new_queue = sorted(new_queue, key=lambda x: x[3],
                                   reverse=True)[:beam_size]
                new_batch_queue.append(new_queue)
            time_batch_queue.append(new_batch_queue)

        # finished beams
        finalist = [l[:beam_size] for l in batch_finished_seqs]
        # unfinished beams
        for b in range(batch_size):
            if len(finalist[b]) < beam_size:
                last_step = sorted(time_batch_queue[-1][b],
                                   key=lambda x: x[3], reverse=True)
                finalist[b] += last_step[:beam_size - len(finalist[b])]

        # back track
        topk = []
        for b in range(batch_size):
            batch_topk = []
            for k in range(beam_size):
                seq = [finalist[b][k]]
                prev_k = seq[-1][4]
                prev_t = seq[-1][0]
                while prev_k is not None:
                    seq.append(time_batch_queue[prev_t][b][prev_k])
                    prev_k = seq[-1][4]
                    prev_t = seq[-1][0]
                batch_topk.append([s for s in reversed(seq)])
            topk.append(batch_topk)

        for b in range(batch_size):
            topk[b] = sorted(topk[b], key=lambda s: s[-1][3], reverse=True)

        topk_scores = other_topk['score']
        topk_lengths = other_topk['topk_length']
        topk_pred_symbols = other_topk['topk_sequence']
        for b in range(batch_size):
            # Skip comparison when two beams have (nearly) identical
            # scores, as tie-breaking order is undefined.
            precision_error = False
            for k in range(beam_size - 1):
                if np.isclose(topk_scores[b][k], topk_scores[b][k + 1]):
                    precision_error = True
                    break
            if precision_error:
                break
            for k in range(beam_size):
                self.assertEqual(topk_lengths[b][k], len(topk[b][k]) - 1)
                self.assertTrue(
                    np.isclose(topk_scores[b][k], topk[b][k][-1][3]))
                total_steps = topk_lengths[b][k]
                for t in range(total_steps):
                    self.assertEqual(topk_pred_symbols[t][b, k].data[0],
                                     topk[b][k][t + 1][1])  # topk includes SOS