def setUpClass(self):
    self.test_wd = os.getcwd()
    self.dataset = Dataset(path=os.path.join(self.test_wd, 'tests/data/eng-fra.txt'),
                           src_max_len=50, tgt_max_len=50,
                           src_max_vocab=50000, tgt_max_vocab=50000)
    self.encoder = EncoderRNN(self.dataset.input_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.decoder = DecoderRNN(self.dataset.output_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.seq2seq = Seq2seq(self.encoder, self.decoder)
    if torch.cuda.is_available():
        self.seq2seq.cuda()
    self.mock_seq2seq = Seq2seq(self.encoder, self.decoder)
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
def predict(expt_dir, seq_str, date, epoch, step, n=3):
    seq = seq_str.strip().split()
    checkpoint_path = os.path.join(expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                                   date, epoch, step)
    seq2seq, input_vocab, output_vocab = get_model(checkpoint_path)
    beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4))
    predictor = Predictor(beam_search, input_vocab, output_vocab)
    return predictor.predict_n(seq, n=n)
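# Usage sketch (an assumption, not part of the original source): the experiment
# directory and the date/epoch/step strings below are placeholders that must
# match an existing Checkpoint directory layout; predict_n is assumed to return
# the n best hypotheses as token lists.
hypotheses = predict('./experiment', 'how are you ?', '2018_01_01_00_00_00', '5', '1200', n=3)
for i, hyp in enumerate(hypotheses):
    print('option %d: %s' % (i + 1, ' '.join(hyp)))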
def build_model(tgt_field, max_len=50, hidden_size=100, bidirectional=False):
    print("building model...")
    vocab: torchtext.vocab.Vocab = tgt_field.vocab
    print("vocab: ", vocab.stoi)
    encoder = EncoderCNN2D()
    decoder = DecoderRNN(vocab_size=len(vocab), max_len=max_len,
                         hidden_size=hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=tgt_field.eos_id, sos_id=tgt_field.sos_id,
                         rnn_cell='lstm')
    model_obj = Seq2seq(encoder, decoder)
    # if torch.cuda.is_available():
    #     model_obj.cuda()
    # for param in model_obj.parameters():
    #     init.xavier_uniform(param.data)
    for param in model_obj.parameters():
        param.data.uniform_(-0.08, 0.08)
    return model_obj
def initialize_model(opt, src, tgt, train):
    # build vocabulary
    src.build_vocab(train, max_size=opt.src_vocab)
    tgt.build_vocab(train, max_size=opt.tgt_vocab)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # Initialize model
    hidden_size = opt.hidden_size
    decoder_hidden_size = hidden_size * 2 if opt.bidirectional else hidden_size
    encoder = EncoderRNN(len(src.vocab), opt.max_len, hidden_size,
                         opt.embedding_size,
                         dropout_p=opt.dropout_p_encoder,
                         n_layers=opt.n_layers,
                         bidirectional=opt.bidirectional,
                         rnn_cell=opt.rnn_cell,
                         variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab), opt.max_len, decoder_hidden_size,
                         dropout_p=opt.dropout_p_decoder,
                         n_layers=opt.n_layers,
                         attention_method=opt.attention_method,
                         full_focus=opt.full_focus,
                         bidirectional=opt.bidirectional,
                         rnn_cell=opt.rnn_cell,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    seq2seq.to(device)

    return seq2seq, input_vocab, output_vocab
def setUpClass(self):
    self.test_wd = os.getcwd()
    self.dataset = Dataset(path=os.path.join(self.test_wd, 'tests/data/eng-fra.txt'),
                           src_max_len=50, tgt_max_len=50,
                           src_max_vocab=50000, tgt_max_vocab=50000)
    self.encoder = EncoderRNN(self.dataset.input_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.decoder = DecoderRNN(self.dataset.output_vocab, max_len=10,
                              hidden_size=10, rnn_cell='lstm')
    self.seq2seq = Seq2seq(self.encoder, self.decoder)
    self.mock_seq2seq = Seq2seq(self.encoder, self.decoder)
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
    if not os.path.exists(os.path.join(self.test_wd, 'checkpoints')):
        os.mkdir(os.path.join(self.test_wd, 'checkpoints'))
    self.seq2seq.save(os.path.join(self.test_wd, 'checkpoints'))
    self.mock_seq2seq.load(os.path.join(self.test_wd, 'checkpoints'))
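# Hypothetical companion test (an assumption, not in the original): after the
# save/load round trip in setUpClass, the loaded model should mirror the saved
# parameters tensor-for-tensor.
def test_save_load_roundtrip(self):
    for p_saved, p_loaded in zip(self.seq2seq.parameters(),
                                 self.mock_seq2seq.parameters()):
        self.assertTrue(torch.equal(p_saved.data, p_loaded.data))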
def __init__(self, data_path, model_save_path, model_load_path,
             hidden_size=32, max_vocab=4000, device='cuda'):
    self.src = SourceField()
    self.tgt = TargetField()
    self.max_length = 90
    self.data_path = data_path
    self.model_save_path = model_save_path
    self.model_load_path = model_load_path

    def len_filter(example):
        return len(example.src) <= self.max_length and len(example.tgt) <= self.max_length

    self.trainset = torchtext.data.TabularDataset(
        path=os.path.join(self.data_path, 'train'), format='tsv',
        fields=[('src', self.src), ('tgt', self.tgt)],
        filter_pred=len_filter)
    self.devset = torchtext.data.TabularDataset(
        path=os.path.join(self.data_path, 'eval'), format='tsv',
        fields=[('src', self.src), ('tgt', self.tgt)],
        filter_pred=len_filter)
    self.src.build_vocab(self.trainset, max_size=max_vocab)
    self.tgt.build_vocab(self.trainset, max_size=max_vocab)

    weight = torch.ones(len(self.tgt.vocab))
    pad = self.tgt.vocab.stoi[self.tgt.pad_token]
    self.loss = Perplexity(weight, pad)
    self.loss.cuda()
    self.optimizer = None
    self.hidden_size = hidden_size
    self.bidirectional = True
    encoder = EncoderRNN(len(self.src.vocab), self.max_length, self.hidden_size,
                         bidirectional=self.bidirectional, variable_lengths=True)
    decoder = DecoderRNN(len(self.tgt.vocab), self.max_length,
                         self.hidden_size * 2 if self.bidirectional else self.hidden_size,
                         dropout_p=0.2, use_attention=True,
                         bidirectional=self.bidirectional,
                         eos_id=self.tgt.eos_id, sos_id=self.tgt.sos_id)
    self.device = device
    self.seq2seq = Seq2seq(encoder, decoder).cuda()
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
def build_model(self):
    self._logger.info("Building model...")
    self.model = Seq2seq(
        batch_input_shape=(TRAIN_BATCH_SIZE, (INPUT_SEQ_LEN + 1) * MSG_HISTORY_LEN, 29),
        hidden_dim=HIDDEN_LAYER_DIM,
        output_length=MAX_OUTPUT_TOKEN_LENGTH,
        output_dim=29,
        depth=1)
    self._logger.info("Compiling...")
    self.model.compile(loss='mse', optimizer='rmsprop')
def setUpClass(self):
    test_path = os.path.dirname(os.path.realpath(__file__))
    src = SourceField()
    trg = TargetField()
    dataset = torchtext.data.TabularDataset(
        path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
        fields=[('src', src), ('trg', trg)],
    )
    src.build_vocab(dataset)
    trg.build_vocab(dataset)
    encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
    decoder = DecoderRNN(len(trg.vocab), 10, 10, trg.sos_id, trg.eos_id, rnn_cell='lstm')
    seq2seq = Seq2seq(encoder, decoder)
    self.predictor = Predictor(seq2seq, src.vocab, trg.vocab)
def setUp(self):
    test_path = os.path.dirname(os.path.realpath(__file__))
    src = SourceField()
    tgt = TargetField()
    self.dataset = torchtext.data.TabularDataset(
        path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
        fields=[('src', src), ('tgt', tgt)],
    )
    src.build_vocab(self.dataset)
    tgt.build_vocab(self.dataset)
    encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
    decoder = DecoderRNN(len(tgt.vocab), 10, 10, tgt.sos_id, tgt.eos_id, rnn_cell='lstm')
    self.seq2seq = Seq2seq(encoder, decoder)
    for param in self.seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
def initialize_model(
    train,
    input_vocab,
    output_vocab,
    max_len=10,
    hidden_size=256,
    dropout_p=0.5,
    bidirectional=True,
    n_beam=5,
):
    # Initialize model
    encoder = EncoderRNN(
        len(input_vocab),
        max_len,
        hidden_size,
        bidirectional=bidirectional,
        variable_lengths=True,
    )
    decoder = DecoderRNN(
        len(output_vocab),
        max_len,
        hidden_size * (2 if bidirectional else 1),
        dropout_p=dropout_p,
        use_attention=True,
        bidirectional=bidirectional,
        eos_id=train.tgt_field.eos_id,
        sos_id=train.tgt_field.sos_id,
    )
    # decoder = TopKDecoder(decoder, n_beam)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq = seq2seq.cuda()
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    scheduler = StepLR(optimizer.optimizer, 1)
    optimizer.set_scheduler(scheduler)
    return seq2seq, optimizer, scheduler
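# Usage sketch (an assumption, not from the original source): wiring
# initialize_model() into the library's SupervisedTrainer. `train_data` and
# `dev_data` stand in for torchtext datasets built elsewhere, and '<pad>' is
# the assumed pad-token string in the target vocabulary.
seq2seq, optimizer, scheduler = initialize_model(train_data, input_vocab, output_vocab)
weight = torch.ones(len(output_vocab))
loss = Perplexity(weight, output_vocab.stoi['<pad>'])
trainer = SupervisedTrainer(loss=loss, batch_size=32)
seq2seq = trainer.train(seq2seq, train_data, num_epochs=10, dev_data=dev_data,
                        optimizer=optimizer, teacher_forcing_ratio=0.5)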
def build_model(src, tgt, hidden_size, mini_batch_size, bidirectional,
                dropout, attention, init_value):
    EXPERIMENT.param("Hidden", hidden_size)
    EXPERIMENT.param("Bidirectional", bidirectional)
    EXPERIMENT.param("Dropout", dropout)
    EXPERIMENT.param("Attention", attention)
    EXPERIMENT.param("Mini-batch", mini_batch_size)
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    encoder = EncoderRNN(len(src.vocab), MAX_LEN, hidden_size,
                         rnn_cell="lstm",
                         bidirectional=bidirectional,
                         dropout_p=dropout,
                         variable_lengths=False)
    decoder = DecoderRNN(len(tgt.vocab), MAX_LEN,
                         hidden_size,  # * 2 if bidirectional else hidden_size,
                         rnn_cell="lstm",
                         use_attention=attention,
                         eos_id=tgt.eos_id,
                         sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    using_cuda = False
    if torch.cuda.is_available():
        using_cuda = True
        encoder.cuda()
        decoder.cuda()
        seq2seq.cuda()
        loss.cuda()
    EXPERIMENT.param("CUDA", using_cuda)
    for param in seq2seq.parameters():
        param.data.uniform_(-init_value, init_value)
    trainer = SupervisedTrainer(loss=loss,
                                batch_size=mini_batch_size,
                                checkpoint_every=5000,
                                random_seed=42,
                                print_every=1000)
    return seq2seq, trainer
import numpy as np

import seq2seq.layers.decoders
from seq2seq.models import Seq2seq
import cPickle

batch_size = 2
input_dim = 3
output_dim = 4
timesteps_i = 5
timesteps_o = 6

X_batch = np.arange(batch_size * timesteps_i * input_dim).reshape(
    batch_size, timesteps_i, input_dim)
Y_batch = np.arange(batch_size * timesteps_o * output_dim).reshape(
    batch_size, timesteps_o, output_dim)

model = Seq2seq(batch_input_shape=(batch_size, timesteps_i, input_dim),
                hidden_dim=7, output_length=timesteps_o,
                output_dim=output_dim, depth=2, peek=True)
# model.add(SimpleRNN(output_dim,
#                     input_shape=(timesteps, input_dim),
#                     return_sequences=True,
#                     unroll=True))
model.compile(loss='categorical_crossentropy', optimizer='sgd')
model.train_on_batch(X_batch, Y_batch)
# loss_and_metrics = model.evaluate(X_test, Y_test, batch_size=32)
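# A minimal follow-up sketch (an assumption, not in the original): after
# train_on_batch, the Keras-style model can produce predictions whose shape
# follows the definitions above, (batch_size, timesteps_o, output_dim).
preds = model.predict(X_batch)
print(preds.shape)  # expected: (2, 6, 4)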
loss1, accuracy1 = evaluator.evaluate(seq2seq_m, test)
print(" testing ")
print("loss: ", loss1)
print("accuracy: ", accuracy1)

loss1, accuracy1 = evaluator.evaluate(seq2seq_m, dev)
print(" evaluation set ")
print("loss: ", loss1)
print("accuracy: ", accuracy1)

#############
beam_search = Seq2seq(seq2seq_m.encoder, TopKDecoder(seq2seq_m.decoder, 3))
if torch.cuda.is_available():
    beam_search.cuda()
else:
    print(" error no cuda")
predictor = Predictor(beam_search, input_vocab, output_vocab)

####
from rouge import Rouge
print("**training rouge")
references = []
hypothesis = []
test_set = train
for i in range(len(test_set)):
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
    loss.cuda()

seq2seq = None
optimizer = None
if not opt.resume:
    # Initialize model
    hidden_size = 128
    bidirectional = True
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         bidirectional=bidirectional, variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.2, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq.cuda()

    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
    #
    # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    # scheduler = StepLR(optimizer.optimizer, 1)
    # optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss, batch_size=32,
def main():
    '''Main Function'''
    parser = argparse.ArgumentParser(description='sum_file.py')
    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='Path to the preprocessed data file with the vocabulary')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case,
        preprocess_settings.mode)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # prepare model
    device = torch.device('cuda' if opt.cuda else 'cpu')
    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    model_opt.bidirectional = True

    encoder = EncoderRNN(model_opt.src_vocab_size, model_opt.max_token_seq_len,
                         model_opt.d_model,
                         bidirectional=model_opt.bidirectional,
                         variable_lengths=True)
    decoder = DecoderRNN(model_opt.tgt_vocab_size, model_opt.max_token_seq_len,
                         model_opt.d_model * 2 if model_opt.bidirectional else model_opt.d_model,
                         n_layers=model_opt.n_layer,
                         dropout_p=model_opt.dropout,
                         use_attention=True,
                         bidirectional=model_opt.bidirectional,
                         eos_id=Constants.EOS,  # EOS terminates decoding
                         sos_id=Constants.BOS)  # BOS starts decoding
    model = Seq2seq(encoder, decoder).to(device)
    model = nn.DataParallel(model)  # using DataParallel because training used it
    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    predictor = Predictor(model, preprocess_data['dict']['tgt'])
    with open(opt.output, 'w') as f:
        for src_seq in tqdm(test_src_insts, mininterval=2,
                            desc=' - (Test)', leave=False):
            pred_line = ' '.join(predictor.predict(src_seq))
            f.write(pred_line + '\n')
    print('[Info] Finished.')
def train(args):
    ###########################################################################
    # Load data
    ###########################################################################
    cuda = int(torch.cuda.is_available()) - 1

    TEXT = data.Field(lower=True, init_token="<start>", eos_token="<end>")
    LABELS = data.Field(sequential=True)

    train, val, test = data.TabularDataset.splits(
        path='../ms_draw/',  # ms_draw data
        train='draw-train.tsv',
        validation='draw-dev.tsv',
        test='draw-test.tsv',
        format='tsv',
        fields=[('text', TEXT), ('label', LABELS)])
    print('train.examples.data:', train.examples[0].label)

    prevecs = None
    if args.pretr_emb:
        # print('Making vocab w/ glove.6B.' + str(args.emb_dim) + ' dim vectors')
        TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=args.emb_dim),
                         min_freq=args.mf)  # wv_type="glove.6B"
        prevecs = TEXT.vocab.vectors
    else:
        TEXT.build_vocab(train)
    LABELS.build_vocab(train)
    vecs = Vecs(args.emb_dim)

    # print('Making iterator for splits...')
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_sizes=(args.batch_size, args.batch_size, args.batch_size),
        sort_key=lambda x: len(x.text))  # , device=cuda)

    num_classes = len(LABELS.vocab)
    vocab_size = len(TEXT.vocab)

    ###########################################################################
    # Build the model
    ###########################################################################
    encoder_model = EncoderRNN(vocab_size=vocab_size,
                               max_len=200,
                               hidden_size=args.hidden_sz,
                               input_dropout_p=0,
                               dropout_p=args.dropout,
                               n_layers=args.num_layers,
                               bidirectional=args.num_dir == 2,
                               rnn_cell=args.net_type,
                               variable_lengths=False)
    decoder_model = DecoderRNN(vocab_size=vocab_size,
                               max_len=200,
                               hidden_size=args.hidden_sz,
                               sos_id=2,  # Add to params
                               eos_id=3,  # Add to params
                               n_layers=args.num_layers,
                               rnn_cell=args.net_type,
                               bidirectional=args.num_dir == 2,
                               input_dropout_p=0,
                               dropout_p=args.dropout,
                               use_attention=False)
    model = Seq2seq(encoder_model, decoder_model)

    criterion = NLLLoss()
    # criterion = nn.CrossEntropyLoss()

    # Select optimizer
    if args.opt == 'adamax':
        optimizer = torch.optim.Adamax(model.parameters())  # , lr=args.lr
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters())  # , lr=args.lr
    elif args.opt == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.5)
    else:
        # print('Optimizer unknown, defaulting to adamax')
        optimizer = torch.optim.Adamax(model.parameters())

    ###########################################################################
    # Training the Model
    ###########################################################################
    if cuda == 0:
        model = model.cuda()  # args.device

    highest_t1_acc = 0
    highest_t1_acc_metrics = ''
    highest_t1_acc_params = ''
    results = ''
    for epoch in range(args.epochs):
        losses = []
        tot_loss = 0
        train_iter.repeat = False
        for batch_count, batch in enumerate(train_iter):
            print('Batch:', batch_count)
            model.zero_grad()
            inp = batch.text.t()
            print('type(inp)', type(inp))
            inp3d = torch.autograd.Variable(
                torch.cuda.FloatTensor(inp.size(0), inp.size(1), args.emb_dim))
            print('type(inp3d)', type(inp3d))
            for i in range(inp.size(0)):
                for j in range(inp.size(1)):
                    inp3d[i, j, :] = vecs[TEXT.vocab.itos[inp[i, j].data[0]]]
            # print("INP: ", inp.size())
            preds = model(inp3d)
            # print("PREDS: ", np.shape(preds))
            # print("LABELS: ", batch.label.size())
            loss = criterion(preds, batch.label)
            loss.backward()
            optimizer.step()
            losses.append(loss)
            tot_loss += loss.data[0]
            # if (batch_count % 20 == 0):
            #     print('Batch: ', batch_count, '\tLoss: ', str(losses[-1].data[0]))
            batch_count += 1
        # print('Average loss over epoch ' + str(epoch) + ': ' + str(tot_loss/len(losses)))
        (avg_loss, accuracy, corrects, size,
         t5_acc, t5_corrects, mrr) = eval(val_iter, model, vecs, TEXT,
                                          args.emb_dim)  # , args.device
        if accuracy > args.acc_thresh:
            save_path = '{}/acc{:.2f}_e{}.pt'.format(args.save_path_full, accuracy, epoch)
            if not os.path.isdir(args.save_path_full):
                os.makedirs(args.save_path_full)
            torch.save(model, save_path)
        if highest_t1_acc < accuracy:
            highest_t1_acc = accuracy
            highest_t1_acc_metrics = (
                'acc: {:6.4f}%({:3d}/{}) EPOCH{:2d} - loss: {:.4f} '
                't5_acc: {:6.4f}%({:3d}/{}) MRR: {:.6f}'.format(
                    accuracy, corrects, size, epoch, avg_loss,
                    t5_acc, t5_corrects, size, mrr))
            highest_t1_acc_params = (
                'PARAMETERS:'
                'net-%s_e%i_bs%i_opt-%s_ly%i_hs%i_dr%i_ed%i_femb%s_ptemb%s_drp%.1f_mf%d\n'
                % (args.net_type, args.epochs, args.batch_size, args.opt,
                   args.num_layers, args.hidden_sz, args.num_dir, args.emb_dim,
                   args.embfix, args.pretr_emb, args.dropout, args.mf))
        results += ('\nEPOCH{:2d} - loss: {:.4f} acc: {:6.4f}%({:3d}/{}) '
                    't5_acc: {:6.4f}%({:3d}/{}) MRR: {:.6f}'.format(
                        epoch, avg_loss, accuracy, corrects, size,
                        t5_acc, t5_corrects, size, mrr))
        print(highest_t1_acc_metrics + '\n')
    writeResults(args, results, highest_t1_acc, highest_t1_acc_metrics,
                 highest_t1_acc_params)
                     rnn_cell='lstm',
                     bidirectional=bidirectional,
                     n_layers=2,
                     variable_lengths=True)
decoder = DecoderRNN(len(tgt.vocab), max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     rnn_cell='lstm', dropout_p=0.25,
                     use_attention=True, bidirectional=bidirectional,
                     n_layers=2,
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq_model = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq_model.cuda()

for param in seq2seq_model.parameters():
    param.data.uniform_(-0.1, 0.1)

optimizer = Optimizer(torch.optim.Adam(seq2seq_model.parameters()), max_grad_norm=5)

# In[20]:

seq2seq_model = torch.nn.DataParallel(seq2seq_model)

# In[21]:
                     use_attention=True, bidirectional=True,
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id,
                     embedding=hidden_size, use_concept=opt.concept)
dialog_encoder = torch.nn.LSTM(input_size=hidden_size * 2 if bidirectional else hidden_size,
                               hidden_size=dialog_hidden_size,
                               batch_first=True, dropout=dropout)
if opt.concept:
    seq2seq = Seq2seq(encoder, decoder, dialog_encoder=dialog_encoder,
                      cpt_vocab=cpt.vocab, hidden_size=dialog_hidden_size,
                      concept_level=opt.concept_level,
                      conceptnet_file=opt.conceptnet_file)
else:
    seq2seq = Seq2seq(encoder, decoder, dialog_encoder=dialog_encoder,
                      hidden_size=dialog_hidden_size)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# Optimizer and learning rate scheduler can be customized by
def run_training(opt, default_data_dir, num_epochs=100):
    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                         opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                                       opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2seq = checkpoint.model
        input_vocab = checkpoint.input_vocab
        output_vocab = checkpoint.output_vocab
    else:
        # Prepare dataset
        src = SourceField()
        tgt = TargetField()
        max_len = 50

        data_file = os.path.join(default_data_dir, opt.train_path, 'data.txt')
        logging.info("Starting new training session on %s", data_file)

        def len_filter(example):
            return (len(example.src) <= max_len) and (len(example.tgt) <= max_len) \
                and (len(example.src) > 0) and (len(example.tgt) > 0)

        train = torchtext.data.TabularDataset(
            path=data_file, format='json',
            fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
            filter_pred=len_filter
        )

        dev = None
        if opt.no_dev is False:
            dev_data_file = os.path.join(default_data_dir, opt.train_path,
                                         'dev-data.txt')
            dev = torchtext.data.TabularDataset(
                path=dev_data_file, format='json',
                fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
                filter_pred=len_filter
            )

        src.build_vocab(train, max_size=50000)
        tgt.build_vocab(train, max_size=50000)
        input_vocab = src.vocab
        output_vocab = tgt.vocab

        # NOTE: If the source field name and the target field name
        # are different from 'src' and 'tgt' respectively, they have
        # to be set explicitly before any training or inference
        # seq2seq.src_field_name = 'src'
        # seq2seq.tgt_field_name = 'tgt'

        # Prepare loss
        weight = torch.ones(len(tgt.vocab))
        pad = tgt.vocab.stoi[tgt.pad_token]
        loss = Perplexity(weight, pad)
        if torch.cuda.is_available():
            logging.info("Yayyy We got CUDA!!!")
            loss.cuda()
        else:
            logging.info("No CUDA device available, running on CPU")

        seq2seq = None
        optimizer = None
        if not opt.resume:
            hidden_size = 128
            decoder_hidden_size = hidden_size * 2
            logging.info("EncoderRNN Hidden Size: %s", hidden_size)
            logging.info("DecoderRNN Hidden Size: %s", decoder_hidden_size)
            bidirectional = True
            encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                                 bidirectional=bidirectional,
                                 rnn_cell='lstm',
                                 variable_lengths=True)
            decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size,
                                 dropout_p=0, use_attention=True,
                                 bidirectional=bidirectional,
                                 rnn_cell='lstm',
                                 eos_id=tgt.eos_id, sos_id=tgt.sos_id)

            seq2seq = Seq2seq(encoder, decoder)
            if torch.cuda.is_available():
                seq2seq.cuda()

            for param in seq2seq.parameters():
                param.data.uniform_(-0.08, 0.08)

            # Optimizer and learning rate scheduler can be customized by
            # explicitly constructing the objects and pass to the trainer.
            optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()),
                                  max_grad_norm=5)
            scheduler = StepLR(optimizer.optimizer, 1)
            optimizer.set_scheduler(scheduler)

        # train
        batch_size = 32
        checkpoint_every = num_epochs / 10
        print_every = num_epochs / 100

        properties = dict(batch_size=batch_size,
                          checkpoint_every=checkpoint_every,
                          print_every=print_every,
                          expt_dir=opt.expt_dir,
                          num_epochs=num_epochs,
                          teacher_forcing_ratio=0.5,
                          resume=opt.resume)
        logging.info("Starting training with the following properties %s",
                     json.dumps(properties, indent=2))

        t = SupervisedTrainer(loss=loss, batch_size=batch_size,
                              checkpoint_every=checkpoint_every,
                              print_every=print_every,
                              expt_dir=opt.expt_dir)

        seq2seq = t.train(seq2seq, train,
                          num_epochs=num_epochs, dev_data=dev,
                          optimizer=optimizer,
                          teacher_forcing_ratio=0.5,
                          resume=opt.resume)

        evaluator = Evaluator(loss=loss, batch_size=batch_size)

        if opt.no_dev is False:
            dev_loss, accuracy = evaluator.evaluate(seq2seq, dev)
            logging.info("Dev Loss: %s", dev_loss)
            logging.info("Accuracy: %s", accuracy)

    beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4))
    predictor = Predictor(beam_search, input_vocab, output_vocab)

    while True:
        try:
            seq_str = raw_input("Type in a source sequence:")
            seq = seq_str.strip().split()
            results = predictor.predict_n(seq, n=3)
            for i, res in enumerate(results):
                print('option %s: %s\n' % (i + 1, res))
        except KeyboardInterrupt:
            logging.info("Bye Bye")
            exit(0)
                     hidden_size,
                     n_layers=n_layers,
                     bidirectional=bidirectional,
                     variable_lengths=True)
decoder = DecoderRNN(len(output_vocab), opt.max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     dropout_p=decoder_dropout,
                     use_attention=True,
                     bidirectional=bidirectional,
                     n_layers=n_layers,
                     eos_id=first_field.eos_id,
                     sos_id=first_field.sos_id)
seq2seq = Seq2seq(encoder, decoder, batch_size, num_sequences)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and pass to the trainer.
#
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss,
                      hidden_size * 2 if bidirectional else hidden_size,
                      dropout_p=0.2, use_attention=args.use_att,
                      bidirectional=bidirectional,
                      eos_id=sos_id, sos_id=sos_id + 1,
                      batch_size=args.batch_size,
                      att_method=args.att_method,
                      att_mlp=args.att_mlp,
                      att_type=args.att_type)
# decoder3 = DecoderRNN(args.decoder3_n_layer, args.vocab_size, max_len,
#                       hidden_size * 2 if bidirectional else hidden_size,
#                       dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
#                       eos_id=eos_id, sos_id=sos_id)
# seq2seq = Seq2seq(args, decoder1, decoder2, decoder3)
seq2seq = Seq2seq(args, decoder1, decoder2)
seq2seq.cuda()
seq2seq = torch.nn.DataParallel(seq2seq)
cudnn.benchmark = True

print('Initialize model parameter ...')
if args.init == 'uniform':
    print('uniform init !')
    for param in seq2seq.parameters():
        param.data.uniform_(-args.init_weight, args.init_weight)
elif args.init == 'mos':
    print('mos init !')
    for m in seq2seq.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
def offline_training(opt, target_file_path):

    # Prepare dataset with torchtext
    src = SourceField(tokenize=treebank_tokenizer)
    tgt = TargetField(tokenize=treebank_tokenizer)

    def sample_filter(sample):
        """sample example for future purpose"""
        return True

    train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=sample_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=sample_filter)
    test = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                         fields=[('src', src), ('tgt', tgt)],
                                         filter_pred=sample_filter)
    src.build_vocab(train, max_size=opt.src_vocab_size)
    tgt.build_vocab(train, max_size=opt.tgt_vocab_size)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    if opt.loss == 'perplexity':
        loss = Perplexity(weight, pad)
    else:
        raise TypeError

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        encoder = EncoderRNN(vocab_size=len(src.vocab),
                             max_len=opt.max_length,
                             hidden_size=opt.hidden_size,
                             input_dropout_p=opt.input_dropout_p,
                             dropout_p=opt.dropout_p,
                             n_layers=opt.n_layers,
                             bidirectional=opt.bidirectional,
                             rnn_cell=opt.rnn_cell,
                             variable_lengths=True,
                             embedding=input_vocab.vectors if opt.use_pre_trained_embedding else None,
                             update_embedding=opt.update_embedding)
        decoder = DecoderRNN(vocab_size=len(tgt.vocab),
                             max_len=opt.max_length,
                             hidden_size=opt.hidden_size * 2 if opt.bidirectional else opt.hidden_size,
                             sos_id=tgt.sos_id,
                             eos_id=tgt.eos_id,
                             n_layers=opt.n_layers,
                             rnn_cell=opt.rnn_cell,
                             bidirectional=opt.bidirectional,
                             input_dropout_p=opt.input_dropout_p,
                             dropout_p=opt.dropout_p,
                             use_attention=opt.use_attention)
        seq2seq = Seq2seq(encoder=encoder, decoder=decoder)
        if opt.gpu >= 0 and torch.cuda.is_available():
            seq2seq.cuda()
        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

    # train
    trainer = SupervisedTrainer(loss=loss,
                                batch_size=opt.batch_size,
                                checkpoint_every=opt.checkpoint_every,
                                print_every=opt.print_every,
                                expt_dir=opt.expt_dir)
    seq2seq = trainer.train(model=seq2seq,
                            data=train,
                            num_epochs=opt.epochs,
                            resume=opt.resume,
                            dev_data=dev,
                            optimizer=optimizer,
                            teacher_forcing_ratio=opt.teacher_forcing_rate)
def train():
    src = SourceField(sequential=True, tokenize=lambda x: jieba.lcut(x))
    tgt = TargetField(sequential=True, tokenize=lambda x: jieba.lcut(x))
    max_len = 50

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path, format='csv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path, format='csv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        hidden_size = 128
        bidirectional = True
        encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                             bidirectional=bidirectional, variable_lengths=True)
        decoder = DecoderRNN(len(tgt.vocab), max_len,
                             hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2, use_attention=True,
                             bidirectional=bidirectional,
                             eos_id=tgt.eos_id, sos_id=tgt.sos_id)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()

        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
    #
    # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    # scheduler = StepLR(optimizer.optimizer, 1)
    # optimizer.set_scheduler(scheduler)

    # train
    t = SupervisedTrainer(loss=loss, batch_size=32,
                          checkpoint_every=50, print_every=10,
                          expt_dir=opt.expt_dir)
    seq2seq = t.train(seq2seq, train,
                      num_epochs=6, dev_data=dev,
                      optimizer=optimizer,
                      teacher_forcing_ratio=0.5,
                      resume=opt.resume)

    predictor = Predictor(seq2seq, input_vocab, output_vocab)
encoder_embedding_size = 100
encoder_hidden_size = 50
encoder = Encoder(encoder_vocab_size, encoder_embedding_size, encoder_hidden_size)

# decoder
decoder_vocab_size = len(dest_word2index)
decoder_embedding_size = 100
decoder_hidden_size = 50
decoder_output_size = 100
decoder = Decoder(decoder_vocab_size, decoder_embedding_size,
                  decoder_hidden_size, decoder_output_size)

# Sequence-to-sequence learning model
model = Seq2seq(encoder, decoder,
                RMSprop(clip=5.0, lr=0.001, gamma=0.9, eps=1e-8),
                logger=logger)

# training
def epoch_end_callback():
    def sampling(x, mask_x, y, mask_y, sample_size=5):
        sample_indices = rng.randint(0, x.get_value(borrow=True).shape[0], sample_size)
        predict = model.predict(x[sample_indices], mask_x[:, sample_indices],
                                y[sample_indices], mask_y[:, sample_indices])
        sample_x = x.get_value(borrow=True)[sample_indices]
        sample_y = y.get_value(borrow=True)[sample_indices]
        predict_y = predict.eval()
        return (sample_x, sample_y, predict_y)
    bidirectional=bidirectional,
    rnn_cell="lstm",
    variable_lengths=True,
)
decoder = DecoderRNN(
    len(tgt.vocab),
    max_len,
    hidden_size * 2,
    dropout_p=0.2,
    use_attention=True,
    bidirectional=bidirectional,
    rnn_cell="lstm",
    eos_id=tgt.eos_id,
    sos_id=tgt.sos_id,
)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# train
t = SupervisedTrainer(
    loss=loss,
    batch_size=32,
    checkpoint_every=50,
    print_every=10,
    expt_dir=opt.expt_dir,
)
                     dropout_p=0.2, use_attention=True,
                     bidirectional=bidirectional,
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id,
                     embedding=hidden_size, use_concept=opt.concept)
dialog_encoder = torch.nn.LSTM(input_size=hidden_size * 2 if bidirectional else hidden_size,
                               hidden_size=dialog_hidden_size,
                               batch_first=True, dropout=dropout)
if opt.concept:
    seq2seq = Seq2seq(encoder, decoder, dialog_encoder=dialog_encoder,
                      cpt_vocab=cpt.vocab, hidden_size=dialog_hidden_size)
else:
    seq2seq = Seq2seq(encoder, decoder, dialog_encoder=dialog_encoder,
                      hidden_size=dialog_hidden_size)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and pass to the trainer.
                     bidirectional=bidirectional,
                     n_layers=1, rnn_cell='gru',
                     variable_lengths=True)
decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=max_len,
                     hidden_size=hidden_size * 2 if bidirectional else hidden_size,
                     dropout_p=opt.dropout, use_attention=True,
                     bidirectional=bidirectional,
                     n_layers=1, rnn_cell='gru',
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)
    print(param.data[0:3])

_, _, norm_val = encoder.vectors_stats()
encoder.init_vectors(src.vocab.vectors)
# encoder.scale_vectors(0.08)
encoder.normalize_vectors(norm_val)
encoder.vectors_stats()
for param in seq2seq.parameters():
    print(param.data[0:3])

if torch.cuda.is_available():
    seq2seq.cuda()

# Optimizer and learning rate scheduler can be customized by
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=3)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_model', type=int, default=1024)
    parser.add_argument('-n_layer', type=int, default=1)
    parser.add_argument('-dropout', type=float, default=0)
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-teacher_forcing_ratio', type=float, default=0.5)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.log = opt.save_model

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.cuda:
        torch.cuda.manual_seed_all(opt.seed)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    print(opt)
    device = torch.device('cuda' if opt.cuda else 'cpu')

    # model
    opt.bidirectional = True
    encoder = EncoderRNN(opt.src_vocab_size, opt.max_token_seq_len, opt.d_model,
                         bidirectional=opt.bidirectional, variable_lengths=True)
    decoder = DecoderRNN(opt.tgt_vocab_size, opt.max_token_seq_len,
                         opt.d_model * 2 if opt.bidirectional else opt.d_model,
                         n_layers=opt.n_layer, dropout_p=opt.dropout,
                         use_attention=True, bidirectional=opt.bidirectional,
                         eos_id=Constants.EOS,  # EOS terminates decoding
                         sos_id=Constants.BOS)  # BOS starts decoding
    seq2seq = Seq2seq(encoder, decoder).to(device)
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)
    seq2seq = nn.DataParallel(seq2seq)

    # loss
    weight = torch.ones(opt.tgt_vocab_size)
    pad = Constants.PAD
    loss = Perplexity(weight, pad)
    if opt.cuda:
        loss.cuda()

    # optimizer
    optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)

    train(seq2seq, training_data, validation_data, loss, optimizer, device, opt)
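# Hypothetical invocation (the script name and paths are placeholders; the
# flags match the argparse definitions above):
#   python train.py -data data/preprocessed.pt -epoch 3 -batch_size 64 -save_model trained
if __name__ == '__main__':
    main()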