def setUp(self):
    """Load the bundled eng-fra TSV into a TabularDataset and build both vocabs."""
    data_dir = os.path.dirname(os.path.realpath(__file__))
    source_field = SourceField()
    target_field = TargetField()
    self.dataset = torchtext.data.TabularDataset(
        path=os.path.join(data_dir, 'data/eng-fra.txt'),
        format='tsv',
        fields=[('src', source_field), ('tgt', target_field)],
    )
    # Both vocabularies are built from the same dataset.
    for field in (source_field, target_field):
        field.build_vocab(self.dataset)
def make_datasets(train_df, dev_df):
    """Build train/dev datasets with character-level fields.

    Vocabularies are built from the training split only.
    Returns (train_dataset, dev_dataset, source_field, target_field).
    """
    source_field = SourceField(tokenize=list)
    target_field = TargetField(tokenize=list)
    field_pair = (source_field, target_field)
    train_set = _prepare_dataset(train_df, field_pair)
    dev_set = _prepare_dataset(dev_df, field_pair)
    source_field.build_vocab(train_set)
    target_field.build_vocab(train_set)
    return train_set, dev_set, source_field, target_field
def setUpClass(self):
    """Build a tiny LSTM seq2seq over the eng-fra TSV and expose a Predictor."""
    # NOTE(review): named setUpClass but takes `self`; presumably decorated
    # with @classmethod outside this view — confirm.
    data_dir = os.path.dirname(os.path.realpath(__file__))
    src = SourceField()
    trg = TargetField()
    dataset = torchtext.data.TabularDataset(
        path=os.path.join(data_dir, 'data/eng-fra.txt'),
        format='tsv',
        fields=[('src', src), ('trg', trg)],
    )
    src.build_vocab(dataset)
    trg.build_vocab(dataset)
    # Tiny model: embedding and hidden size of 10, LSTM cells on both sides.
    encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
    decoder = DecoderRNN(len(trg.vocab), 10, 10, trg.sos_id, trg.eos_id,
                         rnn_cell='lstm')
    model = Seq2seq(encoder, decoder)
    self.predictor = Predictor(model, src.vocab, trg.vocab)
def setUp(self):
    """Build dataset, vocabs, and a randomly-initialized LSTM seq2seq model."""
    data_dir = os.path.dirname(os.path.realpath(__file__))
    src = SourceField()
    tgt = TargetField()
    self.dataset = torchtext.data.TabularDataset(
        path=os.path.join(data_dir, 'data/eng-fra.txt'),
        format='tsv',
        fields=[('src', src), ('tgt', tgt)],
    )
    src.build_vocab(self.dataset)
    tgt.build_vocab(self.dataset)
    encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
    decoder = DecoderRNN(len(tgt.vocab), 10, 10, tgt.sos_id, tgt.eos_id,
                         rnn_cell='lstm')
    self.seq2seq = Seq2seq(encoder, decoder)
    # Same uniform initialization the library examples use.
    for weight in self.seq2seq.parameters():
        weight.data.uniform_(-0.08, 0.08)
def gen_data(train_path, dev_path):
    """Load train/dev TSV datasets and build vocabularies from the train split.

    Returns (train_dataset, dev_dataset, input_vocab, output_vocab).
    """
    src_field = SourceField()
    tgt_field = TargetField()
    # Both splits share field definitions and the length filter.
    shared = dict(format='tsv',
                  fields=[('src', src_field), ('tgt', tgt_field)],
                  filter_pred=len_filter)
    train_set = torchtext.data.TabularDataset(path=train_path, **shared)
    dev_set = torchtext.data.TabularDataset(path=dev_path, **shared)
    src_field.build_vocab(train_set, max_size=50000)
    tgt_field.build_vocab(train_set, max_size=50000)
    return train_set, dev_set, src_field.vocab, tgt_field.vocab
src = SourceField(init_token='<sos>', eos_token='<eos>')
tgt = TargetField( init_token='<sos>', eos_token='<eos>')
# init_token='<sos>', eos_token='<eos>'
# Two-input diff model: the deleted and added streams share one SourceField,
# so they also share a single source vocabulary.
train_data = torchtext.data.TabularDataset(
    # path='data/diffs/test/val.small.del_add.data', format='tsv',
    # path='data/splitted_two_input_100/train_100_10.data', format='tsv',
    path='data/splitted_two_input_100/train_100_47500.data', format='tsv',
    # path='data/splitted_two_input_200/train_200_83000.data', format='tsv',
    # path='../../../../new_data/processed_data/splitted_two_input_100/train_100.data', format='tsv',
    fields=[(src_del_field_name, src), (src_add_field_name, src), (tgt_field_name, tgt)],
)
src.build_vocab(train_data, max_size=50000)
tgt.build_vocab(train_data, max_size=50000, min_freq=1)
input_vocab = src.vocab
output_vocab = tgt.vocab
pad_index = output_vocab.stoi['<pad>']
test_data = torchtext.data.TabularDataset(
    # path='data/diffs/test/val.small.del_add.data', format='tsv',
    path='data/splitted_two_input_100/test_100_19000.data', format='tsv',
    # path='data/splitted_two_input_200/test_200_17000.data', format='tsv',
    # path='data/splitted_two_input_100/test_100.data', format='tsv',
    # path='../../../../new_data/processed_data/splitted_two_input_100/train_100.data', format='tsv',
    fields=[(src_del_field_name, src), (src_add_field_name, src), (tgt_field_name, tgt)],
)
# (fragment: the first five statements belong to an if-branch whose header is
# outside this view; the `else:` below pairs with that unseen `if`.)
src.rebuild_vocab(input_vocab)  # reuse vocabs restored from a checkpoint
tgt.rebuild_vocab(output_vocab)
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = BLEULoss(weight, pad, tgt)
else:
    # Prepare dataset
    src = SourceField(sequential=True, use_vocab=True)
    tgt = TargetField(sequential=True, use_vocab=True)
    max_len = 23
    train = torchtext.data.TabularDataset(path=opt.train_path, format="tsv",
                                          fields=[("src", src), ("tgt", tgt)])
    src.build_vocab(train, vectors="glove.6B.100d", max_size=16384)
    tgt.build_vocab(train, vectors="glove.6B.100d", max_size=16384)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'
    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = BLEULoss(weight, pad, tgt)
# (fragment: `src` is defined before this view.)
tgt = TargetField()
max_len = 200

def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                      fields=[('src', src), ('tgt', tgt)],
                                      filter_pred=len_filter)
dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                    fields=[('src', src), ('tgt', tgt)],
                                    filter_pred=len_filter)
# Pre-trained source-side vectors loaded from a local file.
vectors = torchtext.vocab.Vectors(
    name='../data/toy_reverse/train/falv.vector')
src.build_vocab(train, max_size=200000, vectors=vectors)
tgt.build_vocab(train, max_size=200000)
input_vocab = src.vocab
output_vocab = tgt.vocab
# Embedding layer created directly through PyTorch.
# NOTE(review): 2893/400 are hard-coded — presumably vocab size and vector
# dim of the loaded file; confirm they match `src.vocab.vectors`.
embedding = nn.Embedding(2893, 400)
# Initialize the embedding matrix with the pre-trained weights.
weight_matrix = src.vocab.vectors
embedding.weight.data.copy_(weight_matrix)
# To keep the pre-trained weights trainable, set requires_grad=True:
# embeddings.weight = nn.Parameter(embeddings, requires_grad=True)
# (fragment: `src` and `stopwords` are defined before this view.)
tgt = TargetField(stop_words=stopwords)
max_len = 100

def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(
    path=opt.train_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
dev = torchtext.data.TabularDataset(
    path=opt.dev_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
# Both vocabs are initialized with 300-dim GloVe vectors.
src.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
tgt.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
input_vocab = src.vocab
output_vocab = tgt.vocab
print(len(train), len(dev))
# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'
# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
def train():
    # Train a Chinese seq2seq model: jieba-tokenized CSV data, bidirectional
    # encoder with attention decoder.  (Continues past this view.)
    src = SourceField(sequential=True, tokenize=lambda x: [i for i in jieba.lcut(x)])
    tgt = TargetField(sequential=True, tokenize=lambda x: [i for i in jieba.lcut(x)])
    max_len = 50

    def len_filter(example):
        # Keep only pairs where both sides fit within max_len tokens.
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path, format='csv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path, format='csv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'
    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()
    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        hidden_size = 128
        bidirectional = True
        encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                             bidirectional=bidirectional, variable_lengths=True)
        # Decoder hidden size doubles when the encoder is bidirectional.
        decoder = DecoderRNN(len(tgt.vocab), max_len,
                             hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2, use_attention=True,
                             bidirectional=bidirectional,
                             eos_id=tgt.eos_id, sos_id=tgt.sos_id)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()
        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)
    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
# # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5) # scheduler = StepLR(optimizer.optimizer, 1) # optimizer.set_scheduler(scheduler) # train t = SupervisedTrainer(loss=loss, batch_size=32, checkpoint_every=50, print_every=10, expt_dir=opt.expt_dir) seq2seq = t.train(seq2seq, train, num_epochs=6, dev_data=dev, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=opt.resume) predictor = Predictor(seq2seq, input_vocab, output_vocab)
# (fragment: both `else:` branches below pair with `if` headers outside this
# view — real indentation levels cannot be reconstructed from here.)
else:
    logging.info("loading checkpoint from {}".format(
        os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)))
    checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)
    checkpoint = Checkpoint.load(checkpoint_path)
    seq2seq = checkpoint.model
    # input_vocab = checkpoint.input_vocab
    # output_vocab = checkpoint.output_vocab
    # Re-attach the restored vocabularies onto the live fields.
    src.vocab = checkpoint.input_vocab
    tgt.vocab = checkpoint.output_vocab
else:
    src.build_vocab(train, max_size=params['src_vocab_size'], specials=replace_tokens)
    tgt.build_vocab(train, max_size=params['tgt_vocab_size'])
    # input_vocab = src.vocab
    # output_vocab = tgt.vocab
    logging.info('Indices of special replace tokens:\n')
    for rep in replace_tokens:
        logging.info("%s, %d; " % (rep, src.vocab.stoi[rep]))
    logging.info('\n')
# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
src = SourceField()
tgt = TargetField()
max_len = 20

def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                      fields=[('src', src), ('tgt', tgt)],
                                      filter_pred=len_filter)
dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                    fields=[('src', src), ('tgt', tgt)],
                                    filter_pred=len_filter)
# min_freq=1 keeps every token seen at least once.
src.build_vocab(train, max_size=100000, min_freq=1)
tgt.build_vocab(train, max_size=100000, min_freq=1)
input_vocab = src.vocab
output_vocab = tgt.vocab
# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'
# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
# (fragment: this `if` body continues past this view)
if torch.cuda.is_available():
src = SourceField()
tgt = TargetField()
# Hyperparameters come from an external `params` mapping.
max_len = params['max_len']

def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                      fields=[('src', src), ('tgt', tgt)],
                                      filter_pred=len_filter)
dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                    fields=[('src', src), ('tgt', tgt)],
                                    filter_pred=len_filter)
src.build_vocab(train, max_size=params['src_vocab_size'])
tgt.build_vocab(train, max_size=params['tgt_vocab_size'])
input_vocab = src.vocab
output_vocab = tgt.vocab
# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'
# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
# (fragment: this `if` body continues past this view)
if torch.cuda.is_available():
# (fragment: `src`, `tgt`, `opt` and `len_filter` are defined before this view.)
train = torchtext.data.TabularDataset(
    path=opt.train_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
dev = torchtext.data.TabularDataset(
    path=opt.dev_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
test = torchtext.data.TabularDataset(
    path=opt.test_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
src.build_vocab(train, max_size=50000)
tgt.build_vocab(train, max_size=50000)
input_vocab = src.vocab
output_vocab = tgt.vocab
# inputs = torchtext.Field(lower=True, include_lengths=True, batch_first=True)
# inputs.build_vocab(src.vocab)
# NOTE(review): load_vectors(wv_type=..., wv_dim=...) is a legacy torchtext
# signature — confirm the installed torchtext version still supports it.
src.vocab.load_vectors(wv_type='glove.840B', wv_dim=300)
# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'
# Prepare loss
('tgt', tgt)],
                          train='test', validation='test', test='test')
# (fragment: the lines above close a `Lang8.splits(...)` call that begins
# before this view.)
adv_train, adv_dev, adv_test = Lang8.splits(exts=('.adv.cor', '.adv.err'),
                                            fields=[('src', src), ('tgt', tgt)],
                                            train='test', validation='test', test='test')
# Bucketed iterators; batch size 1 for adversarial training, 256 elsewhere.
adv_train_iter, adv_dev_iter, real_iter = torchtext.data.BucketIterator.splits(
    (adv_train, adv_dev, adv_train), batch_sizes=(1, 256, 256),
    device=device, sort_key=lambda x: len(x.src))
# Vocab covers all six splits (pre-training and adversarial).
src.build_vocab(pre_train, pre_dev, pre_test, adv_train, adv_dev, adv_test)
tgt.build_vocab(pre_train, pre_dev, pre_test, adv_train, adv_dev, adv_test)
pad_id = tgt.vocab.stoi[tgt.pad_token]
# init generator
encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                     bidirectional=bidirectional, rnn_cell='lstm',
                     variable_lengths=True)
decoder = DecoderRNN(len(tgt.vocab), max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     dropout_p=0.2, use_attention=True,
# (fragment: the first line closes a checkpoint-loading branch whose `if`
# header is outside this view; the `else:` pairs with that unseen `if`.)
output_vocab = checkpoint.output_vocab
else:
    # Prepare dataset
    src = SourceField()
    tgt = TargetField()
    max_len = 50

    def len_filter(example):
        # Keep only pairs where both sides fit within max_len tokens.
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    # 20000 - 2: presumably leaves room for two special tokens — confirm.
    src.build_vocab(train, max_size=20000 - 2)
    tgt.build_vocab(train, max_size=20000 - 2)
    sos_id = tgt.vocab.stoi['<sos>']
# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
    loss.cuda()
seq2seq = None
optimizer = None
if not opt.resume:
    # Initialize model
# (fragment: `src` is defined before this view.)
tgt = TargetField()
max_len = 5

def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                      fields=[('src', src), ('tgt', tgt)],
                                      filter_pred=len_filter)
dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                    fields=[('src', src), ('tgt', tgt)],
                                    filter_pred=len_filter)
# NOTE(review): wv_type/fill_from_vectors are legacy torchtext kwargs —
# confirm the installed torchtext version still accepts them.
src.build_vocab(train, wv_type='glove.6B', fill_from_vectors=True, max_size=100000)
tgt.build_vocab(train, wv_type='glove.6B', fill_from_vectors=True, max_size=100000)
input_vocab = src.vocab
output_vocab = tgt.vocab
# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'
# Prepare loss
def offline_training(opt, traget_file_path): # Prepare dataset with torchtext src = SourceField(tokenize=treebank_tokenizer) tgt = TargetField(tokenize=treebank_tokenizer) def sample_filter(sample): """ sample example for future purpose""" return True train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv', fields=[('src', src), ('tgt', tgt)], filter_pred=sample_filter) dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv', fields=[('src', src), ('tgt', tgt)], filter_pred=sample_filter) test = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv', fields=[('src', src), ('tgt', tgt)], filter_pred=sample_filter) src.build_vocab(train, max_size=opt.src_vocab_size) tgt.build_vocab(train, max_size=opt.tgt_vocab_size) input_vocab = src.vocab output_vocab = tgt.vocab # NOTE: If the source field name and the target field name # are different from 'src' and 'tgt' respectively, they have # to be set explicitly before any training or inference # seq2seq.src_field_name = 'src' # seq2seq.tgt_field_name = 'tgt' # Prepare loss weight = torch.ones(len(tgt.vocab)) pad = tgt.vocab.stoi[tgt.pad_token] if opt.loss == 'perplexity': loss = Perplexity(weight, pad) else: raise TypeError seq2seq = None optimizer = None if not opt.resume: # Initialize model encoder = EncoderRNN(vocab_size=len(src.vocab), max_len=opt.max_length, hidden_size=opt.hidden_size, input_dropout_p=opt.intput_dropout_p, dropout_p=opt.dropout_p, n_layers=opt.n_layers, bidirectional=opt.bidirectional, rnn_cell=opt.rnn_cell, variable_lengths=True, embedding=input_vocab.vectors if opt.use_pre_trained_embedding else None, update_embedding=opt.update_embedding) decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=opt.max_length, hidden_size=opt.hidden_size * 2 if opt.bidirectional else opt.hidden_size, sos_id=tgt.sos_id, eos_id=tgt.eos_id, n_layers=opt.n_layers, rnn_cell=opt.rnn_cell, bidirectional=opt.bidirectional, input_dropout_p=opt.input_dropout_p, dropout_p=opt.dropout_p, 
use_attention=opt.use_attention)
# (fragment: the line above closes a DecoderRNN(...) call begun before this view.)
seq2seq = Seq2seq(encoder=encoder, decoder=decoder)
if opt.gpu >= 0 and torch.cuda.is_available():
    seq2seq.cuda()
# Uniform parameter initialization, as in the library examples.
for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)
# train
trainer = SupervisedTrainer(loss=loss, batch_size=opt.batch_size,
                            checkpoint_every=opt.checkpoint_every,
                            print_every=opt.print_every, expt_dir=opt.expt_dir)
seq2seq = trainer.train(model=seq2seq, data=train,
                        num_epochs=opt.epochs, resume=opt.resume,
                        dev_data=dev, optimizer=optimizer,
                        teacher_forcing_ratio=opt.teacher_forcing_rate)
class auto_seq2seq:
    # Convenience wrapper: loads train/eval TSVs, builds vocabularies, and
    # wires up a bidirectional attention seq2seq ready for SupervisedTrainer.

    def __init__(self, data_path, model_save_path, model_load_path,
                 hidden_size=32, max_vocab=4000, device='cuda'):
        self.src = SourceField()
        self.tgt = TargetField()
        self.max_length = 90
        self.data_path = data_path
        self.model_save_path = model_save_path
        self.model_load_path = model_load_path

        def len_filter(example):
            # Keep only pairs where both sides fit within max_length tokens.
            return len(example.src) <= self.max_length and len(
                example.tgt) <= self.max_length

        self.trainset = torchtext.data.TabularDataset(
            path=os.path.join(self.data_path, 'train'), format='tsv',
            fields=[('src', self.src), ('tgt', self.tgt)],
            filter_pred=len_filter)
        self.devset = torchtext.data.TabularDataset(path=os.path.join(
            self.data_path, 'eval'), format='tsv',
            fields=[('src', self.src), ('tgt', self.tgt)],
            filter_pred=len_filter)
        self.src.build_vocab(self.trainset, max_size=max_vocab)
        self.tgt.build_vocab(self.trainset, max_size=max_vocab)
        weight = torch.ones(len(self.tgt.vocab))
        pad = self.tgt.vocab.stoi[self.tgt.pad_token]
        self.loss = Perplexity(weight, pad)
        # NOTE(review): .cuda() calls here and below are unconditional even
        # though a `device` argument is accepted — confirm CUDA is required.
        self.loss.cuda()
        self.optimizer = None
        self.hidden_size = hidden_size
        self.bidirectional = True
        encoder = EncoderRNN(len(self.src.vocab), self.max_length,
                             self.hidden_size,
                             bidirectional=self.bidirectional,
                             variable_lengths=True)
        # Decoder hidden size doubles when the encoder is bidirectional.
        decoder = DecoderRNN(len(self.tgt.vocab), self.max_length,
                             self.hidden_size * 2 if self.bidirectional else self.hidden_size,
                             dropout_p=0.2, use_attention=True,
                             bidirectional=self.bidirectional,
                             eos_id=self.tgt.eos_id, sos_id=self.tgt.sos_id)
        self.device = device
        self.seq2seq = Seq2seq(encoder, decoder).cuda()
        for param in self.seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

    def train(self, epoch=20, resume=False):
        # Run supervised training; keep the model returned by the trainer.
        t = SupervisedTrainer(loss=self.loss, batch_size=96,
                              checkpoint_every=1000, print_every=1000,
                              expt_dir=self.model_save_path)
        self.seq2seq = t.train(self.seq2seq, self.trainset,
                               num_epochs=epoch, dev_data=self.devset,
                               optimizer=self.optimizer,
                               teacher_forcing_ratio=0.5, resume=resume)
def train(opt):
    """Train a seq2tree parser from TSV data (continues past this view)."""
    LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT, level=getattr(logging, opt.log_level.upper()))
    logging.info(opt)
    if int(opt.GPU) >= 0:
        torch.cuda.set_device(int(opt.GPU))
    if opt.load_checkpoint is not None:
        # Resume path: reuse model and vocab saved with the checkpoint.
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2tree = checkpoint.model
        input_vocab = checkpoint.input_vocab
    else:
        # Prepare dataset
        src = SourceField()
        nt = NTField()
        pos = PosField()
        tgt_tree = TreeField()
        comp = CompField()
        max_len = opt.max_len

        def len_filter(example):
            # Only the source side is length-limited here.
            return len(example.src) <= max_len

        train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                              fields=[('src', src), ('nt', nt), ('pos', pos), ('tree', tgt_tree)],
                                              filter_pred=len_filter)
        dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                            fields=[('src', src), ('nt', nt), ('pos', pos), ('tree', tgt_tree)],
                                            filter_pred=len_filter)
        src.build_vocab(train, max_size=50000)
        comp.build_vocab(train, max_size=50000)
        nt.build_vocab(train, max_size=50000)
        pos.build_vocab(train, max_size=50000)
        # src_tree.build_vocab(train, max_size=50000)
        # Collect nonterminal ids that are also POS tags (> 1 skips specials).
        pos_in_nt = set()
        for Pos in pos.vocab.stoi:
            if nt.vocab.stoi[Pos] > 1:
                pos_in_nt.add(nt.vocab.stoi[Pos])
        hidden_size = opt.hidden_size
        input_vocab = src.vocab
        nt_vocab = nt.vocab

        def tree_to_id(tree):
            # Map labels/leaves to vocab ids in place; append an <eos> child
            # to every internal node.
            tree.set_label(nt_vocab.stoi[tree.label()])
            # BUGFIX: was `is not '('` — identity comparison against a string
            # literal (SyntaxWarning, implementation-dependent); use `!=`.
            if len(tree) == 1 and str(tree[0])[0] != '(':
                tree[0] = input_vocab.stoi[tree[0]]
                return
            else:
                for subtree in tree:
                    tree_to_id(subtree)
                tree.append(Tree(nt_vocab.stoi['<eos>'], []))
                return tree

        # train.examples = [str(tree_to_id(ex.tree)) for ex in train.examples]
        # dev.examples = [str(tree_to_id(ex.tree)) for ex in dev.examples]
        for ex in train.examples:
            ex.tree = str(tree_to_id(Tree.fromstring(ex.tree)))
for ex in dev.examples: ex.tree = str(tree_to_id(Tree.fromstring(ex.tree))) # train.examples = [tree_to_id(Tree.fromstring(ex.tree)) for ex in train.examples] # dev.examples = [str(tree_to_id(Tree.fromstring(ex.tree))) for ex in dev.examples] if opt.word_embedding is not None: input_vocab.load_vectors([opt.word_embedding]) loss = NLLLoss() if torch.cuda.is_available(): loss.cuda() loss.reset() seq2tree = None optimizer = None if not opt.resume: # Initialize model bidirectional = opt.bidirectional_encoder encoder = EncoderRNN(len(src.vocab), opt.word_embedding_size, max_len, hidden_size, bidirectional=bidirectional, variable_lengths=True) decoder = DecoderTree(len(src.vocab), opt.word_embedding_size, opt.nt_embedding_size, len(nt.vocab), max_len, hidden_size * 2 if bidirectional else hidden_size, sos_id=nt_vocab.stoi['<sos>'], eos_id=nt_vocab.stoi['<eos>'], dropout_p=0.2, use_attention=True, bidirectional=bidirectional, pos_in_nt=pos_in_nt) seq2tree = Seq2tree(encoder, decoder) if torch.cuda.is_available(): seq2tree.cuda() for param in seq2tree.parameters(): param.data.uniform_(-0.08, 0.08) # encoder.embedding.weight.data.set_(input_vocab.vectors) # encoder.embedding.weight.data.set_(output_vocab.vectors) # Optimizer and learning rate scheduler can be customized by # explicitly constructing the objects and pass to the trainer. # # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5) # scheduler = StepLR(optimizer.optimizer, 1) # optimizer.set_scheduler(scheduler) optimizer = Optimizer(optim.Adam(seq2tree.parameters(), lr=opt.lr), max_grad_norm=5) # train t = SupervisedTrainer(loss=loss, batch_size=opt.batch_size, checkpoint_every=opt.checkpoint_every, print_every=10, expt_dir=opt.expt_dir, lr=opt.lr) seq2tree = t.train(seq2tree, train, num_epochs=opt.epoch, dev_data=dev, optimizer=optimizer, teacher_forcing_ratio=0, resume=opt.resume) predictor = Predictor(seq2tree, input_vocab, nt_vocab) return predictor, dev, train
def run_training(opt, default_data_dir, num_epochs=100):
    # Train (or resume) a seq2seq model from JSON-formatted data.
    # (Function continues past this view.)
    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2seq = checkpoint.model
        input_vocab = checkpoint.input_vocab
        output_vocab = checkpoint.output_vocab
    else:
        # Prepare dataset
        src = SourceField()
        tgt = TargetField()
        max_len = 50
        data_file = os.path.join(default_data_dir, opt.train_path, 'data.txt')
        logging.info("Starting new Training session on %s", data_file)

        def len_filter(example):
            # Keep non-empty pairs that fit within max_len tokens.
            return (len(example.src) <= max_len) and (len(example.tgt) <= max_len) \
                and (len(example.src) > 0) and (len(example.tgt) > 0)

        train = torchtext.data.TabularDataset(
            path=data_file, format='json',
            fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
            filter_pred=len_filter
        )
        dev = None
        if opt.no_dev is False:
            dev_data_file = os.path.join(default_data_dir, opt.train_path, 'dev-data.txt')
            dev = torchtext.data.TabularDataset(
                path=dev_data_file, format='json',
                fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
                filter_pred=len_filter
            )
        src.build_vocab(train, max_size=50000)
        tgt.build_vocab(train, max_size=50000)
        input_vocab = src.vocab
        output_vocab = tgt.vocab
        # NOTE: If the source field name and the target field name
        # are different from 'src' and 'tgt' respectively, they have
        # to be set explicitly before any training or inference
        # seq2seq.src_field_name = 'src'
        # seq2seq.tgt_field_name = 'tgt'
        # Prepare loss
        weight = torch.ones(len(tgt.vocab))
        pad = tgt.vocab.stoi[tgt.pad_token]
        loss = Perplexity(weight, pad)
        if torch.cuda.is_available():
            logging.info("Yayyy We got CUDA!!!")
            loss.cuda()
        else:
            logging.info("No cuda available device found running on cpu")
    # NOTE(review): this unconditionally discards the model loaded from the
    # checkpoint above — confirm resume is handled inside the trainer.
    seq2seq = None
    optimizer = None
    if not opt.resume:
        hidden_size = 128
        decoder_hidden_size = hidden_size * 2
# Build the model, train it, then drop into an interactive beam-search loop.
logging.info("EncoderRNN Hidden Size: %s", hidden_size)
logging.info("DecoderRNN Hidden Size: %s", decoder_hidden_size)
bidirectional = True
encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                     bidirectional=bidirectional, rnn_cell='lstm',
                     variable_lengths=True)
decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size,
                     dropout_p=0, use_attention=True,
                     bidirectional=bidirectional, rnn_cell='lstm',
                     eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()
for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)
# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and pass to the trainer.
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
scheduler = StepLR(optimizer.optimizer, 1)
optimizer.set_scheduler(scheduler)
# train
batch_size = 32
checkpoint_every = num_epochs / 10
print_every = num_epochs / 100
properties = dict(batch_size=batch_size,
                  checkpoint_every=checkpoint_every,
                  print_every=print_every, expt_dir=opt.expt_dir,
                  num_epochs=num_epochs,
                  teacher_forcing_ratio=0.5,
                  resume=opt.resume)
logging.info("Starting training with the following Properties %s",
             json.dumps(properties, indent=2))
# BUGFIX: the trainer previously got batch_size=num_epochs, contradicting
# the `properties` logged above; pass the actual batch size.
t = SupervisedTrainer(loss=loss, batch_size=batch_size,
                      checkpoint_every=checkpoint_every,
                      print_every=print_every, expt_dir=opt.expt_dir)
seq2seq = t.train(seq2seq, train,
                  num_epochs=num_epochs, dev_data=dev,
                  optimizer=optimizer,
                  teacher_forcing_ratio=0.5,
                  resume=opt.resume)
evaluator = Evaluator(loss=loss, batch_size=batch_size)
if opt.no_dev is False:
    dev_loss, accuracy = evaluator.evaluate(seq2seq, dev)
    logging.info("Dev Loss: %s", dev_loss)
    # BUGFIX: previously logged dev_loss twice; report the accuracy value.
    logging.info("Accuracy: %s", accuracy)
# Wrap the trained decoder in a width-4 beam search for prediction.
beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4))
predictor = Predictor(beam_search, input_vocab, output_vocab)
while True:
    try:
        # NOTE(review): raw_input is Python 2 — use input() on Python 3.
        seq_str = raw_input("Type in a source sequence:")
        seq = seq_str.strip().split()
        results = predictor.predict_n(seq, n=3)
        for i, res in enumerate(results):
            # BUGFIX: print() does not take logging-style args; format the string.
            print('option %s: %s\n' % (i + 1, res))
    except KeyboardInterrupt:
        logging.info("Bye Bye")
        exit(0)
################################################################################# # prepare model if opt.load_checkpoint is not None: logging.info("loading checkpoint from {}".format( os.path.join(opt.output_dir, opt.load_checkpoint))) checkpoint_path = os.path.join(opt.output_dir, opt.load_checkpoint) checkpoint = Checkpoint.load(checkpoint_path) seq2seq = checkpoint.model input_vocab = checkpoint.input_vocab output_vocab = checkpoint.output_vocab src.vocab = input_vocab tgt.vocab = output_vocab else: # build vocabulary src.build_vocab(train, max_size=opt.src_vocab) tgt.build_vocab(train, max_size=opt.tgt_vocab) input_vocab = src.vocab output_vocab = tgt.vocab # Initialize model hidden_size = opt.hidden_size decoder_hidden_size = hidden_size * 2 if opt.bidirectional else hidden_size encoder = EncoderRNN(len(src.vocab), max_len, hidden_size, opt.embedding_size, bidirectional=opt.bidirectional, rnn_cell=opt.rnn_cell, variable_lengths=True) decoder = DecoderRNN(len(tgt.vocab),
def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(path=opt.train_path, format='tsv',
                                      fields=[('src', src), ('tgt', tgt)],
                                      filter_pred=len_filter)
dev = torchtext.data.TabularDataset(path=opt.dev_path, format='tsv',
                                    fields=[('src', src), ('tgt', tgt)],
                                    filter_pred=len_filter)
test = torchtext.data.TabularDataset(path=opt.test_path, format='tsv',
                                     fields=[('src', src), ('tgt', tgt)],
                                     filter_pred=len_filter)
# NOTE(review): vocab is built over train+dev+test — confirm the test-set
# leakage is intended here.
src.build_vocab(train, dev, test, max_size=50000)
tgt.build_vocab(train, dev, test, max_size=1000)
input_vocab = src.vocab
output_vocab = tgt.vocab
# Longest-first buckets; include_lengths on src yields (tensor, lengths).
batch_iterator = torchtext.data.BucketIterator(
    dataset=train, batch_size=8,
    sort_key=lambda x: -len(x.src),
    repeat=False)
for i, batch in enumerate(batch_iterator):
    input_variables, input_lengths = getattr(batch, 'src')
    target_variables = getattr(batch, 'tgt')
    print('Train inspection')
    for j, l in enumerate(input_lengths):
        # Slice off padding using the true length of each sequence.
        src_indices = input_variables[j][0:l].data.tolist()
# (fragment: `src`, `tgt`, `opt` and `myconf` are defined before this view.)
max_len = int(myconf.model.maxlen)

def len_filter(example):
    # Keep only pairs where both sides fit within max_len tokens.
    return len(example.src) <= max_len and len(example.tgt) <= max_len

train = torchtext.data.TabularDataset(
    path=opt.train_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
dev = torchtext.data.TabularDataset(
    path=opt.dev_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
# Build the corpus vocabulary and load pre-trained word embeddings;
# vocab.Vectors supports custom vector files.
vectors = vocab.Vectors(myconf.embedding.char2vec)
src.build_vocab(train, max_size=int(myconf.model.src_vocab_size))
tgt.build_vocab(train, max_size=int(myconf.model.tgt_vocab_size))
src.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
input_vocab = src.vocab
output_vocab = tgt.vocab
# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'
# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)