def create_model(sess, data, args, embed):
    with tf.variable_scope(args.name):
        model = LM(data, args, embed)
        model.print_parameters()
        latest_dir = '%s/checkpoint_latest' % args.model_dir
        best_dir = '%s/checkpoint_best' % args.model_dir
        if not os.path.isdir(args.model_dir):
            os.mkdir(args.model_dir)
        if not os.path.isdir(latest_dir):
            os.mkdir(latest_dir)
        if not os.path.isdir(best_dir):
            os.mkdir(best_dir)
        if tf.train.get_checkpoint_state(latest_dir, args.name) and args.restore == "last":
            print("Reading model parameters from %s" %
                  tf.train.latest_checkpoint(latest_dir, args.name))
            model.latest_saver.restore(
                sess, tf.train.latest_checkpoint(latest_dir, args.name))
        elif tf.train.get_checkpoint_state(best_dir, args.name) and args.restore == "best":
            print("Reading model parameters from %s" %
                  tf.train.latest_checkpoint(best_dir, args.name))
            model.best_saver.restore(
                sess, tf.train.latest_checkpoint(best_dir, args.name))
        else:
            print("Created model with fresh parameters.")
            global_variable = [gv for gv in tf.global_variables()
                               if args.name in gv.name]
            sess.run(tf.variables_initializer(global_variable))
        return model
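# The restore-or-initialize flow above can be exercised on its own. A minimal
# sketch, assuming TensorFlow 1.x; the scope name "toy" and the directory
# "ckpt_dir" are illustrative stand-ins, not part of the original code:
import tensorflow as tf

with tf.variable_scope("toy"):
    v = tf.get_variable("v", shape=[2], initializer=tf.zeros_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
    if tf.train.get_checkpoint_state("ckpt_dir"):
        saver.restore(sess, tf.train.latest_checkpoint("ckpt_dir"))
    else:
        # initialize only the variables under this scope, as create_model does
        scoped = [gv for gv in tf.global_variables() if "toy" in gv.name]
        sess.run(tf.variables_initializer(scoped))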
def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    test_file = config["test_file"] if len(args.test_file) == 0 else args.test_file
    test_dataset = dataset_cls(test_file,
                               vocab_dump=vocab_dump_path,
                               **(config.get("dataset_args", {})))
    config["vocab_size"] = len(test_dataset.vocab)
    model = LM(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)
    loss = model.test(batch_size=config["batch_size"], data_engine=test_dataset)
def main(args):
    print("Loading data")
    corpus = data.Corpus(args.data, max_vocab_size=args.max_vocab,
                         max_length=args.max_length)
    vocab_size = len(corpus.word2idx)
    print("\ttraining data size: ", corpus.train_data.size)
    print("\tvocabulary size: ", vocab_size)
    print("Constructing model")
    print(args)
    device = torch.device('cpu' if args.nocuda else 'cuda')
    model = LM(vocab_size, args.embed_size, args.hidden_size,
               args.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.wd)
    best_loss = None

    print("\nStart training")
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_ce, train_ppl = train(corpus.train_data, model, optimizer,
                                        epoch, device)
            valid_ce, valid_ppl = evaluate(corpus.valid_data, model, device)
            print('-' * 70)
            meta = "| epoch {:2d} | time {:5.2f}s ".format(
                epoch, time.time() - epoch_start_time)
            print(meta + "| train loss {:5.2f} | train ppl {:5.2f}".format(
                train_ce, train_ppl))
            print(len(meta) * ' ' + "| valid loss {:5.2f} "
                  "| valid ppl {:5.2f}".format(valid_ce, valid_ppl),
                  flush=True)
            if best_loss is None or valid_ce < best_loss:
                best_loss = valid_ce
                with open(get_savepath(args), 'wb') as f:
                    torch.save(model, f)
    except KeyboardInterrupt:
        print('-' * 70)
        print('Exiting from training early')

    with open(get_savepath(args), 'rb') as f:
        model = torch.load(f)
    test_ce, test_ppl = evaluate(corpus.test_data, model, device)
    print('=' * 70)
    print("| End of training | test loss {:5.2f} | test ppl {:5.2f}".format(
        test_ce, test_ppl))
    print('=' * 70)
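# The ppl values printed above are just the exponentiated mean cross-entropy.
# A minimal, self-contained illustration (the toy logits and targets are made
# up; this is not the evaluate() used above):
import math
import torch
import torch.nn.functional as F

toy_logits = torch.randn(8, 100)               # 8 positions, vocab of 100
toy_targets = torch.randint(0, 100, (8,))      # gold next-token ids
ce = F.cross_entropy(toy_logits, toy_targets)  # mean negative log-likelihood
ppl = math.exp(ce.item())                      # perplexity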
def train():
    # generate file paths
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    savedir = os.path.join(config.SAVE_DIR, "hierarchical_bottom", f"{now}")
    os.makedirs(savedir, exist_ok=True)
    checkpointfn = os.path.join(savedir, "checkpoint.model")
    logfn = os.path.join(savedir, "run.log")

    torch.manual_seed(config.SEED)

    # create logger
    logger = logging.getLogger()
    if logger.hasHandlers():
        logger.handlers.clear()
    logger.setLevel(logging.DEBUG)
    fmt = logging.Formatter("%(asctime)s %(levelname)-8s: %(message)s")
    console = logging.StreamHandler()
    console.setFormatter(fmt)
    logger.addHandler(console)
    logfile = logging.FileHandler(logfn, "a")
    logfile.setFormatter(fmt)
    logfile.setLevel(logging.DEBUG)
    logger.addHandler(logfile)

    with open("config.py", "r") as f:
        for l in f:
            logging.debug(l.strip())

    dataset = BrownDataset(config.CONTEXT_SIZE)
    model = LM()
    num_params = sum(p.numel() for p in model.parameters())
    logging.debug(f"The model has {num_params:,} parameters")

    # Trainer init
    logging.debug("Initiate the training environment")
    trainer = LMTrainer(model, dataset, checkpointfn)

    # Training
    logging.debug("Starting the training")
    for epoch in tqdm.tqdm(range(config.EPOCHS), total=config.EPOCHS,
                           desc="EPOCH"):
        trainer.run_epoch()
        if trainer.patience > config.PATIENCE:
            logging.info("patience over {}, exiting".format(config.PATIENCE))
            break
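# The PATIENCE check above is a standard early-stopping counter. A minimal
# sketch of the bookkeeping LMTrainer presumably does internally (the
# variable names here are hypothetical, not taken from the original code):
best_val = float("inf")
patience = 0
for val in [2.1, 1.9, 1.95, 1.97, 1.99]:  # toy validation losses
    if val < best_val:
        best_val = val
        patience = 0   # an improvement resets the counter
    else:
        patience += 1  # another epoch without improvement
    if patience > 2:   # stand-in for config.PATIENCE
        break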
def gen_sent_on_topic(idxvocab, vocabxid, start_symbol, end_symbol, cf):
    output = codecs.open(args.gen_sent_on_topic, "w", "utf-8")
    topics, entropy = tm.get_topics(sess, topn=topn)
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1,
                  num_steps=1, config=cf, reuse_conv_variables=True)
    for t in range(cf.topic_number):
        output.write("\n" + "=" * 100 + "\n")
        output.write("Topic " + str(t) + ":\n")
        output.write(" ".join([idxvocab[item] for item in topics[t]]) + "\n\n")
        output.write("\nSentence generation (greedy; argmax):" + "\n")
        s = mgen.generate_on_topic(sess, t, vocabxid[start_symbol], 0,
                                   cf.lm_sent_len + 10, vocabxid[end_symbol])
        output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")
        for temp in gen_temps:
            output.write("\nSentence generation (random; temperature = " +
                         str(temp) + "):\n")
            for i in range(gen_num):
                s = mgen.generate_on_topic(sess, t, vocabxid[start_symbol],
                                           temp, cf.lm_sent_len + 10,
                                           vocabxid[end_symbol])
                output.write("[" + str(i) + "] " +
                             " ".join([idxvocab[item] for item in s]) + "\n")
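# generate_on_topic is called with temperature 0 for greedy decoding and with
# each value in gen_temps for random sampling. A minimal sketch of what
# temperature-scaled sampling over logits looks like (numpy stands in for the
# model internals; this is an assumption, not the original implementation):
import numpy as np

def sample_with_temperature(logits, temp):
    if temp == 0:                       # greedy: argmax
        return int(np.argmax(logits))
    scaled = np.asarray(logits) / temp  # higher temp -> flatter distribution
    probs = np.exp(scaled - np.max(scaled))
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))

sample_with_temperature([2.0, 1.0, 0.1], temp=0.75)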
def train(args):
    config = load_config(args.model_dir)
    train_dataset = LMDataset(config["train_file"],
                              vocab_file=config["vocab_file"])
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)
    valid_dataset = LMDataset(config["valid_file"],
                              vocab_dump=vocab_dump_path)
    config["vocab_size"] = len(train_dataset.vocab)
    model = LM(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(args.epoch))
        model.load_model(args.model_dir, args.epoch)
    model.train(epochs=config["train_epochs"],
                batch_size=config["batch_size"],
                data_engine=train_dataset,
                valid_data_engine=valid_dataset,
                train_decoder_epochs=config.get("train_decoder_epochs", 0),
                max_iter_per_epoch=config.get("max_iter_per_epoch", 100000))
def main(config):
    log_setp(logger, config)
    dataloader = DataLoader(config)
    word_map = dataloader.word_map
    print('number of tokens:', len(word_map))
    print(f'create model: input size {config.input_size}, '
          f'hidden size {config.hidden_size}, layer number {config.layer_num}')
    model = LM(len(word_map), config.input_size, config.hidden_size,
               config.layer_num)
    print(f'start training of {config.iter_num} iterations')
    train(model, dataloader, config)
def main(test_data_path):
    dic = pickle.load(open('vocab.pkl', 'rb'))
    word_vocab = dic['word_vocab']
    char_vocab = dic['char_vocab']
    max_len = dic['max_len']

    batch_size = config.batch_size
    embed_dim = config.embed_dim
    out_channels = config.out_channels
    kernels = config.kernels
    hidden_size = config.hidden_size
    learning_rate = config.learning_rate
    seq_len = config.seq_len

    test_data, _ = corpus_to_word(test_data_path, batch_size)
    test_idx = word_to_idx(test_data, word_vocab)
    test_idx = test_idx.contiguous().view(batch_size, -1)
    test_data = word_to_char(test_data, char_vocab, max_len)
    test_data = torch.from_numpy(test_data)
    test_data = test_data.contiguous().view(batch_size, -1, max_len)

    model = LM(word_vocab, char_vocab, max_len, embed_dim, out_channels,
               kernels, hidden_size)
    if torch.cuda.is_available():
        model.cuda()
    model.load_state_dict(torch.load('model.pkl'))
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', factor=0.5, patience=1, verbose=True)
    hidden_state = (Variable(torch.zeros(2, batch_size, hidden_size).cuda(),
                             volatile=False),
                    Variable(torch.zeros(2, batch_size, hidden_size).cuda(),
                             volatile=False))
    model.eval()
    test_loss = eval(seq_len, test_data, test_idx, model, hidden_state,
                     criterion)
    test_loss = np.exp(test_loss)
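# The contiguous().view(batch_size, -1) calls above fold one long token stream
# into batch_size parallel streams for chunked evaluation. A minimal sketch
# with toy token ids (trimming to a multiple of batch_size, which the data
# preparation presumably handles upstream):
import torch

stream = torch.arange(23)                       # toy corpus of 23 token ids
bsz = 4
n = (len(stream) // bsz) * bsz                  # drop the ragged tail
folded = stream[:n].contiguous().view(bsz, -1)  # shape [4, 5]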
def gen_sent_on_doc(docs, tags, idxvocab, vocabxid, start_symbol, end_symbol,
                    cf):
    topics, _ = tm.get_topics(sess, topn=topn)
    topics = [" ".join([idxvocab[w] for w in t]) for t in topics]
    doc_text = [item.replace("\t", "\n") for item in
                codecs.open(args.input_doc, "r", "utf-8").readlines()]
    output = codecs.open(args.gen_sent_on_doc, "w", "utf-8")
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1,
                  num_steps=1, config=cf, reuse_conv_variables=True)
    for d in range(len(docs)):
        output.write("\n" + "=" * 100 + "\n")
        output.write("Doc " + str(d) + ":\n")
        output.write(doc_text[d])
        doc, _, _, t, _ = get_batch_doc(docs, None, tags, d, cf.doc_len,
                                        cf.tag_len, 1, vocabxid[pad_symbol])
        best_topics, best_words = mgen.get_topics_on_doc(sess, doc, t, topn)
        output.write("\nRepresentative topics:\n")
        output.write("\n".join([("[%.3f] %s: %s" %
                                 (item[1], str(item[0]).zfill(3),
                                  topics[item[0]]))
                                for item in best_topics]) + "\n")
        output.write("\nRepresentative words:\n")
        output.write("\n".join([("[%.3f] %s" % (item[1], idxvocab[item[0]]))
                                for item in best_words]) + "\n")
        output.write("\nSentence generation (greedy; argmax):" + "\n")
        s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], 0,
                                 cf.lm_sent_len + 10, vocabxid[end_symbol])
        output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")
        for temp in gen_temps:
            output.write("\nSentence generation (random; temperature = " +
                         str(temp) + "):\n")
            for i in range(gen_num):
                s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol],
                                         temp, cf.lm_sent_len + 10,
                                         vocabxid[end_symbol])
                output.write("[" + str(i) + "] " +
                             " ".join([idxvocab[item] for item in s]) + "\n")
def build_model(self, load_model=False):
    self.model = cc(E2E(input_dim=self.config['input_dim'],
                        enc_hidden_dim=self.config['enc_hidden_dim'],
                        enc_n_layers=self.config['enc_n_layers'],
                        subsample=self.config['subsample'],
                        dropout_rate=self.config['dropout_rate'],
                        dec_hidden_dim=self.config['dec_hidden_dim'],
                        att_dim=self.config['att_dim'],
                        conv_channels=self.config['conv_channels'],
                        conv_kernel_size=self.config['conv_kernel_size'],
                        att_odim=self.config['att_odim'],
                        output_dim=len(self.vocab),
                        embedding_dim=self.config['embedding_dim'],
                        ls_weight=self.config['ls_weight'],
                        labeldist=self.labeldist,
                        pad=self.vocab['<PAD>'],
                        bos=self.vocab['<BOS>'],
                        eos=self.vocab['<EOS>']))
    print(self.model)
    self.gen_opt = torch.optim.Adam(self.model.parameters(),
                                    lr=self.config['learning_rate'],
                                    weight_decay=self.config['weight_decay'],
                                    amsgrad=True)
    if load_model:
        self.load_model(self.config['load_model_path'],
                        self.config['load_optimizer'])
    print(self.gen_opt)
    self.judge = cc(LM(output_dim=len(self.vocab),
                       embedding_dim=self.config['dis_embedding_dim'],
                       hidden_dim=self.config['dis_hidden_dim'],
                       dropout_rate=self.config['dis_dropout_rate'],
                       n_layers=self.config['dis_layers'],
                       bos=self.vocab['<BOS>'],
                       eos=self.vocab['<EOS>'],
                       pad=self.vocab['<PAD>'],
                       ls_weight=self.config['ls_weight'],
                       labeldist=self.unlab_labeldist))
    print(self.judge)
    self.dis_opt = torch.optim.Adam(
        filter(lambda p: p.requires_grad, self.judge.parameters()),
        lr=self.config['d_learning_rate'])
    return
def try_params(n_iterations, params):
    n_iterations = int(n_iterations)
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_vocab = dill.load(open('src_vocab.pickle', 'rb'))
    trg_vocab = dill.load(open('trg_vocab.pickle', 'rb'))
    src_dev = dill.load(open('src_dev.pickle', 'rb'))
    trg_dev = dill.load(open('trg_dev.pickle', 'rb'))
    batched_dev_src, batched_dev_src_mask, sort_index = \
        utils.tensor.advanced_batchize(src_dev, options.batch_size,
                                       src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = \
        utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size,
                                               trg_vocab.stoi["<blank>"],
                                               sort_index)

    batches = []
    if options.contain_bilingual:
        print('Load')
        src_train = dill.load(open('src_sents1.pickle', 'rb'))
        print('Load src sents 1')
        trg_train = dill.load(open('trg_sents1.pickle', 'rb'))
        print('Load trg sents 1')
        batched_train_src1, batched_train_src_mask1, sort_index = \
            utils.tensor.advanced_batchize(src_train, options.batch_size,
                                           src_vocab.stoi["<blank>"])
        batched_train_trg1, batched_train_trg_mask1 = \
            utils.tensor.advanced_batchize_no_sort(trg_train,
                                                   options.batch_size,
                                                   trg_vocab.stoi["<blank>"],
                                                   sort_index)
        batches = batches + [(1, i) for i in range(len(batched_train_src1))]
        if options.mono_loss:
            batches = batches + [(4, i) for i in range(len(batched_train_src1))]
            batches = batches + [(5, i) for i in range(len(batched_train_src1))]
    if options.contain_trg:
        print('Load')
        # src_train = dill.load(open('src_sents2.pickle', 'rb'))
        # print('Load src sents 2')
        trg_train = dill.load(open('trg_sents2.pickle', 'rb'))
        print('Load trg sents 2')
        # batched_train_src2, batched_train_src_mask2, sort_index = utils.tensor.advanced_batchize(src_train, options.batch_size, src_vocab.stoi["<blank>"])
        batched_train_trg2, batched_train_trg_mask2 = \
            utils.tensor.advanced_batchize_no_sort(trg_train,
                                                   options.batch_size,
                                                   trg_vocab.stoi["<blank>"],
                                                   sort_index)
        batches = batches + [(2, i) for i in range(len(batched_train_trg2))]
    if options.contain_src:
        print('Load')
        src_train = dill.load(open('src_sents3.pickle', 'rb'))
        print('Load src sents 3')
        # trg_train = dill.load(open('trg_sents3.pickle', 'rb'))
        # print('Load trg sents 3')
        batched_train_src3, batched_train_src_mask3, sort_index = \
            utils.tensor.advanced_batchize(src_train, options.batch_size,
                                           src_vocab.stoi["<blank>"])
        # batched_train_trg3, batched_train_trg_mask3 = utils.tensor.advanced_batchize_no_sort(trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
        batches = batches + [(3, i) for i in range(len(batched_train_src3))]

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    if os.path.isfile(options.load_file_src) and os.path.isfile(options.load_file_trg):
        src_lm = torch.load(open(options.load_file_src, 'rb'))
        trg_lm = torch.load(open(options.load_file_trg, 'rb'))
    else:
        src_lm = LM(src_vocab_size, src_vocab.stoi['<s>'],
                    src_vocab.stoi['</s>'], params['embedding_size'],
                    params['hidden_size'], params['dropout'], use_cuda)
        trg_lm = LM(trg_vocab_size, trg_vocab.stoi['<s>'],
                    trg_vocab.stoi['</s>'], params['embedding_size'],
                    params['hidden_size'], params['dropout'], use_cuda)
    if use_cuda > 0:
        src_lm.cuda()
        trg_lm.cuda()
    else:
        src_lm.cpu()
        trg_lm.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer_src = eval("torch.optim." + options.optimizer)(
        src_lm.parameters(), params['learning_rate'])
    optimizer_trg = eval("torch.optim." + options.optimizer)(
        trg_lm.parameters(), params['learning_rate'])

    # main training loop
    # last_dev_avg_loss = float("inf")
    for epoch_i in range(n_iterations):
        print(epoch_i)
        logging.info("At {0}-th epoch.".format(epoch_i))
        shuffle(batches)
        src_lm.train()
        trg_lm.train()
        for i, (index, batch_i) in enumerate(batches):
            train_src_batch = None
            train_src_mask = None
            train_trg_batch = None
            train_trg_mask = None
            if index == 1:
                train_src_batch = Variable(batched_train_src1[batch_i])
                train_src_mask = Variable(batched_train_src_mask1[batch_i])
                train_trg_batch = Variable(batched_train_trg1[batch_i])
                train_trg_mask = Variable(batched_train_trg_mask1[batch_i])
                if use_cuda:
                    train_src_batch = train_src_batch.cuda()
                    train_trg_batch = train_trg_batch.cuda()
                    train_src_mask = train_src_mask.cuda()
                    train_trg_mask = train_trg_mask.cuda()
            elif index == 2:
                train_trg_batch = Variable(batched_train_trg2[batch_i])
                train_trg_mask = Variable(batched_train_trg_mask2[batch_i])
                if use_cuda:
                    train_trg_batch = train_trg_batch.cuda()
                    train_trg_mask = train_trg_mask.cuda()
            elif index == 3:
                train_src_batch = Variable(batched_train_src3[batch_i])
                train_src_mask = Variable(batched_train_src_mask3[batch_i])
                if use_cuda:
                    train_src_batch = train_src_batch.cuda()
                    train_src_mask = train_src_mask.cuda()
            elif index == 4:
                train_src_batch = Variable(batched_train_src1[batch_i])
                train_src_mask = Variable(batched_train_src_mask1[batch_i])
                if use_cuda:
                    train_src_batch = train_src_batch.cuda()
                    train_src_mask = train_src_mask.cuda()
            elif index == 5:
                train_trg_batch = Variable(batched_train_trg1[batch_i])
                train_trg_mask = Variable(batched_train_trg_mask1[batch_i])
                if use_cuda:
                    train_trg_batch = train_trg_batch.cuda()
                    train_trg_mask = train_trg_mask.cuda()
            else:
                raise ValueError()

            total_loss = 0
            if index == 1:
                optimizer_trg.zero_grad()
                optimizer_src.zero_grad()
                h_src, c_src = src_lm(sent=train_src_batch)
                use_teacher_forcing = random.random() < params['teacher_forcing_ratio']
                sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                       tgt_sent=train_trg_batch,
                                       teacher_forcing=use_teacher_forcing)
                train_trg_mask_tmp = train_trg_mask.view(-1)
                train_trg_batch_tmp = train_trg_batch.view(-1)
                train_trg_batch_tmp = train_trg_batch_tmp.masked_select(
                    train_trg_mask_tmp)
                train_trg_mask_tmp = train_trg_mask_tmp.unsqueeze(1).expand(
                    len(train_trg_mask_tmp), trg_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(
                    train_trg_mask_tmp).view(-1, trg_vocab_size)
                loss = criterion(sys_out_batch, train_trg_batch_tmp)
                loss.backward()
                optimizer_src.step()
                optimizer_trg.step()
                if i % 100 == 0:
                    logging.debug("loss at batch {0}: {1}".format(
                        i, loss.data[0]))
            elif options.mono_loss and train_src_batch is not None:
                optimizer_trg.zero_grad()
                optimizer_src.zero_grad()
                h_src, c_src = src_lm(sent=train_src_batch)
                use_teacher_forcing = random.random() < params['teacher_forcing_ratio']
                sys_out_batch = src_lm(h=h_src, c=c_src, encode=False,
                                       tgt_sent=train_src_batch,
                                       teacher_forcing=use_teacher_forcing)
                train_src_mask_tmp = train_src_mask.view(-1)
                train_src_batch_tmp = train_src_batch.view(-1)
                train_src_batch_tmp = train_src_batch_tmp.masked_select(
                    train_src_mask_tmp)
                train_src_mask_tmp = train_src_mask_tmp.unsqueeze(1).expand(
                    len(train_src_mask_tmp), src_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, src_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(
                    train_src_mask_tmp).view(-1, src_vocab_size)
                loss = criterion(sys_out_batch, train_src_batch_tmp)
                loss *= params['mono_loss_multi'] * (1.0 / 10 * epoch_i)
                loss.backward()
                optimizer_src.step()
                optimizer_trg.step()
                if i % 100 == 0:
                    logging.debug("loss at batch {0}: {1}".format(
                        i, loss.data[0]))
            elif train_trg_batch is not None and options.mono_loss:
                optimizer_trg.zero_grad()
                optimizer_src.zero_grad()
                h_trg, c_trg = trg_lm(sent=train_trg_batch)
                use_teacher_forcing = random.random() < params['teacher_forcing_ratio']
                sys_out_batch = trg_lm(h=h_trg, c=c_trg, encode=False,
                                       tgt_sent=train_trg_batch,
                                       teacher_forcing=use_teacher_forcing)
                train_trg_mask_tmp = train_trg_mask.view(-1)
                train_trg_batch_tmp = train_trg_batch.view(-1)
                train_trg_batch_tmp = train_trg_batch_tmp.masked_select(
                    train_trg_mask_tmp)
                train_trg_mask_tmp = train_trg_mask_tmp.unsqueeze(1).expand(
                    len(train_trg_mask_tmp), trg_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(
                    train_trg_mask_tmp).view(-1, trg_vocab_size)
                loss = criterion(sys_out_batch, train_trg_batch_tmp)
                loss *= params['mono_loss_multi'] * (1.0 / 10 * epoch_i)
                loss.backward()
                optimizer_src.step()
                optimizer_trg.step()
                if i % 100 == 0:
                    logging.debug("loss at batch {0}: {1}".format(
                        i, loss.data[0]))

        # validation -- this is a crude estimation because there might be
        # some paddings at the end
        dev_loss = 0.0
        src_lm.eval()
        trg_lm.eval()
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i],
                                    volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i],
                                    volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()
            h_src, c_src = src_lm(sent=dev_src_batch)
            sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                   tgt_sent=dev_trg_batch)
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(
                batch_i, loss.data[0]))
            dev_loss += loss
        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of "
                     "epoch {1}".format(dev_avg_loss.data[0], epoch_i))
        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break
        # torch.save(src_lm, open(options.model_file_src + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill)
        # torch.save(trg_lm, open(options.model_file_trg + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill)
        # last_dev_avg_loss = dev_avg_loss

    return {'loss': dev_avg_loss.data[0]}
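# try_params above (and the similar main function further below) masks out
# padding positions before computing NLLLoss. A minimal, self-contained sketch
# of that masked_select pattern with toy sizes (a modern boolean mask replaces
# the old ByteTensor Variables):
import torch
import torch.nn.functional as F

vocab_size, seq_len = 7, 5
log_probs = F.log_softmax(torch.randn(seq_len, vocab_size), dim=-1)
targets = torch.randint(0, vocab_size, (seq_len,))
mask = torch.tensor([1, 1, 1, 0, 0], dtype=torch.bool)  # last 2 are padding

flat_targets = targets.masked_select(mask)
flat_log_probs = log_probs.masked_select(
    mask.unsqueeze(1).expand(seq_len, vocab_size)).view(-1, vocab_size)
loss = F.nll_loss(flat_log_probs, flat_targets)  # loss over real tokens only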
# -*- coding: UTF-8 -*-
from model import LM
import numpy as np
import torch
from torch.autograd import Variable
from plot import make_dot

if __name__ == '__main__':
    x = Variable(torch.randn(32, 42))
    a = LM(42)
    y = a(x)
    g = make_dot(y)
    # g.view()
    g.render('here', view=False)
              num_steps=3, num_classes=num_classes, cf=cf)
tm_valid = TM(is_training=False, vocab_size=len(idxvocab),
              batch_size=cf.batch_size, num_steps=3,
              num_classes=num_classes, cf=cf)
tm_train.conv_word_embedding = torch.from_numpy(
    init_embedding(wordvec, idxvocab))
lm_train = LM(is_training=True, vocab_size=len(idxvocab),
              batch_size=cf.batch_size, num_steps=3,
              num_classes=num_classes, cf=cf)
# the validation LM runs in inference mode, mirroring tm_valid above
lm_valid = LM(is_training=False, vocab_size=len(idxvocab),
              batch_size=cf.batch_size, num_steps=3,
              num_classes=num_classes, cf=cf)
lm_train.lstm_word_embedding = torch.from_numpy(
    init_embedding(wordvec, idxvocab))
for i in range(cf.epoch_size):
    print("hello i am here2")
    run_epoch(train_sents, train_docs, None, None, (tm_train, None), True)
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_vocab = dill.load(open('src_vocab.pickle', 'rb'))
    trg_vocab = dill.load(open('trg_vocab.pickle', 'rb'))
    src_dev = dill.load(open('src_dev.pickle', 'rb'))
    trg_dev = dill.load(open('trg_dev.pickle', 'rb'))
    batched_dev_src, batched_dev_src_mask, sort_index = \
        utils.tensor.advanced_batchize(src_dev, options.batch_size,
                                       src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = \
        utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size,
                                               trg_vocab.stoi["<blank>"],
                                               sort_index)

    # batches = []
    # if options.contain_bilingual:
    print('Load')
    src_train = dill.load(open('src_sents1.pickle', 'rb'))
    print('Load src sents 1')
    trg_train = dill.load(open('trg_sents1.pickle', 'rb'))
    print('Load trg sents 1')
    src_train = src_train + dill.load(open('src_sents2.pickle', 'rb'))
    print('Load src sents 2')
    trg_train = trg_train + dill.load(open('trg_sents2.pickle', 'rb'))
    print('Load trg sents 2')
    src_train = src_train + dill.load(open('src_sents3.pickle', 'rb'))
    print('Load src sents 3')
    trg_train = trg_train + dill.load(open('trg_sents3.pickle', 'rb'))
    print('Load trg sents 3')
    batched_train_src, batched_train_src_mask, sort_index = \
        utils.tensor.advanced_batchize(src_train, options.batch_size,
                                       src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = \
        utils.tensor.advanced_batchize_no_sort(trg_train, options.batch_size,
                                               trg_vocab.stoi["<blank>"],
                                               sort_index)

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    if os.path.isfile(options.load_file_src) and os.path.isfile(options.load_file_trg):
        src_lm = torch.load(open(options.load_file_src, 'rb'))
        trg_lm = torch.load(open(options.load_file_trg, 'rb'))
    else:
        src_lm = LM(src_vocab_size, src_vocab.stoi['<s>'],
                    src_vocab.stoi['</s>'], options.embedding_size,
                    options.hidden_size, options.dropout, use_cuda)
        trg_lm = LM(trg_vocab_size, trg_vocab.stoi['<s>'],
                    trg_vocab.stoi['</s>'], options.embedding_size,
                    options.hidden_size, options.dropout, use_cuda)
    if use_cuda > 0:
        src_lm.cuda()
        trg_lm.cuda()
    else:
        src_lm.cpu()
        trg_lm.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer_src = eval("torch.optim." + options.optimizer)(
        src_lm.parameters(), options.learning_rate)
    optimizer_trg = eval("torch.optim." + options.optimizer)(
        trg_lm.parameters(), options.learning_rate)

    # main training loop
    # last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        print(epoch_i)
        logging.info("At {0}-th epoch.".format(epoch_i))
        src_lm.train()
        trg_lm.train()
        for i, batch_i in enumerate(range(len(batched_train_src))):
            optimizer_trg.zero_grad()
            optimizer_src.zero_grad()
            train_src_batch = Variable(batched_train_src[batch_i])
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_batch = Variable(batched_train_trg[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()
            h_src, c_src = src_lm(sent=train_src_batch)
            use_teacher_forcing = random.random() < options.teacher_forcing_ratio
            sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                   tgt_sent=train_trg_batch,
                                   teacher_forcing=use_teacher_forcing)
            train_trg_mask_tmp = train_trg_mask.view(-1)
            train_trg_batch_tmp = train_trg_batch.view(-1)
            train_trg_batch_tmp = train_trg_batch_tmp.masked_select(
                train_trg_mask_tmp)
            train_trg_mask_tmp = train_trg_mask_tmp.unsqueeze(1).expand(
                len(train_trg_mask_tmp), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(
                train_trg_mask_tmp).view(-1, trg_vocab_size)
            loss = criterion(sys_out_batch, train_trg_batch_tmp)
            loss.backward()
            optimizer_src.step()
            optimizer_trg.step()
            if i % 100 == 0:
                logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

        # validation -- this is a crude estimation because there might be
        # some paddings at the end
        dev_loss = 0.0
        src_lm.eval()
        trg_lm.eval()
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i],
                                    volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i],
                                    volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()
            h_src, c_src = src_lm(sent=dev_src_batch)
            sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                   tgt_sent=dev_trg_batch)
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(
                batch_i, loss.data[0]))
            dev_loss += loss
        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of "
                     "epoch {1}".format(dev_avg_loss.data[0], epoch_i))
        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break
        torch.save(src_lm,
                   open(options.model_file_src + ".nll_{0:.2f}.epoch_{1}".format(
                       dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        torch.save(trg_lm,
                   open(options.model_file_trg + ".nll_{0:.2f}.epoch_{1}".format(
                       dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
          (val_loss / count, np.exp(val_loss / count)))
    return val_loss / count

best_ppl = 10000
final_ppl = 100
num_trial = 20
for trial in range(num_trial):
    pivot = 100000
    model = LM(word_vocab, char_vocab, max_len, embed_dim, out_channels,
               kernels, hidden_size, batch_size)
    if torch.cuda.is_available():
        model.cuda()
    learning_rate = 1.0
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', factor=0.5, patience=1, verbose=True)
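# ReduceLROnPlateau, as constructed above, halves the learning rate once the
# monitored metric stops improving for `patience` epochs. A minimal sketch of
# how it is driven by validation loss (toy model and losses, not the LM above):
import torch

toy_model = torch.nn.Linear(4, 4)
toy_opt = torch.optim.SGD(toy_model.parameters(), lr=1.0)
toy_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    toy_opt, 'min', factor=0.5, patience=1)
for v in [3.0, 2.9, 2.95, 2.96, 2.97]:  # toy validation losses
    toy_sched.step(v)  # LR halves once the plateau outlasts patience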
def train_(self):
    cur_best = 10000
    model = LM(self.unique_words, self.char_vocab, self.max_len,
               self.embed_dim, self.channels, self.kernels, self.hidden_size)
    if torch.cuda.is_available():
        model.cuda()
    learning_rate = self.learning_rate
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    for epoch in range(self.epochs):
        model.train(True)
        # fresh (h, c) state at the start of every epoch
        hidden_state = [torch.zeros(2, self.batch_size,
                                    self.hidden_size).cuda()] * 2
        for i in range(0, self.train.size(1) - self.seq_len, self.seq_len):
            model.zero_grad()
            inputs = self.train[:, i:i + self.seq_len, :].cuda()  # 20 * 35 * 21
            targets = self.train_idx[:, (i + 1):(i + 1) + self.seq_len].cuda()  # 20 * 35
            # detach the hidden state so gradients stop at the chunk boundary
            hidden_state = [state.detach() for state in hidden_state]
            output, hidden_state = model(inputs, hidden_state)
            loss = criterion(output, targets.view(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)  # gradient clipping
            optimizer.step()

            step = (i + 1) // self.seq_len
            if step % 100 == 0:
                print('Epoch %d/%d, Batch x Seq_Len %d/%d, Loss: %.3f, '
                      'Perplexity: %5.2f' %
                      (epoch, self.epochs, step,
                       self.num_batches // self.seq_len,
                       loss.item(), np.exp(loss.item())))

        model.eval()
        val_loss = self._validate(self.seq_len, self.valid, self.valid_idx,
                                  model, hidden_state, criterion)
        val_perplex = np.exp(val_loss)
        if cur_best - val_perplex < 1:  # too little improvement: decay the LR
            if learning_rate > 0.03:
                learning_rate = learning_rate * 0.5
                print("Adjusted learning_rate : %.5f" % learning_rate)
                optimizer = torch.optim.SGD(model.parameters(),
                                            lr=learning_rate)
        if val_perplex < cur_best:
            print("The current best val loss: ", val_loss)
            cur_best = val_perplex
            torch.save(model.state_dict(), 'model.pkl')
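# The detach() loop and clip_grad_norm_ call in train_ are the two standard
# pieces of truncated backpropagation through time. A minimal, self-contained
# sketch with a toy LSTM (CPU only, unlike the .cuda() calls above):
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=3, hidden_size=5, batch_first=True)
h = (torch.zeros(1, 2, 5), torch.zeros(1, 2, 5))
for _ in range(4):                    # four truncated chunks
    x = torch.randn(2, 6, 3)          # [batch, seq_len, features]
    h = tuple(s.detach() for s in h)  # cut the graph at the chunk boundary
    out, h = lstm(x, h)
    loss = out.pow(2).mean()          # toy loss
    lstm.zero_grad()
    loss.backward()                   # only spans the current chunk
    nn.utils.clip_grad_norm_(lstm.parameters(), 5)  # guard against blow-ups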
vocabxid = dict([(y, x) for x, y in enumerate(idxvocab)])

# input_doc plays the same role as the corpus in training
sents, docs, docids, stats = gen_data(vocabxid, cf.dummy_symbols, tm_ignore,
                                      input_doc)
print("Vocab size =", len(idxvocab))

labels = None
tags = None

with tf.Graph().as_default(), tf.Session() as sess:
    initializer = tf.contrib.layers.xavier_initializer(seed=cf.seed)
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        tm = (TM(is_training=False, vocab_size=len(idxvocab),
                 batch_size=cf.batch_size, num_steps=cf.tm_sent_len,
                 num_classes=cf.num_classes, config=cf)
              if cf.topic_number > 0 else None)
        lm = (LM(is_training=False, vocab_size=len(idxvocab),
                 batch_size=cf.batch_size, num_steps=cf.lm_sent_len,
                 config=cf, reuse_conv_variables=True)
              if cf.rnn_hidden_size > 0 else None)

    # load tensorflow model
    saver = tf.train.Saver()
    saver.restore(sess, os.path.join(args.model_dir, "model.ckpt"))

    # compute topic distribution of input documents
    if args.output_topic_dist:
        if args.input_doc is None:
            sys.stderr.write("Error: --output_topic_dist option requires "
                             "--input_doc\n")
            raise SystemExit
        compute_dt_dist(docs[0], labels, tags, tm, cf.doc_len, cf.batch_size,
                        vocabxid[pad_symbol], idxvocab,
                        args.output_topic_dist)

    # print topics
        model.zero_grad()
        h = [state.detach() for state in h]
        output, h = model(val_inputs, h)
        loss = criterion(output, val_targets.view(-1))
        val_loss += loss.data[0]
        count += 1
    print('Test Loss: %.3f, Perplexity: %5.2f' %
          (val_loss / count, np.exp(val_loss / count)))
    return val_loss / count

model = LM(word_vocab, char_vocab, max_len, embed_dim, out_channels, kernels,
           hidden_size, batch_size)
if torch.cuda.is_available():
    model.cuda()
model.load_state_dict(torch.load('model.pkl'))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                            weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                       factor=0.5,
                                                       patience=1,