def onSubmit():
    bidef = model.BiDAF(400, 250, 20)
    file_name = f'../model/bidaf250_3{str(var.get())}.h5'
    bidef.load_bidaf(file_name)

    global answer_level
    global button_forward
    global button_back

    answer_level.grid_forget()
    submit_str.set('Predicting....')

    context = context_area.get("1.0", END)
    question = question_area.get("1.0", END)
    if len(context) <= 1:
        messagebox.showwarning('Question Answering App', 'Context Field is Empty!')
        submit_str.set('Predict Answer')
        return
    if len(question) <= 1:
        messagebox.showwarning('Question Answering App', 'Question Field is Empty!')
        submit_str.set('Predict Answer')
        return

    process = Preprocess(context, question)
    c, q = process.processForModel()
    p1, p2 = bidef.predict(c, q)
    context = process.preprocess(context)
    answers = PostProcess(context, p1, p2).postProcess()

    answer_list[0] = answers
    answer_level = get_level(root, answers[0], 6, 0, 3, 14)
    print(answers)
    submit_str.set('Predict Answer')
    status = get_level(root, f'Answer 1 of {len(answers)}', 7, 0, 3, 14)

    button_forward = Button(root, text=">>", command=lambda: forward(2))
    button_back = Button(root, text="<<", command=back, state=DISABLED)
    button_back.grid(row=8, column=0, padx=5, pady=10)
    button_forward.grid(row=8, column=2, padx=5, pady=10)
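
# Hedged sketch (not part of the original GUI code): onSubmit() above relies on a
# PostProcess helper that turns the model's start/end probability vectors (p1, p2)
# into human-readable answer strings. A minimal implementation consistent with that
# call site is shown below; the body, the `top_k` parameter, the use of numpy, and
# the assumption that `context_tokens` is a token list are all guesses, not the
# original authors' code.
import numpy as np

class PostProcess:
    def __init__(self, context_tokens, p1, p2, top_k=3):
        self.context_tokens = context_tokens   # tokenized context
        self.p1 = np.asarray(p1).ravel()       # start-position probabilities
        self.p2 = np.asarray(p2).ravel()       # end-position probabilities
        self.top_k = top_k

    def postProcess(self):
        # score every valid span (start <= end, length <= 15) by p1[start] * p2[end]
        spans = [(s, e, self.p1[s] * self.p2[e])
                 for s in range(len(self.p1))
                 for e in range(s, min(s + 15, len(self.p2)))]
        spans.sort(key=lambda x: x[2], reverse=True)
        return [" ".join(self.context_tokens[s:e + 1])
                for s, e, _ in spans[:self.top_k]]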
def train_val_model(pipeline_cfg, model_cfg, train_cfg):
    data_pipeline = DataPipeline(**pipeline_cfg)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if model_cfg['cxt_emb_pretrained'] is not None:
        model_cfg['cxt_emb_pretrained'] = torch.load(model_cfg['cxt_emb_pretrained'])
    bidaf = BiDAF(word_emb=data_pipeline.word_type.vocab.vectors, **model_cfg)

    ema = EMA(train_cfg['exp_decay_rate'])
    for name, param in bidaf.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda p: p.requires_grad, bidaf.parameters())
    optimizer = optim.Adadelta(parameters, lr=train_cfg['lr'])
    criterion = nn.CrossEntropyLoss()

    result = {'best_f1': 0.0, 'best_model': None}
    num_epochs = train_cfg['num_epochs']
    for epoch in range(1, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)
        for phase in ['train', 'val']:
            val_answers = dict()
            val_f1 = 0
            val_em = 0
            val_cnt = 0
            val_r = 0
            if phase == 'train':
                bidaf.train()
            else:
                bidaf.eval()
                # back up the raw weights and swap in the EMA weights for evaluation
                backup_params = EMA(0)
                for name, param in bidaf.named_parameters():
                    if param.requires_grad:
                        backup_params.register(name, param.data)
                        param.data.copy_(ema.get(name))

            with torch.set_grad_enabled(phase == 'train'):
                for batch_num, batch in enumerate(data_pipeline.data_iterators[phase]):
                    optimizer.zero_grad()
                    p1, p2 = bidaf(batch)
                    loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        for name, param in bidaf.named_parameters():
                            if param.requires_grad:
                                ema.update(name, param.data)
                        if batch_num % train_cfg['batch_per_disp'] == 0:
                            batch_loss = loss.item()
                            print('batch %d: loss %.3f' % (batch_num, batch_loss))

                    if phase == 'val':
                        batch_size, c_len = p1.size()
                        val_cnt += batch_size
                        ls = nn.LogSoftmax(dim=1)
                        # mask out spans whose end index precedes the start index
                        mask = (torch.ones(c_len, c_len) * float('-inf')).to(device).tril(-1) \
                            .unsqueeze(0).expand(batch_size, -1, -1)
                        score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
                        score, s_idx = score.max(dim=1)
                        score, e_idx = score.max(dim=1)
                        s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()
                        for i in range(batch_size):
                            answer = (s_idx[i], e_idx[i])
                            gt = (batch.s_idx[i], batch.e_idx[i])
                            val_f1 += f1_score(answer, gt)
                            val_em += exact_match_score(answer, gt)
                            val_r += r_score(answer, gt)

            if phase == 'val':
                val_f1 = val_f1 * 100 / val_cnt
                val_em = val_em * 100 / val_cnt
                val_r = val_r * 100 / val_cnt
                print('Epoch %d: %s f1 %.3f | %s em %.3f | %s rouge %.3f'
                      % (epoch, phase, val_f1, phase, val_em, phase, val_r))
                if val_f1 > result['best_f1']:
                    result['best_f1'] = val_f1
                    result['best_em'] = val_em
                    result['best_model'] = copy.deepcopy(bidaf.state_dict())
                    torch.save(result, train_cfg['ckpoint_file'])
                # with open(train_cfg['val_answers'], 'w', encoding='utf-8') as f:
                #     print(json.dumps(val_answers), file=f)

                # restore the raw (non-EMA) weights before the next training phase
                for name, param in bidaf.named_parameters():
                    if param.requires_grad:
                        param.data.copy_(backup_params.get(name))
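
# Illustrative aside: the validation branch above picks the answer span by adding
# log-softmaxed start and end scores into a (c_len x c_len) matrix and masking out
# entries where the end index precedes the start index (the tril(-1) * -inf mask).
# The tiny standalone example below reproduces that selection logic on fake logits
# so the max/gather sequence is easier to follow; the tensor sizes are made up.
import torch
import torch.nn as nn

batch_size, c_len = 2, 5
p1 = torch.randn(batch_size, c_len)          # fake start logits
p2 = torch.randn(batch_size, c_len)          # fake end logits
ls = nn.LogSoftmax(dim=1)

mask = (torch.ones(c_len, c_len) * float('-inf')).tril(-1) \
    .unsqueeze(0).expand(batch_size, -1, -1)               # forbid end < start
score = ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1) + mask   # score[b, start, end]
score, s_idx = score.max(dim=1)                            # best start for every end
score, e_idx = score.max(dim=1)                            # best end overall
s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze(1)  # start paired with that end
print(s_idx, e_idx)                                        # predicted (start, end) per example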
class SOLVER():
    def __init__(self, args):
        self.args = args
        self.device = torch.device("cuda:{}".format(self.args.GPU)
                                   if torch.cuda.is_available() else "cpu")
        self.data = READ(self.args)
        glove = self.data.WORD.vocab.vectors
        char_size = len(self.data.CHAR.vocab)
        self.model = BiDAF(self.args, char_size, glove).to(self.device)
        self.optimizer = optim.Adadelta(self.model.parameters(),
                                        lr=self.args.Learning_Rate)
        self.ema = EMA(self.args.Exp_Decay_Rate)

        if APEX_AVAILABLE:  # Mixed Precision
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level='O2')

        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.ema.register(name, param.data)
        self.parameters = filter(lambda p: p.requires_grad,
                                 self.model.parameters())

    def train(self):
        criterion = nn.NLLLoss()
        criterion = criterion.to(self.device)
        self.model.train()
        max_dev_em, max_dev_f1 = -1, -1
        num_batches = len(self.data.train_iter)
        logging.info("Begin Training")
        self.model.zero_grad()
        loss = 0.0

        for epoch in range(self.args.Epoch):
            self.model.train()
            for i, batch in enumerate(self.data.train_iter):
                i += 1
                p1, p2 = self.model(batch)
                batch_loss = criterion(p1, batch.start_idx.to(self.device)) + \
                    criterion(p2, batch.end_idx.to(self.device))

                if APEX_AVAILABLE:
                    with amp.scale_loss(batch_loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    batch_loss.backward()
                loss = batch_loss.item()
                self.optimizer.step()
                del p1, p2, batch_loss

                for name, param in self.model.named_parameters():
                    if param.requires_grad:
                        self.ema.update(name, param.data)
                self.model.zero_grad()

                logging.info("Epoch [{}/{}] Step [{}/{}] Train Loss {}".format(
                    epoch + 1, self.args.Epoch, i, int(num_batches) + 1, round(loss, 3)))

                # periodic evaluation every 100 steps once training has stabilised
                if epoch > 7:
                    if i % 100 == 0:
                        dev_em, dev_f1 = self.evaluate()
                        logging.info("Epoch [{}/{}] Dev EM {} Dev F1 {}".format(
                            epoch + 1, self.args.Epoch, round(dev_em, 3), round(dev_f1, 3)))
                        self.model.train()
                        if dev_f1 > max_dev_f1:
                            max_dev_f1 = dev_f1
                            max_dev_em = dev_em

            # evaluation at the end of every epoch
            dev_em, dev_f1 = self.evaluate()
            logging.info("Epoch [{}/{}] Dev EM {} Dev F1 {}".format(
                epoch + 1, self.args.Epoch, round(dev_em, 3), round(dev_f1, 3)))
            self.model.train()
            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_em = dev_em

        logging.info('Max Dev EM: {} Max Dev F1: {}'.format(
            round(max_dev_em, 3), round(max_dev_f1, 3)))

    def evaluate(self):
        logging.info("Evaluating on Dev Dataset")
        answers = dict()
        self.model.eval()

        # back up the raw weights and swap in the EMA weights for evaluation
        temp_ema = EMA(0)
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                temp_ema.register(name, param.data)
                param.data.copy_(self.ema.get(name))

        with torch.no_grad():
            for _, batch in enumerate(self.data.dev_iter):
                p1, p2 = self.model(batch)
                batch_size, _ = p1.size()
                _, s_idx = p1.max(dim=1)
                _, e_idx = p2.max(dim=1)
                for i in range(batch_size):
                    qid = batch.qid[i]
                    answer = batch.c_word[0][i][s_idx[i]:e_idx[i] + 1]
                    answer = ' '.join(
                        [self.data.WORD.vocab.itos[idx] for idx in answer])
                    answers[qid] = answer

        # restore the raw (non-EMA) weights
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.data.copy_(temp_ema.get(name))

        results = evaluate(self.args, answers)
        return results['exact_match'], results['f1']
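
# Hedged sketch: the training loops in this file all assume an EMA helper with
# register(), update() and get() methods (and EMA(0) used as a plain parameter
# backup). The original class is not included in these excerpts, so the version
# below is a minimal plausible implementation of that interface, not the authors'
# code.
class EMA:
    """Exponential moving average of model parameters, keyed by parameter name."""

    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # store an initial copy of the parameter tensor
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow <- decay * shadow + (1 - decay) * value
        # with decay == 0 this degrades to a plain copy, which is why EMA(0)
        # doubles as a temporary backup of the raw weights
        new_average = self.decay * self.shadow[name] + (1.0 - self.decay) * value
        self.shadow[name] = new_average.clone()

    def get(self, name):
        return self.shadow[name]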
def train_bidaf(config, data):
    # train BiDAF model
    device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(config, data).to(device)

    ema = EMA(config.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss()
    writer = SummaryWriter(log_dir='runs/' + config.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == config.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)
        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % config.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, config, data)
            c = (i + 1) // config.print_freq
            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model_bidaf = copy.deepcopy(model)

            loss = 0
            model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    return best_model_bidaf
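
# Hypothetical usage sketch for train_bidaf(): the config object below is an
# assumption modelled on the attribute names the function reads (exp_decay_rate,
# learning_rate, model_time, epoch, print_freq); the values are illustrative only.
# `data` is expected to expose data.train_iter, a torchtext-style iterator whose
# batches carry s_idx / e_idx fields; building it is out of scope here.
from types import SimpleNamespace

config = SimpleNamespace(exp_decay_rate=0.999, learning_rate=0.5,
                         model_time='2024-01-01_00-00-00',
                         epoch=12, print_freq=250)
# best_model = train_bidaf(config, data)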
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=hyper_params["batch_size"],
                              num_workers=4)
valid_dataloader = DataLoader(valid_dataset,
                              shuffle=True,
                              batch_size=hyper_params["batch_size"],
                              num_workers=4)

print("Length of training data loader is:", len(train_dataloader))
print("Length of valid data loader is:", len(valid_dataloader))

# load the model
model = BiDAF(word_vectors=word_embedding_matrix,
              char_vectors=char_embedding_matrix,
              hidden_size=hyper_params["hidden_size"],
              drop_prob=hyper_params["drop_prob"])
if hyper_params["pretrained"]:
    model.load_state_dict(torch.load(os.path.join(experiment_path, "model.pkl"))["state_dict"])
model.to(device)

# define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(model.parameters(),
                                 hyper_params["learning_rate"],
                                 weight_decay=1e-4)

# best loss so far
if hyper_params["pretrained"]:
    best_valid_loss = torch.load(os.path.join(experiment_path, "model.pkl"))["best_valid_loss"]
    epoch_checkpoint = torch.load(os.path.join(experiment_path, "model_last_checkpoint.pkl"))["epoch"]
    print("Best validation loss obtained after {} epochs is: {}".format(epoch_checkpoint, best_valid_loss))
else:
    # original snippet is truncated here; presumably the fresh-training branch
    # initialises the best validation loss before the training loop starts
    best_valid_loss = float("inf")
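
# Illustrative only: the hyper_params dictionary indexed above is defined elsewhere
# in the original project. The keys below are exactly the ones this snippet uses,
# but the values are placeholder assumptions, not the authors' settings.
hyper_params_example = {
    "batch_size": 64,        # DataLoader batch size
    "hidden_size": 100,      # BiDAF hidden size
    "drop_prob": 0.2,        # dropout probability
    "learning_rate": 0.5,    # Adadelta learning rate
    "pretrained": False,     # resume from experiment_path/model.pkl if True
}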
# (fragment: tail of an evaluation loop; the surrounding function is not included)
p2_corr = a_end.cpu() == p2_pred
p1_acc += torch.sum(p1_corr).item()
p2_acc += torch.sum(p2_corr).item()
exact_match += torch.sum(p1_corr * p2_corr).item()
total += batch_size
if i % 10 == 0:
    print('current acc: {:.3f}%'.format(100 * exact_match / total))

print('======== Test result ========')
print('p1 acc: {:.3f}%, p2 acc: {:.3f}%, EM: {:.3f}'.format(
    100. * p1_acc / total, 100. * p2_acc / total, 100. * exact_match / total))
# }}}

# create model
model = BiDAF(args)
if torch.cuda.is_available():
    print('use cuda')
    model.cuda()

# resume
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
if os.path.isfile(args.resume):
    print("=> loading checkpoint '{}'".format(args.resume))
    checkpoint = torch.load(args.resume)
    args.start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
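
# Hedged sketch: for the resume logic above to work, whatever training loop writes
# args.resume must store the same three keys. The writer below simply mirrors the
# load side; it is an assumption, since the original save call is not part of this
# snippet.
import torch

def save_resume_checkpoint(path, epoch, model, optimizer):
    torch.save({'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)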
def get_textfield(root, height, r):
    text_area = scrolledtext.ScrolledText(root,
                                          wrap=WORD,
                                          width=50,
                                          height=height,
                                          font=("Times New Roman", 15),
                                          bd=5)
    text_area.grid(column=0, row=r, pady=10, padx=20, columnspan=3)
    text_area.focus()
    return text_area


if __name__ == '__main__':
    bidef = model.BiDAF(400, 250, 20)

    root = Tk()
    root.title('QA App')
    root.iconbitmap('../images/2.ico')
    root.geometry('600x700+700+20')

    answer_list = ['abcdef']

    title = get_level(root, "Question Answer Model", 0, 0, 3, 16)
    context_title = get_level(root, 'Enter Context(within 250 words)', 2, 0, 3, 14)
    question_title = get_level(root, 'Enter Question(within 20 words)', 4, 0, 3, 14)
    answer_level = get_level(root, '', 6, 0, 3, 14)
    status = get_level(root, "Answer 0 of 0", 7, 0, 3, 14)  # bd=1, relief=SUNKEN, anchor=E
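
# Hedged sketch: get_level() is called throughout the GUI code but defined elsewhere
# in the original project. Judging from the call sites -- get_level(root, text, row,
# column, columnspan, font_size), returning a widget that supports grid_forget() --
# it is most likely a thin Label helper along these lines. The body is an assumption
# and reuses the wildcard tkinter import style of the rest of the GUI code.
from tkinter import Label

def get_level(root, text, r, c, cspan, font_size):
    label = Label(root, text=text, font=("Times New Roman", font_size))
    label.grid(row=r, column=c, columnspan=cspan, pady=5)
    return label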
# load dataset
test_dataset = SquadDataset(d_w_context, d_c_context, d_w_question, d_c_question, d_labels)

# load data generator
test_dataloader = DataLoader(test_dataset,
                             shuffle=True,
                             batch_size=hyper_params["batch_size"],
                             num_workers=4)
print("Length of test data loader is:", len(test_dataloader))

# load the model
model = BiDAF(word_vectors=word_embedding_matrix,
              char_vectors=char_embedding_matrix,
              hidden_size=hyper_params["hidden_size"],
              drop_prob=hyper_params["drop_prob"])
try:
    if config.cuda:
        model.load_state_dict(
            torch.load(os.path.join(config.squad_models, "model_final.pkl"))["state_dict"])
    else:
        model.load_state_dict(
            torch.load(os.path.join(config.squad_models, "model_final.pkl"),
                       map_location=lambda storage, loc: storage)["state_dict"])
    print("Model weights successfully loaded.")
except:
    print("Model weights not found, initialized model with random weights.")
model.to(device)
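
# Hedged sketch: the original test loop is not included in this excerpt. Given the
# dataset fields above (word/char context, word/char question, labels) and the model
# call used in eval() further down, an evaluation pass would look roughly like this;
# the exact batch unpacking is an assumption.
# model.eval()
# with torch.no_grad():
#     for w_context, c_context, w_question, c_question, labels in test_dataloader:
#         pred1, pred2 = model(w_context.long().to(device), c_context.long().to(device),
#                              w_question.long().to(device), c_question.long().to(device))
#         # pred1/pred2 are log-probabilities over start/end positions; compare the
#         # argmax (or a discretized span) against `labels` to accumulate EM/F1.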
from collections import Counter

# prepare data
print('prepare data')
# config = Config()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

pre_trained_ = load('pre_data/embed_pre.json')
pre_trained = torch.Tensor(pre_trained_[0])
del pre_trained_

print('loading train_dataset')
train_dataset = SQuADData('pre_data/input/train')
dev_dataset = SQuADData('pre_data/input/dev')

# define model
print('define model')
model = BiDAF(pre_trained)
# model = BiDAF(pre_trained, 128)
# model = torch.load('model/model.pt')
model = model.to(device)

lr = config.learning_rate
base_lr = 1.0
warm_up = config.lr_warm_up_num
cr = lr / log2(warm_up)
optimizer = torch.optim.Adam(lr=config.learning_rate,
                             betas=(config.beta1, config.beta2),
                             eps=config.eps,
                             weight_decay=3e-7,
                             params=model.parameters())
scheduler = optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda ee: cr * log2(ee + 1) if ee < warm_up else lr)
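
# Illustrative, self-contained probe of the warm-up schedule used above: LambdaLR
# multiplies the optimizer's initial lr by lr_lambda(step), so a logarithmic lambda
# ramps the effective lr up over the first `warm_up` steps. The dummy parameter and
# the numbers below exist only to print the schedule; they are not the project's
# settings.
import torch
from math import log2

dummy = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([dummy], lr=1.0)
warm_up_demo, target_lr = 1000, 0.001
cr_demo = target_lr / log2(warm_up_demo)
sched = torch.optim.lr_scheduler.LambdaLR(
    opt, lr_lambda=lambda ee: cr_demo * log2(ee + 1) if ee < warm_up_demo else target_lr)
for step in range(3):
    opt.step()
    sched.step()
    print(step, opt.param_groups[0]['lr'])   # effective lr after each scheduler step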
def main(argv):
    config = Config()
    config.load_user_config()
    config.log.info("finish loading user config")

    train_file = config.args["train_file"]
    dev_file = config.args["dev_file"]
    old_glove_file = config.args["glove_file"]
    new_glove_file = config.args["glove_file"] + ".subset"

    # TODO(demi): switch "overwrite" to False
    train_data_raw, dev_data_raw, i2w, w2i, i2c, c2i, new_glove_file, glove_dim, vocab_size, char_vocab_size \
        = squad_read_data(config, train_file, dev_file, old_glove_file, new_glove_file, overwrite=True)
    config.log.info("finish reading squad data in raw formats")

    config.update_batch([("glove_file", new_glove_file),
                         ("glove_dim", glove_dim),
                         ("vocab_size", vocab_size),
                         ("char_vocab_size", char_vocab_size)])

    config.log.warning("reminder: now we only support train/fake mode")
    assert config.args["mode"] in ["train", "fake"], "mode (%s) not found" % config.args["mode"]

    train_id_conversion, train_data = make_dataset(config, train_data_raw, w2i, c2i)
    dev_id_conversion, dev_data = make_dataset(config, dev_data_raw, w2i, c2i)
    config.log.info("finish making datasets: reformatting raw data")

    train_data = QnADataset(train_data, config)
    dev_data = QnADataset(dev_data, config)
    config.log.info("finish generating datasets")

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=True, **config.kwargs)
    dev_loader = torch.utils.data.DataLoader(dev_data, batch_size=1, **config.kwargs)
    config.log.info("finish generating data loader")

    model = BiDAF(config, i2w)
    config.log.info("finish creating model")
    if config.args["use_cuda"]:
        model.cuda()

    # log config and model
    config.log.info(config.format_string())
    config.log.info("model:{}".format(model))

    if config.args['optimizer'] == "Adam":
        optimizer = optim.Adam(model.get_train_parameters(),
                               lr=config.args['lr'],
                               weight_decay=config.args['weight_decay'])
    if config.args['optimizer'] == "Adamax":
        optimizer = optim.Adamax(model.get_train_parameters(),
                                 lr=config.args['lr'],
                                 weight_decay=config.args['weight_decay'])
    if config.args['optimizer'] == "SGD":
        optimizer = torch.optim.SGD(model.get_train_parameters(),
                                    lr=config.args['lr'],
                                    momentum=0.9,
                                    weight_decay=config.args['weight_decay'])
    if config.args['optimizer'] == "Adadelta":
        optimizer = torch.optim.Adadelta(model.get_train_parameters(), lr=config.args["lr"])
    # if config.args['optimizer'] == "Adagrad":

    config.log.info("model = %s" % model)
    config.log.info("config = %s" % config.format_string())

    trainer = Trainer(config)
    evaluator = Evaluator(config)

    """ save model checkpoint """
    def save_checkpoint(epoch):
        checkpoint = {"model_state_dict": model.state_dict(),
                      "config_args": config.args}
        if config.args["optimizer"] != "YF":  # YF can't save state dict right now
            checkpoint["optimizer_state_dict"] = optimizer.state_dict()
        checkpoint_file = config.args["model_dir"] + config.args["model_name"] + "-EPOCH%d" % epoch
        torch.save(checkpoint, checkpoint_file)
        config.log.info("saving checkpoint: {}".format(checkpoint_file))

    for epoch in range(1, config.args["max_epoch"] + 1):
        config.log.info("training: epoch %d" % epoch)
        # QS(demi): do i need to return model & optimizer?
        model, optimizer, train_avg_loss, train_answer_dict = trainer.run(
            model, train_id_conversion[0], train_loader, optimizer, mode="train")
        model, optimizer, dev_avg_loss, dev_answer_dict = trainer.run(
            model, dev_id_conversion[0], dev_loader, optimizer, mode="dev")

        # loss is a float tensor with size 1
        config.log.info("[EPOCH %d] LOSS = (train)%.5lf | (dev)%.5lf"
                        % (epoch, train_avg_loss[0], dev_avg_loss[0]))

        answer_filename = "{}/{}-EPOCH{}".format(config.args["model_dir"],
                                                 config.args["model_name"], epoch)

        config.log.info("[EVALUATION] TRAIN EVAL")
        evaluator.eval("official", train_file, train_answer_dict,
                       "{}/answer.train".format(config.args["model_dir"], answer_filename))
        config.log.info("[EVALUATION] DEV EVAL")
        evaluator.eval("official", dev_file, dev_answer_dict,
                       "{}/answer.dev".format(config.args["model_dir"], answer_filename))

        save_checkpoint(epoch)
def eval(context, question):
    with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(
        np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(
        np.array(char_embedding_matrix)).type(torch.float32)
    idx2word = dict([(y, x) for x, y in word2idx.items()])

    context = clean_text(context)
    context = [w for w in word_tokenize(context) if w]
    question = clean_text(question)
    question = [w for w in word_tokenize(question) if w]

    if len(context) > config.max_len_context:
        print("The context is too long. Maximum accepted length is",
              config.max_len_context, "words.")
    if max([len(w) for w in context]) > config.max_len_word:
        print("Some words in the context are longer than",
              config.max_len_word, "characters.")
    if len(question) > config.max_len_question:
        print("The question is too long. Maximum accepted length is",
              config.max_len_question, "words.")
    if max([len(w) for w in question]) > config.max_len_word:
        print("Some words in the question are longer than",
              config.max_len_word, "characters.")
    if len(question) < 3:
        print("The question is too short. It needs to be at least a three words question.")

    context_idx = np.zeros([config.max_len_context], dtype=np.int32)
    question_idx = np.zeros([config.max_len_question], dtype=np.int32)
    context_char_idx = np.zeros([config.max_len_context, config.max_len_word], dtype=np.int32)
    question_char_idx = np.zeros([config.max_len_question, config.max_len_word], dtype=np.int32)

    # replace 0 values with word and char IDs
    for j, word in enumerate(context):
        if word in word2idx:
            context_idx[j] = word2idx[word]
        else:
            context_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                context_char_idx[j, k] = char2idx[char]
            else:
                context_char_idx[j, k] = 1

    for j, word in enumerate(question):
        if word in word2idx:
            question_idx[j] = word2idx[word]
        else:
            question_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                question_char_idx[j, k] = char2idx[char]
            else:
                question_char_idx[j, k] = 1

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=config.hidden_size,
                  drop_prob=config.drop_prob)
    try:
        if config.cuda:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models, "model_final.pkl"))["state_dict"])
        else:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models, "model_final.pkl"),
                           map_location=lambda storage, loc: storage)["state_dict"])
        print("Model weights successfully loaded.")
    except:
        print("Model weights not found, initialized model with random weights.")
    model.to(device)
    model.eval()

    with torch.no_grad():
        context_idx, context_char_idx, question_idx, question_char_idx = \
            torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device), \
            torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device), \
            torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device), \
            torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device)

        pred1, pred2 = model(context_idx, context_char_idx, question_idx, question_char_idx)
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)
        prediction = " ".join(context[starts.item():ends.item() + 1])
    return prediction
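
# Hedged sketch: discretize() is imported from the original project's utilities and
# is not reproduced above. Its call signature -- discretize(p_start, p_end, max_len,
# no_answer) -> (starts, ends) -- suggests the usual BiDAF span search: pick the
# (start, end) pair with start <= end < start + max_len that maximizes
# p_start[start] * p_end[end]. The version below is consistent with that usage but
# is an assumption, not the original code; the no_answer flag is accepted only to
# match the call site and is ignored here.
import torch

def discretize(p_start, p_end, max_len=15, no_answer=False):
    # p_start, p_end: (batch, c_len) probabilities over start/end positions
    joint = p_start.unsqueeze(2) * p_end.unsqueeze(1)     # joint[b, start, end]
    c_len = p_start.size(1)
    # keep only spans with 0 <= end - start < max_len
    band = torch.triu(torch.ones(c_len, c_len, device=joint.device)) - \
        torch.triu(torch.ones(c_len, c_len, device=joint.device), diagonal=max_len)
    joint = joint * band
    # flatten the (start, end) grid and take the argmax per example
    flat_idx = joint.view(joint.size(0), -1).argmax(dim=1)
    starts, ends = flat_idx // c_len, flat_idx % c_len
    return starts, ends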
def main(NMT_config):
    ### Load RL (global) configurations ###
    config = parse_args()

    ### Load trained QA model ###
    QA_checkpoint = torch.load(config.data_dir + config.QA_best_model)
    QA_config = QA_checkpoint['config']

    QA_mod = BiDAF(QA_config)
    if QA_config.use_gpu:
        QA_mod.cuda()
    QA_mod.load_state_dict(QA_checkpoint['state_dict'])

    ### Load SQuAD dataset ###
    data_filter = get_squad_data_filter(QA_config)
    train_data = read_data(QA_config, 'train', QA_config.load, data_filter=data_filter)
    dev_data = read_data(QA_config, 'dev', True, data_filter=data_filter)
    update_config(QA_config, [train_data, dev_data])

    print("Total vocabulary for training is %s" % QA_config.word_vocab_size)

    # from all
    word2vec_dict = train_data.shared['lower_word2vec'] if QA_config.lower_word \
        else train_data.shared['word2vec']
    # from filter-out set
    word2idx_dict = train_data.shared['word2idx']

    # filter-out set idx-vector
    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items()
                    if word in word2idx_dict}

    print("{}/{} unique words have corresponding glove vectors.".format(
        len(idx2vec_dict), len(word2idx_dict)))

    # <null> and <unk> do not have corresponding vector so random.
    emb_mat = np.array([
        idx2vec_dict[idx] if idx in idx2vec_dict
        else np.random.multivariate_normal(np.zeros(QA_config.word_emb_size),
                                           np.eye(QA_config.word_emb_size))
        for idx in range(QA_config.word_vocab_size)
    ])

    config.emb_mat = emb_mat
    config.new_emb_mat = train_data.shared['new_emb_mat']

    num_steps = int(math.ceil(train_data.num_examples /
                              (QA_config.batch_size * QA_config.num_gpus))) * QA_config.num_epochs

    # offset for question mark
    NMT_config.max_length = QA_config.ques_size_th - 1
    NMT_config.batch_size = QA_config.batch_size

    ### Construct translator ###
    translator = make_translator(NMT_config, report_score=True)

    ### Construct optimizer ###
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, translator.model.parameters()),
                          lr=config.lr)

    ### Start RL training ###
    count = 0
    QA_mod.eval()
    F1_eval = F1Evaluator(QA_config, QA_mod)
    # eval_model(QA_mod, train_data, dev_data, QA_config, NMT_config, config, translator)

    for i in range(config.n_episodes):
        for batches in tqdm(train_data.get_multi_batches(QA_config.batch_size,
                                                         QA_config.num_gpus,
                                                         num_steps=num_steps,
                                                         shuffle=True,
                                                         cluster=QA_config.cluster),
                            total=num_steps):
            # for n, p in translator.model.named_parameters():
            #     print(n)
            #     print(p)
            #     print(p.requires_grad)

            start = datetime.now()
            to_input(batches[0][1].data['q'], config.RL_path + config.RL_file)

            # obtain rewrite and log_prob
            q, scores, log_prob = translator.translate(NMT_config.src_dir, NMT_config.src,
                                                       NMT_config.tgt, NMT_config.batch_size,
                                                       NMT_config.attn_debug)
            q, cq = ref_query(q)
            batches[0][1].data['q'] = q
            batches[0][1].data['cq'] = cq

            log_prob = torch.stack(log_prob).squeeze(-1)
            # print(log_prob)

            translator.model.zero_grad()
            QA_mod(batches)

            e = F1_eval.get_evaluation(batches, False, NMT_config, config, translator)
            reward = Variable(torch.FloatTensor(e.f1s), requires_grad=False)
            # print(reward)

            ## Initial loss
            loss = create_loss(log_prob, reward)

            loss.backward()
            optimizer.step()
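
# Hedged sketch: create_loss() is not defined in this excerpt. Since it combines the
# translator's token log-probabilities with a per-example F1 reward and its output is
# backpropagated through the translator only, a standard REINFORCE-style objective is
# the natural fit. The version below is an assumption consistent with that usage; the
# (seq_len, batch) shape of log_prob is likewise assumed from torch.stack(log_prob).
import torch

def create_loss(log_prob, reward):
    # log_prob: (seq_len, batch) token log-probabilities of the sampled rewrite
    # reward:   (batch,) F1 of the QA model's answer on the rewritten question
    # maximize E[reward * log pi]  ==  minimize its negative
    seq_log_prob = log_prob.sum(dim=0)                 # log-prob of each full rewrite
    return -(reward.detach() * seq_log_prob).mean()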