def get_model(modeldict, modelname):
    # Load the checkpoint onto CPU regardless of where it was saved.
    checkpoint = torch.load(modeldict, map_location=lambda storage, loc: storage)
    dict_args = checkpoint['dict_args']
    if modelname == 'bidaf':
        model = BiDAF(dict_args)
    elif modelname == 'rnet':
        model = RNet(dict_args)
    else:
        raise ValueError('Unknown model name: {}'.format(modelname))
    model.load_state_dict(checkpoint['state_dict'])
    return model
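# A minimal usage sketch for get_model above. The checkpoint path is
# hypothetical; the checkpoint is assumed to store the 'dict_args' and
# 'state_dict' keys read by get_model.
model = get_model('checkpoints/bidaf_best.pt', 'bidaf')
model.eval()  # disable dropout for inference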
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    # Register every trainable parameter with the exponential moving average.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    best_model = copy.deepcopy(model)  # fallback in case no evaluation ever runs

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)
        optimizer.zero_grad()
        # Sum the start-index and end-index losses.
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            # 'loss' is summed over the last print_freq batches.
            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')
            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)
            loss = 0
            model.train()

    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    return best_model
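# The train() loop above assumes an EMA helper with register()/update() methods.
# A minimal sketch consistent with that interface (the project's own
# implementation may differ):
class EMA:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # Keep a copy of the initial parameter value as the running average.
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow = decay * shadow + (1 - decay) * value
        new_avg = self.decay * self.shadow[name] + (1.0 - self.decay) * value
        self.shadow[name] = new_avg.clone()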
def __init__(self, args, loader):
    super(ContextMRR_Sep_Switched, self).__init__()
    hidden_size = args.hidden_size
    embed_size = args.embed_size

    ## dropout layer
    if args.dropout > 0:
        self._dropout = torch.nn.Dropout(p=args.dropout)
    else:
        self._dropout = lambda x: x

    ## contextual embedding layer
    self.contextual_embedding_layer = RecurrentContext(
        input_size=embed_size, hidden_size=hidden_size, num_layers=1)

    ## bidirectional attention flow between question and context
    self.attention_flow_layer1 = BiDAF(2 * hidden_size)

    modeling_layer_inputdim = 2 * hidden_size
    self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim, hidden_size,
                                            num_layers=1)

    self.linearrelu = ffnLayer(4 * hidden_size, 4 * hidden_size)

    output_layer_inputdim = 8 * hidden_size
    self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

    self.loss = torch.nn.CrossEntropyLoss()
def _prepare(self):
    if self.args.algo == Algos.BIDAF:
        self._create_qa_data()
        self.model = BiDAF(Algos.BIDAF,
                           self.datasets[1].schema,
                           is_infer=self.args.is_infer,
                           vocab_size=self.args.vocab_size,
                           doc_num=self.datasets[1].doc_num,
                           static_emb=(self.args.pre_emb.strip() != ''),
                           emb_dim=self.args.emb_dim,
                           max_a_len=self.args.max_a_len)
    elif self.args.algo == Algos.MLSTM:
        self._create_qa_data()
        self.model = MatchLstm(Algos.MLSTM,
                               self.datasets[1].schema,
                               is_infer=self.args.is_infer,
                               vocab_size=self.args.vocab_size,
                               doc_num=self.datasets[1].doc_num,
                               static_emb=(self.args.pre_emb.strip() != ''),
                               emb_dim=self.args.emb_dim,
                               max_a_len=self.args.max_a_len)
    elif self.args.algo == Algos.YESNO:
        self._create_yesno_data()
        self.model = OpinionClassifier(Algos.YESNO,
                                       self.datasets[1].schema,
                                       is_infer=self.args.is_infer,
                                       vocab_size=self.args.vocab_size,
                                       static_emb=(self.args.pre_emb.strip() != ''),
                                       doc_num=1,
                                       emb_dim=self.args.emb_dim)
    else:
        raise ValueError('Illegal algo: {}'.format(self.args.algo))
def __init__(self, args, loader):
    super(ContextMRR_Sep, self).__init__()
    hidden_size = args.hidden_size
    embed_size = args.embed_size
    word_vocab_size = loader.vocab.get_length()

    ## word embedding layer
    #self.word_embedding_layer = LookupEncoder(word_vocab_size, embedding_dim=embed_size)  #, pretrain_embedding=loader.pretrain_embedding)

    ## dropout layer
    if args.dropout > 0:
        self._dropout = torch.nn.Dropout(p=args.dropout)
    else:
        self._dropout = lambda x: x

    ## contextual embedding layer
    self.contextual_embedding_layer = RecurrentContext(
        input_size=embed_size, hidden_size=hidden_size, num_layers=1)

    ## bidirectional attention flow between question and context
    self.attention_flow_layer1 = BiDAF(2 * hidden_size)

    ## modeling layer for question and context: also reduces the 8d input to a 2d output
    modeling_layer_inputdim = 8 * hidden_size
    self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim, hidden_size,
                                            num_layers=1)

    '''BIDAF 2'''
    self.contextual_embedding_layer_2 = RecurrentContext(
        input_size=embed_size, hidden_size=hidden_size, num_layers=1)

    ## bidirectional attention flow between [q+c] and answer
    self.attention_flow_layer2 = BiDAF(2 * hidden_size)

    ## modeling layer
    modeling_layer_inputdim = 8 * hidden_size
    self.modeling_layer2 = RecurrentContext(modeling_layer_inputdim, hidden_size,
                                            num_layers=1)

    ## output layer
    ## current implementation: run an MLP on the concatenated hidden states of the answer modeling layer
    output_layer_inputdim = 4 * hidden_size
    self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

    self.loss = torch.nn.CrossEntropyLoss()
def test_forward(self):
    word_vectors = get_word_vectors(vocab, emb_size)
    cw_idxs, c_lengths = get_idxs(batch, clen, vocab)
    qw_idxs, q_lengths = get_idxs(batch, qlen, vocab)

    model = BiDAF(word_vectors, hidden_size)
    p1, p2 = model(cw_idxs, qw_idxs)
    self.assertEqual(p1.size(), (batch, clen))
    self.assertEqual(p2.size(), (batch, clen))
    # Outputs are log-probabilities, so exponentiating each row should sum to one.
    self.assertTrue(torch.allclose(p1.exp().sum(-1), torch.ones((batch,))))
    self.assertTrue(torch.allclose(p2.exp().sum(-1), torch.ones((batch,))))

    model = BiDAF(word_vectors, hidden_size, highway=True)
    p1, p2 = model(cw_idxs, qw_idxs)

    model = BiDAF(word_vectors, hidden_size, use_gru=False)
    p1, p2 = model(cw_idxs, qw_idxs)
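# The assertions above imply the model returns log-probabilities over context
# positions. A hedged sketch of decoding (p1, p2) into a predicted span; real
# decoders typically maximize p1[i] + p2[j] subject to i <= j, this greedy
# helper is only illustrative:
def greedy_span(p1, p2):
    # p1, p2: (batch, clen) log-probabilities, as asserted in test_forward.
    return p1.argmax(dim=-1), p2.argmax(dim=-1)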
def _prepare(self):
    if self.args.algo == Algos.BIDAF:
        self._create_qa_data()
        self.model = BiDAF(Algos.BIDAF,
                           self.datasets[1].schema,
                           is_infer=self.args.is_infer,
                           vocab_size=self.args.vocab_size,
                           doc_num=self.datasets[1].doc_num,
                           static_emb=(self.args.pre_emb.strip() != ''),
                           emb_dim=self.args.emb_dim,
                           max_a_len=self.args.max_a_len)
    else:
        raise ValueError('Illegal algo: {}'.format(self.args.algo))
def __init__(self, args, vocab):
    super(ContextMRR, self).__init__()
    hidden_size = args.hidden_size
    embed_size = args.embed_size
    word_vocab_size = vocab.get_length()

    if args.dropout > 0:
        self._dropout = torch.nn.Dropout(p=args.dropout)
    else:
        self._dropout = lambda x: x

    ## word embedding layer
    self.word_embedding_layer = LookupEncoder(word_vocab_size, embedding_dim=embed_size)

    ## contextual embedding layer
    self.contextual_embedding_layer = RecurrentContext(
        input_size=embed_size, hidden_size=hidden_size, num_layers=1)

    ## bidirectional attention flow between question and context
    self.attention_flow_layer1 = BiDAF(2 * hidden_size)

    ## modeling layer for question and context: also reduces the 8d input to a 2d output
    modeling_layer_inputdim = 8 * hidden_size
    self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim, hidden_size)
    self.modeling_dim = 2 * hidden_size

    span_start_input_dim = modeling_layer_inputdim + (2 * hidden_size)
    self._span_predictor = TimeDistributed(torch.nn.Linear(span_start_input_dim, 1))

    span_end_input_dim = modeling_layer_inputdim + (2 * hidden_size)
    self._span_end_predictor = TimeDistributed(torch.nn.Linear(span_end_input_dim, 1))

    span_end_dim = modeling_layer_inputdim + 3 * self.modeling_dim
    self._span_end_encoder = RecurrentContext(span_end_dim, hidden_size)

    self._span_start_accuracy = Accuracy()
    self._span_end_accuracy = Accuracy()
    self._span_accuracy = Accuracy()
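# The span predictors above are wrapped in TimeDistributed. A minimal sketch of
# such a wrapper, assuming it applies a module to every time step by folding
# time into the batch dimension (the project's own version may differ):
import torch.nn as nn

class TimeDistributed(nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        # x: (batch, time, features) -> (batch * time, features)
        b, t, f = x.size()
        out = self.module(x.reshape(b * t, f))
        # Restore the time dimension: (batch, time, out_features)
        return out.reshape(b, t, -1)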
def __init__(self, args, loader):
    super(ContextMRR_Sentence_Level, self).__init__()
    hidden_size = args.hidden_size
    embed_size = args.embed_size

    ## dropout layer
    if args.dropout > 0:
        self._dropout = torch.nn.Dropout(p=args.dropout)
    else:
        self._dropout = lambda x: x

    ## contextual embedding layer
    self.contextual_embedding_layer = RecurrentContext(
        input_size=embed_size, hidden_size=hidden_size, num_layers=1)

    ## bidirectional attention flow between question and context
    self.attention_flow_layer1 = BiDAF(2 * hidden_size)

    c2q_linearLayer_dim = 8 * hidden_size
    self.c2q_linearLayer = TimeDistributed(
        nn.Sequential(
            torch.nn.Linear(c2q_linearLayer_dim, 2 * hidden_size),
            torch.nn.ReLU()))

    modeling_layer_inputdim = 2 * hidden_size
    self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim, hidden_size,
                                            num_layers=1)

    self.hierarchial_layer1 = RecurrentContext(2 * hidden_size, hidden_size,
                                               num_layers=1)

    self.linearrelu = ffnLayer(4 * hidden_size, 4 * hidden_size)

    output_layer_inputdim = 6 * hidden_size
    self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

    self.loss = torch.nn.CrossEntropyLoss()
def __init__(self):
    self.config = self.get_args()
    self.trainset, self.devset, self.config.embed, self.word2index, self.char2index = read.data()
    self.config.unique_chars = len(self.char2index)

    with tf.Graph().as_default() as g:
        if self.config.name == 'bidaf':
            self.model = BiDAF(self.config)
        elif self.config.name == 'bidaf-att':  # was a bare 'if', which sent 'bidaf' into the final else
            self.model = BiDAFSelfAttention(self.config)
        elif self.config.name == 'mnemonic':
            self.model = MnemonicReader(self.config)
        elif self.config.name == 'qanet':
            self.model = QANet(self.config)
        else:
            raise NotImplementedError('Invalid architecture name')

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(max_to_keep=20)
            save_path = os.path.join('models', self.config.name)
            if os.path.exists(save_path):
                saver.restore(sess, tf.train.latest_checkpoint(save_path))

            if self.config.mode == 'train':
                self.train(sess, saver)
            else:
                step = sess.run(self.model.global_step)
                em, f1 = self.test(sess)
                print('\nIteration: %d - Exact match: %.2f\tf1: %.2f\t' % (step, em, f1))
                if self.config.ema_decay > 0:
                    sess.run(self.model.assign_vars)
                    ema_em, ema_f1 = self.test(sess)
                    print('\nIteration EMA: %d - Exact match: %.2f\tf1: %.2f' % (step, ema_em, ema_f1))
def main():
    # testing_file = "D:/DataMining/QASystem/new_data/test.ann.json"
    testing_file = "D:/DataMining/QASystem/new_data/validation.ann.json"
    # testing_file = "D:/DataMining/QASystem/new_data/training.json"
    trained_model = "checkpoints/model.ckpt"
    embedding_file = "D:/DataMining/QASystem/wiki/wiki.zh.text.vector"
    embedding_size = 60  # word embedding dimension
    hidden_size = 100  # number of hidden units
    keep_prob = 1  # 0.8
    batch_size = 60  # mini-batch size

    max_quelen, max_evilen = get_max_length(testing_file)
    embeddings, word2idx = load_embedding(embedding_file)
    questions, evidences, y1, y2 = load_data(testing_file, word2idx, max_quelen, max_evilen)

    with tf.Graph().as_default():
        with tf.variable_scope('Model'):
            model = BiDAF(embeddings, max_quelen, max_evilen,
                          embedding_size, hidden_size, keep_prob)
        with tf.Session().as_default() as sess:
            saver = tf.train.Saver()
            print("Loading model...")
            saver.restore(sess, trained_model)
            print("Model loaded.")
            # Do NOT call sess.run(tf.global_variables_initializer()) here: the
            # variables were just restored, and re-initializing them would wipe
            # out everything the model has learned.
            for batch_questions, batch_evidences, batch_y1, batch_y2 in next_batch(
                    questions, evidences, y1, y2, batch_size):
                feed_dict = {
                    model.x: batch_evidences,
                    model.q: batch_questions,
                    model.y1: batch_y1,
                    model.y2: batch_y2
                }
                acc_s, acc_e = sess.run([model.acc_s, model.acc_e], feed_dict)
                print('ACC_S: %s\t\tACC_E: %s' % (acc_s, acc_e))
keep_prob = 0.8  # 0.8
learning_rate = 0.01  # 0.001
lrdown_rate = 0.9  # 0.8
gpu_mem_usage = 0.75
gpu_device = "/gpu:0"
cpu_device = "/cpu:0"

max_quelen, max_evilen = get_max_length(training_file)
embeddings, word2idx = load_embedding(embedding_file)
questions, evidences, y1, y2 = load_data(training_file, word2idx, max_quelen, max_evilen)

with tf.Graph().as_default(), tf.device(cpu_device):
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
    # session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
    with tf.variable_scope('Model'):
        model = BiDAF(embeddings, max_quelen, max_evilen,
                      embedding_size, hidden_size, keep_prob)
    with tf.Session().as_default() as sess:  # config=session_conf
        saver = tf.train.Saver()
        print("Starting training...")
        sess.run(tf.global_variables_initializer())
        for i in range(epochs):
            print("Running training epoch %s" % (i + 1))
            for batch_questions, batch_evidences, batch_y1, batch_y2 in next_batch(
                    questions, evidences, y1, y2, batch_size):
                feed_dict = {
                    model.x: batch_evidences,
                    model.q: batch_questions,
                    model.y1: batch_y1,
                    model.y2: batch_y2,
                    model.lr: learning_rate
                }
                _, loss, acc_s, acc_e = sess.run(
                    [model.train_op, model.loss, model.acc_s, model.acc_e],
                    feed_dict)  # model.train_op / model.loss are assumed fetch names
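# Both the evaluation and training scripts above iterate with next_batch(). A
# minimal generator consistent with how it is called (hypothetical; the
# project's own helper may shuffle or pad differently):
def next_batch(questions, evidences, y1, y2, batch_size):
    # Yield consecutive, aligned slices of all four arrays.
    for start in range(0, len(questions), batch_size):
        end = start + batch_size
        yield (questions[start:end], evidences[start:end],
               y1[start:end], y2[start:end])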
def main(unused_argv):
    # Print an error message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" % unused_argv)

    # Check for Python 2
    if sys.version_info[0] != 2:
        raise Exception("ERROR: You must use Python 2 but you are running Python %i" % sys.version_info[0])

    # Print out Tensorflow version
    print "This code was developed and tested on TensorFlow 1.4.1. Your TensorFlow version: %s" % tf.__version__

    # Define train_dir
    if not FLAGS.experiment_name and not FLAGS.train_dir and FLAGS.mode != "official_eval":
        raise Exception("You need to specify either --experiment_name or --train_dir")
    FLAGS.train_dir = FLAGS.train_dir or os.path.join(EXPERIMENTS_DIR, FLAGS.experiment_name)

    # Initialize bestmodel directory
    bestmodel_dir = os.path.join(FLAGS.train_dir, "best_checkpoint")

    # Define path for glove vecs
    FLAGS.glove_path = FLAGS.glove_path or os.path.join(
        DEFAULT_DATA_DIR, "glove.6B.{}d.txt".format(FLAGS.embedding_size))

    # Load embedding matrix and vocab mappings
    emb_matrix, word2id, id2word = get_glove(FLAGS.glove_path, FLAGS.embedding_size)

    # Get filepaths to train/dev datafiles for tokenized queries, contexts and answers
    train_context_path = os.path.join(FLAGS.data_dir, "train.context")
    train_qn_path = os.path.join(FLAGS.data_dir, "train.question")
    train_ans_path = os.path.join(FLAGS.data_dir, "train.span")
    dev_context_path = os.path.join(FLAGS.data_dir, "dev.context")
    dev_qn_path = os.path.join(FLAGS.data_dir, "dev.question")
    dev_ans_path = os.path.join(FLAGS.data_dir, "dev.span")

    # Initialize model
    #qa_model = QAModel(FLAGS, id2word, word2id, emb_matrix)
    bidaf_model = BiDAF(FLAGS, id2word, word2id, emb_matrix)

    # Some GPU settings
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Split by mode
    if FLAGS.mode == "train":
        # Setup train dir and logfile
        if not os.path.exists(FLAGS.train_dir):
            os.makedirs(FLAGS.train_dir)
        file_handler = logging.FileHandler(os.path.join(FLAGS.train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Save a record of flags as a .json file in train_dir
        with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        # Make bestmodel dir if necessary
        if not os.path.exists(bestmodel_dir):
            os.makedirs(bestmodel_dir)

        with tf.Session(config=config) as sess:
            # Load most recent model
            initialize_model(sess, bidaf_model, FLAGS.train_dir, expect_exists=False)

            # Train
            bidaf_model.train(sess, train_context_path, train_qn_path, train_ans_path,
                              dev_qn_path, dev_context_path, dev_ans_path)

    elif FLAGS.mode == "show_examples":
        with tf.Session(config=config) as sess:
            # Load best model
            initialize_model(sess, bidaf_model, bestmodel_dir, expect_exists=True)

            # Show examples with F1/EM scores
            _, _ = bidaf_model.check_f1_em(sess, dev_context_path, dev_qn_path,
                                           dev_ans_path, "dev", num_samples=10,
                                           print_to_screen=True)

    elif FLAGS.mode == "official_eval":
        if FLAGS.json_in_path == "":
            raise Exception("For official_eval mode, you need to specify --json_in_path")
        if FLAGS.ckpt_load_dir == "":
            raise Exception("For official_eval mode, you need to specify --ckpt_load_dir")

        # Read the JSON data from file
        qn_uuid_data, context_token_data, qn_token_data = get_json_data(FLAGS.json_in_path)

        with tf.Session(config=config) as sess:
            # Load model from ckpt_load_dir
            initialize_model(sess, bidaf_model, FLAGS.ckpt_load_dir, expect_exists=True)

            # Get a predicted answer for each example in the data
            # Return a mapping answers_dict from uuid to answer
            answers_dict = generate_answers(sess, bidaf_model, word2id, qn_uuid_data,
                                            context_token_data, qn_token_data)

            # Write the uuid->answer mapping to a json file in root dir
            print "Writing predictions to %s..." % FLAGS.json_out_path
            with io.open(FLAGS.json_out_path, 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers_dict, ensure_ascii=False)))
                print "Wrote predictions to %s" % FLAGS.json_out_path

    else:
        raise Exception("Unexpected value of FLAGS.mode: %s" % FLAGS.mode)
import numpy as np

q_lens = np.array(q_lens)
d_lens = np.array(d_lens)
doc_lens = np.array(doc_lens)

print('Average query, average number of docs per query and average number of docs')
print(np.mean(q_lens), np.mean(doc_lens), np.mean(d_lens))
# 10.405203405865658 5.105676442762536 24.902959215817074

kv_model = api.load('glove-wiki-gigaword-50')

model = BiDAF(q_iterable, d_iterable, l_iterable, kv_model,
              text_maxlen=51, unk_handle_method='zero',
              epochs=1, batch_size=20)

# Example of how prediction works
print('Hello there result: ', model.tiny_predict('Hello there', 'general kenobi'))
print('Hello there batch: ',
      model.batch_tiny_predict('Hello there',
                               ['gengeral kenowbi', 'i am groot', 'I dont wear boot']))

queries, doc_group, label_group, query_ids, doc_id_group = MyOtherWikiIterable(
    os.path.join('experimental_data', 'WikiQACorpus',
class ContextMRR(nn.Module):
    def __init__(self, args, loader):
        super(ContextMRR, self).__init__()
        hidden_size = args.hidden_size
        embed_size = args.embed_size
        word_vocab_size = loader.vocab.get_length()

        ## word embedding layer
        #self.word_embedding_layer = LookupEncoder(word_vocab_size, embedding_dim=embed_size)  #, pretrain_embedding=loader.pretrain_embedding)

        ## dropout layer
        if args.dropout > 0:
            self._dropout = torch.nn.Dropout(p=args.dropout)
        else:
            self._dropout = lambda x: x

        ## contextual embedding layer
        self.contextual_embedding_layer = RecurrentContext(input_size=embed_size,
                                                           hidden_size=hidden_size,
                                                           num_layers=1)

        ## bidirectional attention flow between question and context
        self.attention_flow_layer1 = BiDAF(2 * hidden_size)

        ## modeling layer for question and context: also reduces the 8d input to a 2d output
        modeling_layer_inputdim = 8 * hidden_size
        self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim, hidden_size)

        '''BIDAF 2'''
        self.contextual_embedding_layer_2 = RecurrentContext(input_size=embed_size,
                                                             hidden_size=hidden_size,
                                                             num_layers=1)

        ## bidirectional attention flow between [q+c] and answer
        self.attention_flow_layer2 = BiDAF(2 * hidden_size)

        ## modeling layer
        modeling_layer_inputdim = 6 * hidden_size
        self.modeling_layer2 = RecurrentContext(modeling_layer_inputdim, hidden_size)

        ## output layer
        ## current implementation: run an MLP on the concatenated hidden states of the answer modeling layer
        output_layer_inputdim = 4 * hidden_size
        self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, batch_query, batch_query_length, batch_query_mask,
                batch_context, batch_context_length, batch_context_mask,
                batch_candidates_sorted, batch_candidate_lengths_sorted,
                batch_candidate_masks_sorted, batch_candidate_unsort,
                gold_index, negative_indices):
        ## Embed query and context
        # (N, J, d)
        #query_embedded = self.word_embedding_layer(batch_query.unsqueeze(0))
        # (N, T, d)
        #context_embedded = self.word_embedding_layer(batch_context.unsqueeze(0))
        query_embedded = batch_query.unsqueeze(0)
        context_embedded = batch_context.unsqueeze(0)

        ## Encode query and context
        # (N, J, 2d)
        query_encoded, _ = self.contextual_embedding_layer(query_embedded, batch_query_length)
        query_encoded = self._dropout(query_encoded)
        # (N, T, 2d)
        context_encoded, _ = self.contextual_embedding_layer(context_embedded, batch_context_length)
        context_encoded = self._dropout(context_encoded)

        ## required to support a single-element batch of questions
        batch_query_mask = batch_query_mask.unsqueeze(0)
        batch_context_mask = batch_context_mask.unsqueeze(0)

        ## BiDAF 1 to get ~U, ~h and G (8d) between context and query
        # (N, T, 8d), (N, T, 2d), (N, 1, 2d)
        context_attention_encoded, query_aware_context_encoded, context_aware_query_encoded = \
            self.attention_flow_layer1(query_encoded, context_encoded,
                                       batch_query_mask, batch_context_mask)

        ## modeling layer 1
        # (N, T, 8d) => (N, T, 2d)
        context_modeled, _ = self.modeling_layer1(context_attention_encoded, batch_context_length)
        context_modeled = self._dropout(context_modeled)

        '''BIDAF 2'''
        ## BiDAF for answers
        batch_size = batch_candidates_sorted.size(0)
        # N=1, so (N, T, 2d) => (N1, T, 2d)
        batch_context_modeled = context_modeled.repeat(batch_size, 1, 1)
        # (N1, K, d)
        #batch_candidates_embedded = self.word_embedding_layer(batch_candidates_sorted)
        batch_candidates_embedded = batch_candidates_sorted
        # (N1, K, 2d)
        batch_candidates_encoded, _ = self.contextual_embedding_layer(
            batch_candidates_embedded, batch_candidate_lengths_sorted)
        batch_candidates_encoded = self._dropout(batch_candidates_encoded)

        answer_attention_encoded, context_aware_answer_encoded, answer_aware_context_encoded = \
            self.attention_flow_layer2(batch_context_modeled, batch_candidates_encoded,
                                       batch_context_mask, batch_candidate_masks_sorted)

        ## concatenate original answer and context-aware answer
        input_to_answer_model = torch.cat(
            [batch_candidates_encoded,
             context_aware_answer_encoded,
             batch_candidates_encoded * context_aware_answer_encoded], dim=-1)

        ## modeling layer 2
        # (N1, K, 8d) => (N1, K, 2d)
        answer_modeled, (answer_hidden_state, answer_cell_state) = self.modeling_layer2(
            input_to_answer_model, batch_candidate_lengths_sorted)
        answer_modeled = self._dropout(answer_modeled)

        answer_modeled_replaced = self.attention_flow_layer2.replace_masked_values(
            answer_modeled.transpose(1, 2),
            batch_candidate_masks_sorted.unsqueeze(1), 1e-7)
        answer_modeled_mask = answer_modeled.transpose(1, 2) * batch_candidate_masks_sorted.unsqueeze(1)

        ## take maximum and average of the hidden embeddings and concatenate them
        answer_concat_hidden = torch.cat(
            (torch.max(answer_modeled_replaced, dim=2)[0],
             torch.mean(answer_modeled_mask, dim=2)), dim=1)

        ## output layer: run the concatenated hidden states through an MLP
        # (N1, 4d) => (N1, 1)
        # answer_concat_hidden = torch.cat([answer_hidden_state[-2], answer_hidden_state[-1]], dim=1)
        answer_scores = self.output_layer(answer_concat_hidden)

        ## unsort the answer scores
        answer_scores_unsorted = torch.index_select(answer_scores, 0, batch_candidate_unsort)

        ## Hinge loss (kept for reference)
        # gold_features = torch.index_select(answer_scores_unsorted, 0, index=gold_index)
        # negative_features = torch.index_select(answer_scores_unsorted, 0, index=negative_indices)
        # #negative_metrics = torch.index_select(batch_metrics, 0, index=negative_indices)
        # #negative_features = negative_features + negative_metrics.unsqueeze(1)
        # max_negative_feature, max_negative_index = torch.max(negative_features, 0)
        # loss = torch.clamp(1 - gold_features + max_negative_feature, 0)
        loss = self.loss(answer_scores_unsorted.transpose(0, 1), gold_index)
        sorted_scores, indices = torch.sort(
            F.log_softmax(answer_scores_unsorted.squeeze(0), dim=0),
            dim=0, descending=True)
        return loss, indices

    def eval(self, batch_query, batch_query_length, batch_query_mask,
             batch_context, batch_context_length, batch_context_mask,
             batch_candidates_sorted, batch_candidate_lengths_sorted,
             batch_candidate_masks_sorted, batch_candidate_unsort):
        # NOTE: this shadows nn.Module.eval(), which normally toggles evaluation mode.
        ## Embed query and context
        # (N, J, d)
        #query_embedded = self.word_embedding_layer(batch_query.unsqueeze(0))
        # (N, T, d)
        #context_embedded = self.word_embedding_layer(batch_context.unsqueeze(0))
        query_embedded = batch_query.unsqueeze(0)
        context_embedded = batch_context.unsqueeze(0)

        ## Encode query and context
        # (N, J, 2d)
        query_encoded, _ = self.contextual_embedding_layer(query_embedded, batch_query_length)
        # (N, T, 2d)
        context_encoded, _ = self.contextual_embedding_layer(context_embedded, batch_context_length)

        ## BiDAF 1 to get ~U, ~h and G (8d) between context and query
        # (N, T, 8d), (N, T, 2d), (N, 1, 2d)
        batch_query_mask = batch_query_mask.unsqueeze(0)
        batch_context_mask = batch_context_mask.unsqueeze(0)
        context_attention_encoded, query_aware_context_encoded, context_aware_query_encoded = \
            self.attention_flow_layer1(query_encoded, context_encoded,
                                       batch_query_mask, batch_context_mask)

        ## modeling layer 1
        # (N, T, 8d) => (N, T, 2d)
        context_modeled, _ = self.modeling_layer1(context_attention_encoded, batch_context_length)

        ## BiDAF for answers
        batch_size = batch_candidates_sorted.size(0)
        # N=1, so (N, T, 2d) => (N1, T, 2d)
        batch_context_modeled = context_modeled.repeat(batch_size, 1, 1)
        # (N1, K, d)
        #batch_candidates_embedded = self.word_embedding_layer(batch_candidates_sorted)
        batch_candidates_embedded = batch_candidates_sorted
        # (N1, K, 2d)
        batch_candidates_encoded, _ = self.contextual_embedding_layer(
            batch_candidates_embedded, batch_candidate_lengths_sorted)

        answer_attention_encoded, context_aware_answer_encoded, answer_aware_context_encoded = \
            self.attention_flow_layer2(batch_context_modeled, batch_candidates_encoded,
                                       batch_context_mask, batch_candidate_masks_sorted)

        input_to_answer_model = torch.cat(
            [batch_candidates_encoded,
             context_aware_answer_encoded,
             batch_candidates_encoded * context_aware_answer_encoded], dim=-1)

        ## modeling layer 2
        # (N1, K, 8d) => (N1, K, 2d)
        answer_modeled, (answer_hidden_state, answer_cell_state) = self.modeling_layer2(
            input_to_answer_model, batch_candidate_lengths_sorted)

        answer_modeled_replaced = self.attention_flow_layer2.replace_masked_values(
            answer_modeled.transpose(1, 2),
            batch_candidate_masks_sorted.unsqueeze(1), 1e-7)
        answer_modeled_mask = answer_modeled.transpose(1, 2) * batch_candidate_masks_sorted.unsqueeze(1)

        ## take maximum and average of the hidden embeddings and concatenate them
        answer_concat_hidden = torch.cat(
            (torch.max(answer_modeled_replaced, dim=2)[0],
             torch.mean(answer_modeled_mask, dim=2)), dim=1)

        ## output layer: run the concatenated hidden states through an MLP
        # answer_concat_hidden = torch.cat([answer_hidden_state[-2], answer_hidden_state[-1]], dim=1)
        answer_scores = self.output_layer(answer_concat_hidden)

        ## unsort the answer scores
        answer_scores_unsorted = torch.index_select(answer_scores, 0, batch_candidate_unsort)
        sorted_scores, indices = torch.sort(
            F.log_softmax(answer_scores_unsorted, dim=0), dim=0, descending=True)
        return indices
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    if args.char_emb:
        char_vectors = util.torch_from_json(args.char_emb_file)
    else:
        char_vectors = None

    # Get model
    log.info('Building model...')
    if args.model == 'attentive_reader':
        model = AttentiveReaderModel(word_vectors=word_vectors,
                                     hidden_size=args.hidden_size,
                                     drop_prob=args.drop_prob,
                                     rnn_layers=args.rnn_layers,
                                     use_gru=args.use_gru)
    elif args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      use_gru=args.use_gru,
                      share_rnn=args.share_rnn,
                      highway=args.highway,
                      share_proj=args.share_proj)
    elif args.model == 'bidaf_stanford':
        model = BiDAFStanford(word_vectors=word_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)

    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.l2_wd)
    elif args.optim == 'adamw':
        optimizer = optim.AdamW(model.parameters(), args.lr, weight_decay=args.l2_wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd)

    # Note: the two scheduler branches were swapped in the original ('plateau'
    # built a constant-LR LambdaLR and the fallback built ReduceLROnPlateau).
    if args.lr_sched == 'plateau':
        scheduler = sched.ReduceLROnPlateau(optimizer)  # stepped on the dev metric below
    else:  # args.lr_sched == 'const'
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    log.info(' Loaded raw SQuAD train dataset...')
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    log.info(' Built train data loader...')
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    log.info(' Loaded raw SQuAD dev dataset...')
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)
    log.info(' Built dev data loader...')

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                if args.char_emb:
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                if args.char_emb:
                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                else:
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                if args.lr_sched != 'plateau':
                    # ReduceLROnPlateau is stepped on the dev metric after evaluation instead.
                    scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args.char_emb)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)
                    if args.lr_sched == 'plateau':
                        scheduler.step(results[args.metric_name])

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)