Example #1
def get_model(modeldict, modelname):
    # Load the checkpoint onto the CPU regardless of where it was saved.
    checkpoint = torch.load(modeldict,
                            map_location=lambda storage, loc: storage)
    dict_args = checkpoint['dict_args']
    if modelname == 'bidaf':
        model = BiDAF(dict_args)
    elif modelname == 'rnet':
        model = RNet(dict_args)
    else:
        raise ValueError('Unknown model name: {}'.format(modelname))
    model.load_state_dict(checkpoint['state_dict'])
    return model
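A minimal usage sketch for the loader above; the checkpoint path is hypothetical, and BiDAF/RNet are assumed to be importable as in the snippet.

model = get_model('checkpoints/bidaf_model.pt', 'bidaf')  # hypothetical path
model.eval()  # switch to inference mode before running predictions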
Example #2
def train(args, data):
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)

        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
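train() above relies on an EMA helper with register/update methods. A minimal sketch of such a class follows, written as an assumption about its behaviour rather than the project's actual implementation.

class EMA:
    """Exponential moving average of model parameters (sketch)."""

    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, val):
        # store an initial copy of the parameter
        self.shadow[name] = val.clone()

    def update(self, name, val):
        # shadow = decay * shadow + (1 - decay) * current value
        assert name in self.shadow
        self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * val

    def get(self, name):
        return self.shadow[name]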
Example #3
    def __init__(self, args, loader):
        super(ContextMRR_Sep_Switched, self).__init__()
        hidden_size = args.hidden_size
        embed_size = args.embed_size

        ## dropout layer
        if args.dropout > 0:
            self._dropout = torch.nn.Dropout(p=args.dropout)
        else:
            self._dropout = lambda x: x

        ## contextual embedding layer
        self.contextual_embedding_layer = RecurrentContext(
            input_size=embed_size, hidden_size=hidden_size, num_layers=1)

        ## bidirectional attention flow between question and context
        self.attention_flow_layer1 = BiDAF(2 * hidden_size)

        modeling_layer_inputdim = 2 * hidden_size
        self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim,
                                                hidden_size,
                                                num_layers=1)

        self.linearrelu = ffnLayer(4 * hidden_size, 4 * hidden_size)

        output_layer_inputdim = 8 * hidden_size
        self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

        self.loss = torch.nn.CrossEntropyLoss()
Example #4
def _prepare(self):
    if self.args.algo == Algos.BIDAF:
        self._create_qa_data()
        self.model = BiDAF(Algos.BIDAF,
                           self.datasets[1].schema,
                           is_infer=self.args.is_infer,
                           vocab_size=self.args.vocab_size,
                           doc_num=self.datasets[1].doc_num,
                           static_emb=(self.args.pre_emb.strip() != ''),
                           emb_dim=self.args.emb_dim,
                           max_a_len=self.args.max_a_len)
    elif self.args.algo == Algos.MLSTM:
        self._create_qa_data()
        self.model = MatchLstm(
            Algos.MLSTM,
            self.datasets[1].schema,
            is_infer=self.args.is_infer,
            vocab_size=self.args.vocab_size,
            doc_num=self.datasets[1].doc_num,
            static_emb=(self.args.pre_emb.strip() != ''),
            emb_dim=self.args.emb_dim,
            max_a_len=self.args.max_a_len)
    elif self.args.algo == Algos.YESNO:
        self._create_yesno_data()
        self.model = OpinionClassifier(
            Algos.YESNO,
            self.datasets[1].schema,
            is_infer=self.args.is_infer,
            vocab_size=self.args.vocab_size,
            static_emb=(self.args.pre_emb.strip() != ''),
            doc_num=1,
            emb_dim=self.args.emb_dim)
    else:
        raise ValueError('Illegal algo: {}'.format(self.args.algo))
Example #5
    def __init__(self, args, loader):
        super(ContextMRR_Sep, self).__init__()
        hidden_size = args.hidden_size
        embed_size = args.embed_size
        word_vocab_size = loader.vocab.get_length()

        ## word embedding layer
        #self.word_embedding_layer = LookupEncoder(word_vocab_size, embedding_dim=embed_size) #, pretrain_embedding=loader.pretrain_embedding)

        ## dropout layer
        if args.dropout > 0:
            self._dropout = torch.nn.Dropout(p=args.dropout)
        else:
            self._dropout = lambda x: x

        ## contextual embedding layer
        self.contextual_embedding_layer = RecurrentContext(
            input_size=embed_size, hidden_size=hidden_size, num_layers=1)

        ## bidirectional attention flow between question and context
        self.attention_flow_layer1 = BiDAF(2 * hidden_size)

        ## modelling layer for question and context: this layer also converts the 8-dimensional input into a two-dimensional output
        modeling_layer_inputdim = 8 * hidden_size
        self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim,
                                                hidden_size,
                                                num_layers=1)
        '''BIDAF 2'''
        self.contextual_embedding_layer_2 = RecurrentContext(
            input_size=embed_size, hidden_size=hidden_size, num_layers=1)

        ## bidirectional attention flow between [q+c] and answer
        self.attention_flow_layer2 = BiDAF(2 * hidden_size)

        ## modeling layer
        modeling_layer_inputdim = 8 * hidden_size
        self.modeling_layer2 = RecurrentContext(modeling_layer_inputdim,
                                                hidden_size,
                                                num_layers=1)

        ## output layer
        ## current implementation: run an mlp on the concatenated hidden states of the answer modeling layer
        output_layer_inputdim = 4 * hidden_size
        self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

        self.loss = torch.nn.CrossEntropyLoss()
Example #6
    def test_forward(self):
        word_vectors = get_word_vectors(vocab, emb_size)
        cw_idxs, c_lengths = get_idxs(batch, clen, vocab)
        qw_idxs, q_lengths = get_idxs(batch, qlen, vocab)

        model = BiDAF(word_vectors, hidden_size)
        p1, p2 = model(cw_idxs, qw_idxs)
        self.assertEqual(p1.size(), (batch, clen))
        self.assertEqual(p2.size(), (batch, clen))
        self.assertTrue(torch.allclose(p1.exp().sum(-1), torch.ones(
            (batch, ))))
        self.assertTrue(torch.allclose(p2.exp().sum(-1), torch.ones(
            (batch, ))))

        model = BiDAF(word_vectors, hidden_size, highway=True)
        p1, p2 = model(cw_idxs, qw_idxs)

        model = BiDAF(word_vectors, hidden_size, use_gru=False)
        p1, p2 = model(cw_idxs, qw_idxs)
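The allclose checks above imply that p1 and p2 are log-probabilities over context positions; a hedged sketch of turning them into a predicted span (not part of the original test):

start_idx = p1.argmax(dim=-1)  # (batch,) most likely answer start position
end_idx = p2.argmax(dim=-1)    # (batch,) most likely answer end position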
Example #7
def _prepare(self):
    if self.args.algo == Algos.BIDAF:
        self._create_qa_data()
        self.model = BiDAF(Algos.BIDAF,
                           self.datasets[1].schema,
                           is_infer=self.args.is_infer,
                           vocab_size=self.args.vocab_size,
                           doc_num=self.datasets[1].doc_num,
                           static_emb=(self.args.pre_emb.strip() != ''),
                           emb_dim=self.args.emb_dim,
                           max_a_len=self.args.max_a_len)
    else:
        raise ValueError('Illegal algo: {}'.format(self.args.algo))
Example #8
    def __init__(self, args, vocab):
        super(ContextMRR, self).__init__()
        hidden_size = args.hidden_size
        embed_size = args.embed_size
        word_vocab_size = vocab.get_length()

        if args.dropout > 0:
            self._dropout = torch.nn.Dropout(p=args.dropout)
        else:
            self._dropout = lambda x: x

        ## word embedding layer
        self.word_embedding_layer = LookupEncoder(word_vocab_size,
                                                  embedding_dim=embed_size)

        ## contextual embedding layer
        self.contextual_embedding_layer = RecurrentContext(
            input_size=embed_size, hidden_size=hidden_size, num_layers=1)

        ## bidirectional attention flow between question and context
        self.attention_flow_layer1 = BiDAF(2 * hidden_size)

        ## modelling layer for question and context: this layer also converts the 8-dimensional input into a two-dimensional output
        modeling_layer_inputdim = 8 * hidden_size
        self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim,
                                                hidden_size)
        self.modeling_dim = 2 * hidden_size

        span_start_input_dim = modeling_layer_inputdim + (2 * hidden_size)
        self._span_predictor = TimeDistributed(
            torch.nn.Linear(span_start_input_dim, 1))

        span_end_input_dim = modeling_layer_inputdim + (2 * hidden_size)
        self._span_end_predictor = TimeDistributed(
            torch.nn.Linear(span_end_input_dim, 1))

        span_end_dim = modeling_layer_inputdim + 3 * self.modeling_dim
        self._span_end_encoder = RecurrentContext(span_end_dim, hidden_size)

        self._span_start_accuracy = Accuracy()
        self._span_end_accuracy = Accuracy()
        self._span_accuracy = Accuracy()
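TimeDistributed above is assumed to behave like the usual wrapper that applies a module independently at every time step; a minimal sketch of that idea (not the project's actual class):

class TimeDistributedSketch(torch.nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        # x: (batch, time, features) -> fold time into the batch dimension,
        # apply the wrapped module, then restore the original layout
        batch, time = x.size(0), x.size(1)
        out = self.module(x.reshape(batch * time, -1))
        return out.view(batch, time, -1)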
Example #9
    def __init__(self, args, loader):
        super(ContextMRR_Sentence_Level, self).__init__()
        hidden_size = args.hidden_size
        embed_size = args.embed_size

        ## dropout layer
        if args.dropout > 0:
            self._dropout = torch.nn.Dropout(p=args.dropout)
        else:
            self._dropout = lambda x: x

        ## contextual embedding layer
        self.contextual_embedding_layer = RecurrentContext(
            input_size=embed_size, hidden_size=hidden_size, num_layers=1)

        ## bidirectional attention flow between question and context
        self.attention_flow_layer1 = BiDAF(2 * hidden_size)

        c2q_linearLayer_dim = 8 * hidden_size
        self.c2q_linearLayer = TimeDistributed(
            nn.Sequential(
                torch.nn.Linear(c2q_linearLayer_dim, 2 * hidden_size),
                torch.nn.ReLU()))

        modeling_layer_inputdim = 2 * hidden_size
        self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim,
                                                hidden_size,
                                                num_layers=1)
        self.hierarchial_layer1 = RecurrentContext(2 * hidden_size,
                                                   hidden_size,
                                                   num_layers=1)

        self.linearrelu = ffnLayer(4 * hidden_size, 4 * hidden_size)

        output_layer_inputdim = 6 * hidden_size
        self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

        self.loss = torch.nn.CrossEntropyLoss()
Example #10
    def __init__(self):
        self.config = self.get_args()
        self.trainset, self.devset, self.config.embed, self.word2index, self.char2index = read.data()

        self.config.unique_chars = len(self.char2index)

        with tf.Graph().as_default() as g:
            if self.config.name == 'bidaf':
                self.model = BiDAF(self.config)
            elif self.config.name == 'bidaf-att':
                self.model = BiDAFSelfAttention(self.config)
            elif self.config.name == 'mnemonic':
                self.model = MnemonicReader(self.config)
            elif self.config.name == 'qanet':
                self.model = QANet(self.config)
            else:
                raise NotImplementedError('Invalid architecture name')

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                saver = tf.train.Saver(max_to_keep=20)
                save_path = os.path.join('models', self.config.name)

                if os.path.exists(save_path):
                    saver.restore(sess, tf.train.latest_checkpoint(save_path))

                if self.config.mode == 'train':
                    self.train(sess, saver)
                else:
                    step = sess.run(self.model.global_step)

                    em, f1 = self.test(sess)
                    print('\nIteration: %d - Exact match: %.2f\tf1: %.2f\t' % (step, em, f1))

                    if self.config.ema_decay > 0:
                        sess.run(self.model.assign_vars)
                        ema, ema_f1 = self.test(sess)
                        print('\nIteration EMA: %d - Exact match: %.2f\tf1: %.2f' % (step, ema, ema_f1))
Example #11
def main():
    # testing_file = "D:/DataMining/QASystem/new_data/test.ann.json"
    testing_file = "D:/DataMining/QASystem/new_data/validation.ann.json"
    # testing_file = "D:/DataMining/QASystem/new_data/training.json"
    trained_model = "checkpoints/model.ckpt"
    embedding_file = "D:/DataMining/QASystem/wiki/wiki.zh.text.vector"
    embedding_size = 60  # word embedding dimension
    hidden_size = 100  # number of hidden-layer units
    keep_prob = 1  # 0.8
    batch_size = 60  # mini-batch size

    max_quelen, max_evilen = get_max_length(testing_file)
    embeddings, word2idx = load_embedding(embedding_file)
    questions, evidences, y1, y2 = load_data(testing_file, word2idx,
                                             max_quelen, max_evilen)
    with tf.Graph().as_default():
        with tf.variable_scope('Model'):
            model = BiDAF(embeddings, max_quelen, max_evilen, embedding_size,
                          hidden_size, keep_prob)
            with tf.Session().as_default() as sess:
                saver = tf.train.Saver()
                print("开始加载模型")
                saver.restore(sess, trained_model)
                print("加载模型完毕")
                # sess.run(tf.global_variables_initializer()) 前面已经使用restore恢复变量了,如果再使用global_variables_initializer,会导致所有学习到的东西清零
                for batch_questions, batch_evidences, batch_y1, batch_y2 in next_batch(
                        questions, evidences, y1, y2, batch_size):
                    feed_dict = {
                        model.x: batch_evidences,
                        model.q: batch_questions,
                        model.y1: batch_y1,
                        model.y2: batch_y2
                    }
                    acc_s, acc_e = sess.run([model.acc_s, model.acc_e],
                                            feed_dict)
                    print('ACC_S: %s\t\tACC_E: %s' % (acc_s, acc_e))
Example #12
    keep_prob = 0.8  # 0.8
    learning_rate = 0.01  # 0.001
    lrdown_rate = 0.9  # 0.8
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"
    cpu_device = "/cpu:0"

    max_quelen, max_evilen = get_max_length(training_file)
    embeddings, word2idx = load_embedding(embedding_file)
    questions, evidences, y1, y2 = load_data(
        training_file, word2idx, max_quelen, max_evilen)
    with tf.Graph().as_default(), tf.device(cpu_device):
        # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        # session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        with tf.variable_scope('Model'):
            model = BiDAF(embeddings, max_quelen, max_evilen, embedding_size, hidden_size, keep_prob)
            with tf.Session().as_default() as sess:  # config=session_conf
                saver = tf.train.Saver()
                print("开始训练")
                sess.run(tf.global_variables_initializer())
                for i in range(epochs):
                    print("正在进行第%s次迭代训练" % (i+1))
                    for batch_questions, batch_evidences, batch_y1, batch_y2 in next_batch(questions, evidences, y1, y2, batch_size):
                        feed_dict = {
                            model.x: batch_evidences,
                            model.q: batch_questions,
                            model.y1: batch_y1,
                            model.y2: batch_y2,
                            model.lr: learning_rate
                        }
                        _, loss, acc_s, acc_e = sess.run(
Example #13
def main(unused_argv):
    # Print an error message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" %
                        unused_argv)

    # Check for Python 2
    if sys.version_info[0] != 2:
        raise Exception(
            "ERROR: You must use Python 2 but you are running Python %i" %
            sys.version_info[0])

    # Print out Tensorflow version
    print "This code was developed and tested on TensorFlow 1.4.1. Your TensorFlow version: %s" % tf.__version__

    # Define train_dir
    if not FLAGS.experiment_name and not FLAGS.train_dir and FLAGS.mode != "official_eval":
        raise Exception(
            "You need to specify either --experiment_name or --train_dir")
    FLAGS.train_dir = FLAGS.train_dir or os.path.join(EXPERIMENTS_DIR,
                                                      FLAGS.experiment_name)

    # Initialize bestmodel directory
    bestmodel_dir = os.path.join(FLAGS.train_dir, "best_checkpoint")

    # Define path for glove vecs
    FLAGS.glove_path = FLAGS.glove_path or os.path.join(
        DEFAULT_DATA_DIR, "glove.6B.{}d.txt".format(FLAGS.embedding_size))

    # Load embedding matrix and vocab mappings
    emb_matrix, word2id, id2word = get_glove(FLAGS.glove_path,
                                             FLAGS.embedding_size)

    # Get filepaths to train/dev datafiles for tokenized queries, contexts and answers
    train_context_path = os.path.join(FLAGS.data_dir, "train.context")
    train_qn_path = os.path.join(FLAGS.data_dir, "train.question")
    train_ans_path = os.path.join(FLAGS.data_dir, "train.span")
    dev_context_path = os.path.join(FLAGS.data_dir, "dev.context")
    dev_qn_path = os.path.join(FLAGS.data_dir, "dev.question")
    dev_ans_path = os.path.join(FLAGS.data_dir, "dev.span")

    # Initialize model
    #qa_model = QAModel(FLAGS, id2word, word2id, emb_matrix)
    bidaf_model = BiDAF(FLAGS, id2word, word2id, emb_matrix)

    # Some GPU settings
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Split by mode
    if FLAGS.mode == "train":

        # Setup train dir and logfile
        if not os.path.exists(FLAGS.train_dir):
            os.makedirs(FLAGS.train_dir)
        file_handler = logging.FileHandler(
            os.path.join(FLAGS.train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Save a record of flags as a .json file in train_dir
        with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        # Make bestmodel dir if necessary
        if not os.path.exists(bestmodel_dir):
            os.makedirs(bestmodel_dir)

        with tf.Session(config=config) as sess:

            # Load most recent model
            initialize_model(sess,
                             bidaf_model,
                             FLAGS.train_dir,
                             expect_exists=False)

            # Train
            bidaf_model.train(sess, train_context_path, train_qn_path,
                              train_ans_path, dev_qn_path, dev_context_path,
                              dev_ans_path)

    elif FLAGS.mode == "show_examples":
        with tf.Session(config=config) as sess:

            # Load best model
            initialize_model(sess,
                             bidaf_model,
                             bestmodel_dir,
                             expect_exists=True)

            # Show examples with F1/EM scores
            _, _ = bidaf_model.check_f1_em(sess,
                                           dev_context_path,
                                           dev_qn_path,
                                           dev_ans_path,
                                           "dev",
                                           num_samples=10,
                                           print_to_screen=True)

    elif FLAGS.mode == "official_eval":
        if FLAGS.json_in_path == "":
            raise Exception(
                "For official_eval mode, you need to specify --json_in_path")
        if FLAGS.ckpt_load_dir == "":
            raise Exception(
                "For official_eval mode, you need to specify --ckpt_load_dir")

        # Read the JSON data from file
        qn_uuid_data, context_token_data, qn_token_data = get_json_data(
            FLAGS.json_in_path)

        with tf.Session(config=config) as sess:

            # Load model from ckpt_load_dir
            initialize_model(sess,
                             bidaf_model,
                             FLAGS.ckpt_load_dir,
                             expect_exists=True)

            # Get a predicted answer for each example in the data
            # Return a mapping answers_dict from uuid to answer
            answers_dict = generate_answers(sess, bidaf_model, word2id,
                                            qn_uuid_data, context_token_data,
                                            qn_token_data)

            # Write the uuid->answer mapping a to json file in root dir
            print "Writing predictions to %s..." % FLAGS.json_out_path
            with io.open(FLAGS.json_out_path, 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers_dict, ensure_ascii=False)))
                print "Wrote predictions to %s" % FLAGS.json_out_path

    else:
        raise Exception("Unexpected value of FLAGS.mode: %s" % FLAGS.mode)
Example #14
    import numpy as np
    q_lens = np.array(q_lens)
    d_lens = np.array(d_lens)
    doc_lens = np.array(doc_lens)
    print(
        'Average query, average number of docs per query and average number of docs'
    )
    print(np.mean(q_lens), np.mean(doc_lens), np.mean(d_lens))
    # 10.405203405865658 5.105676442762536 24.902959215817074

    kv_model = api.load('glove-wiki-gigaword-50')
    model = BiDAF(q_iterable,
                  d_iterable,
                  l_iterable,
                  kv_model,
                  text_maxlen=51,
                  unk_handle_method='zero',
                  epochs=1,
                  batch_size=20)

    # Example of how prediction works
    print('Hello there result: ',
          model.tiny_predict('Hello there', 'general kenobi'))
    print(
        'Hello there batch: ',
        model.batch_tiny_predict(
            'Hello there',
            ['gengeral kenowbi', 'i am groot', 'I dont wear boot']))

    queries, doc_group, label_group, query_ids, doc_id_group = MyOtherWikiIterable(
        os.path.join('experimental_data', 'WikiQACorpus',
Example #15
class ContextMRR(nn.Module):
	def __init__(self, args, loader):
		super(ContextMRR, self).__init__()
		hidden_size = args.hidden_size
		embed_size = args.embed_size
		word_vocab_size = loader.vocab.get_length()

		## word embedding layer
		#self.word_embedding_layer = LookupEncoder(word_vocab_size, embedding_dim=embed_size) #, pretrain_embedding=loader.pretrain_embedding)

		## dropout layer
		if args.dropout > 0:
			self._dropout = torch.nn.Dropout(p=args.dropout)
		else:
			self._dropout = lambda x: x

		## contextual embedding layer
		self.contextual_embedding_layer = RecurrentContext(input_size=embed_size, hidden_size=hidden_size, num_layers=1)

		## bidirectional attention flow between question and context
		self.attention_flow_layer1 = BiDAF(2*hidden_size)

		## modelling layer for question and context: this layer also converts the 8-dimensional input into a two-dimensional output
		modeling_layer_inputdim = 8 * hidden_size
		self.modeling_layer1 = RecurrentContext(modeling_layer_inputdim, hidden_size)

		'''BIDAF 2'''
		self.contextual_embedding_layer_2 = RecurrentContext(input_size=embed_size, hidden_size=hidden_size, num_layers=1)

		## bidirectional attention flow between [q+c] and answer
		self.attention_flow_layer2 = BiDAF(2*hidden_size)

		## modeling layer
		modeling_layer_inputdim = 6*hidden_size
		self.modeling_layer2 = RecurrentContext(modeling_layer_inputdim, hidden_size)

		## output layer
		## current implementation: run an mlp on the concatenated hidden states of the answer modeling layer
		output_layer_inputdim = 4*hidden_size
		self.output_layer = OutputLayer(output_layer_inputdim, hidden_size)

		self.loss = torch.nn.CrossEntropyLoss()


	def forward(self, batch_query, batch_query_length,batch_query_mask,
				batch_context, batch_context_length,batch_context_mask,
				batch_candidates_sorted, batch_candidate_lengths_sorted, batch_candidate_masks_sorted,batch_candidate_unsort,
				gold_index, negative_indices):

		## Embed query and context
		# (N, J, d)
		#query_embedded = self.word_embedding_layer(batch_query.unsqueeze(0))
		# (N, T, d)
		#context_embedded = self.word_embedding_layer(batch_context.unsqueeze(0))

		query_embedded = batch_query.unsqueeze(0)
		context_embedded = batch_context.unsqueeze(0)
		## Encode query and context
		# (N, J, 2d)
		query_encoded,_ = self.contextual_embedding_layer(query_embedded, batch_query_length)
		query_encoded = self._dropout(query_encoded)
		# (N, T, 2d)
		context_encoded,_ = self.contextual_embedding_layer(context_embedded, batch_context_length)
		context_encoded = self._dropout(context_encoded)

		## required to support single element batch of question
		batch_query_mask = batch_query_mask.unsqueeze(0)
		batch_context_mask = batch_context_mask.unsqueeze(0)

		## BiDAF 1 to get ~U, ~h and G (8d) between context and query
		# (N, T, 8d) , (N, T ,2d) , (N, 1, 2d)
		context_attention_encoded, query_aware_context_encoded, context_aware_query_encoded = self.attention_flow_layer1(query_encoded, context_encoded,batch_query_mask,batch_context_mask)

		## modelling layer 1
		# (N, T, 8d) => (N, T, 2d)
		context_modeled,_ = self.modeling_layer1(context_attention_encoded, batch_context_length)
		context_modeled = self._dropout(context_modeled)

		'''
		BIDAF 2
		'''
		## BiDAF for answers
		batch_size = batch_candidates_sorted.size(0)
		# N=1 so (N, T, 2d) => (N1, T, 2d)
		batch_context_modeled = context_modeled.repeat(batch_size,1,1)
		# (N1, K, d)
		#batch_candidates_embedded = self.word_embedding_layer(batch_candidates_sorted)
		batch_candidates_embedded = batch_candidates_sorted
		# (N1, K, 2d)
		batch_candidates_encoded,_ = self.contextual_embedding_layer(batch_candidates_embedded, batch_candidate_lengths_sorted)
		batch_candidates_encoded = self._dropout(batch_candidates_encoded)

		answer_attention_encoded, context_aware_answer_encoded, answer_aware_context_encoded = self.attention_flow_layer2(batch_context_modeled, batch_candidates_encoded, batch_context_mask,batch_candidate_masks_sorted)

		## concatenate original answer and context aware answer
		input_to_answer_model = torch.cat([batch_candidates_encoded,context_aware_answer_encoded,batch_candidates_encoded * context_aware_answer_encoded],dim=-1)

		## modelling layer 2
		# (N1, K, 8d) => (N1, K, 2d)
		answer_modeled, (answer_hidden_state, answer_cell_state) = self.modeling_layer2(input_to_answer_model, batch_candidate_lengths_sorted)
		answer_modeled = self._dropout(answer_modeled)

		answer_modeled_replaced = self.attention_flow_layer2.replace_masked_values(answer_modeled.transpose(1, 2),
																				   batch_candidate_masks_sorted.unsqueeze(
																					   1), 1e-7)
		answer_modeled_mask = answer_modeled.transpose(1, 2) * batch_candidate_masks_sorted.unsqueeze(1)
		answer_concat_hidden = torch.cat(
			(torch.max(answer_modeled_replaced, dim=2)[0], torch.mean(answer_modeled_mask, dim=2)), dim=1)

		## output layer : concatenate hidden dimension of the final answer model layer and run through an MLP : (N1, 2d) => (N1, d)
		# (N1, 2d) => (N1, 1)
		# answer_concat_hidden = torch.cat([answer_hidden_state[-2], answer_hidden_state[-1]], dim=1)

		# (N1, 4d) => (N1, 1)
		# take maximum and average of hidden embeddings and concatenate them


		answer_scores = self.output_layer(answer_concat_hidden)


		## unsort the answer scores
		answer_scores_unsorted = torch.index_select(answer_scores, 0, batch_candidate_unsort)

		## Hinge Loss
		# gold_features = torch.index_select(answer_scores_unsorted, 0, index=gold_index)
		# negative_features = torch.index_select(answer_scores_unsorted, 0, index=negative_indices)
		# #negative_metrics = torch.index_select(batch_metrics, 0, index=negative_indices)
		# #negative_features = negative_features + negative_metrics.unsqueeze(1)
		# max_negative_feature, max_negative_index = torch.max(negative_features, 0)
		# loss = torch.clamp(1 - gold_features + max_negative_feature, 0)

		loss = self.loss(answer_scores_unsorted.transpose(0,1), gold_index)
		sorted, indices = torch.sort(F.log_softmax(answer_scores_unsorted.squeeze(0),dim=0), dim=0, descending=True)
		return loss, indices


	def eval(self,batch_query, batch_query_length,batch_query_mask,
				batch_context, batch_context_length,batch_context_mask,
			 batch_candidates_sorted, batch_candidate_lengths_sorted,batch_candidate_masks_sorted, batch_candidate_unsort):
		## Embed query and context
		# (N, J, d)
		#query_embedded = self.word_embedding_layer(batch_query.unsqueeze(0))
		# (N, T, d)
		#context_embedded = self.word_embedding_layer(batch_context.unsqueeze(0))

		query_embedded = batch_query.unsqueeze(0)
		context_embedded = batch_context.unsqueeze(0)

		## Encode query and context
		# (N, J, 2d)
		query_encoded, _ = self.contextual_embedding_layer(query_embedded, batch_query_length)
		# (N, T, 2d)
		context_encoded, _ = self.contextual_embedding_layer(context_embedded, batch_context_length)

		## BiDAF 1 to get ~U, ~h and G (8d) between context and query
		# (N, T, 8d) , (N, T ,2d) , (N, 1, 2d)
		batch_query_mask = batch_query_mask.unsqueeze(0)
		batch_context_mask = batch_context_mask.unsqueeze(0)

		context_attention_encoded, query_aware_context_encoded, context_aware_query_encoded = self.attention_flow_layer1(
			query_encoded, context_encoded,batch_query_mask,batch_context_mask)

		## modelling layer 1
		# (N, T, 8d) => (N, T, 2d)
		context_modeled, _ = self.modeling_layer1(context_attention_encoded, batch_context_length)

		## BiDAF for answers
		batch_size = batch_candidates_sorted.size(0)
		# N=1 so (N, T, 2d) => (N1, T, 2d)
		batch_context_modeled = context_modeled.repeat(batch_size, 1, 1)
		# (N1, K, d)
		#batch_candidates_embedded = self.word_embedding_layer(batch_candidates_sorted)
		batch_candidates_embedded = batch_candidates_sorted
		# (N1, K, 2d)
		batch_candidates_encoded, _ = self.contextual_embedding_layer(batch_candidates_embedded,
																	  batch_candidate_lengths_sorted)
		answer_attention_encoded, context_aware_answer_encoded, answer_aware_context_encoded = self.attention_flow_layer2(
			batch_context_modeled, batch_candidates_encoded,batch_context_mask,batch_candidate_masks_sorted)

		input_to_answer_model = torch.cat([batch_candidates_encoded, context_aware_answer_encoded,
										   batch_candidates_encoded * context_aware_answer_encoded], dim=-1)

		## modelling layer 2
		# (N1, K, 8d) => (N1, K, 2d)
		answer_modeled, (answer_hidden_state, answer_cell_state) = self.modeling_layer2(input_to_answer_model,
																						batch_candidate_lengths_sorted,)

		answer_modeled_replaced = self.attention_flow_layer2.replace_masked_values(answer_modeled.transpose(1, 2),
																				   batch_candidate_masks_sorted.unsqueeze(
																					   1), 1e-7)
		answer_modeled_mask = answer_modeled.transpose(1, 2) * batch_candidate_masks_sorted.unsqueeze(1)
		answer_concat_hidden = torch.cat(
			(torch.max(answer_modeled_replaced, dim=2)[0], torch.mean(answer_modeled_mask, dim=2)), dim=1)

		## output layer : concatenate hidden dimension of the final answer model layer and run through an MLP : (N1, 2d) => (N1, d)
		# (N1, 2d) => (N1, 1)
		# answer_concat_hidden = torch.cat([answer_hidden_state[-2], answer_hidden_state[-1]], dim=1)

		# (N1, K, 4d) => (N1, 1, 4d)
		# take maximum and average of hidden embeddings and concatenate them

		answer_scores = self.output_layer(answer_concat_hidden)

		## unsort the answer scores
		answer_scores_unsorted = torch.index_select(answer_scores, 0, batch_candidate_unsort)
		sorted, indices = torch.sort(F.log_softmax(answer_scores_unsorted, dim=0), dim=0, descending=True)
		return indices
Example #16
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    if args.char_emb:
        char_vectors = util.torch_from_json(args.char_emb_file)
    else:
        char_vectors = None

    # Get model
    log.info('Building model...')
    # model = BiDAF(word_vectors=word_vectors,
    #               hidden_size=args.hidden_size,
    #               drop_prob=args.drop_prob)
    if args.model == 'attentive_reader':
        model = AttentiveReaderModel(word_vectors=word_vectors,
                                     hidden_size=args.hidden_size,
                                     drop_prob=args.drop_prob,
                                     rnn_layers=args.rnn_layers,
                                     use_gru=args.use_gru)
    elif args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      use_gru=args.use_gru,
                      share_rnn=args.share_rnn,
                      highway=args.highway,
                      share_proj=args.share_proj)
    elif args.model == 'bidaf_stanford':
        model = BiDAFStanford(word_vectors=word_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    elif args.optim == 'adamw':
        optimizer = optim.AdamW(model.parameters(),
                                args.lr,
                                weight_decay=args.l2_wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   args.lr,
                                   weight_decay=args.l2_wd)

    if args.lr_sched == 'plateau':
        scheduler = sched.ReduceLROnPlateau(optimizer)
    else:  # args.lr_sched == 'const'
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    log.info('  Loaded raw SQuAD train dataset...')
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    log.info('  Built train data loader...')
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    log.info('  Loaded raw SQuAD dev dataset...')
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)
    log.info('  Built dev data loader...')

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                if args.char_emb:
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                if args.char_emb:
                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                else:
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args.char_emb)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)