def list_to_token_embeddings(self, outfile_to_dump=None):
        '''
        Given an input vocabulary file, dump all the token embeddings to the
        outfile.  The result can be used as the embedding_weight_file when
        constructing a BidirectionalLanguageModel.
        '''

        #batcher = TokenBatcher(vocab_file)
        vocab = UnicodeCharsVocabulary(self.voc_file_path,
                                       self.max_word_length)
        batcher = Batcher(self.voc_file_path, self.max_word_length)
        embedding_op = self.ops['token_embeddings']
        n_tokens = vocab.size
        embed_dim = int(embedding_op.shape[2])
        embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

        config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            for k in tqdm(range(n_tokens)):
                token = vocab.id_to_word(k)
                char_ids = batcher.batch_sentences(
                    [[token]])[0, 1, :].reshape(1, 1, -1)
                embeddings[k, :] = sess.run(
                    embedding_op, feed_dict={self.ids_placeholder: char_ids})

        with h5py.File(outfile_to_dump, 'w') as fout:
            ds = fout.create_dataset('embedding',
                                     embeddings.shape,
                                     dtype='float32',
                                     data=embeddings)

        return embeddings, vocab._word_to_id
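A minimal sketch of how the dumped file could be consumed, assuming the standard bilm-tf API; every path below is a placeholder, not taken from the original code:

# Hedged usage sketch: the dumped HDF5 becomes the embedding_weight_file of a
# token-input biLM. All file names here are illustrative.
from bilm import BidirectionalLanguageModel

bilm = BidirectionalLanguageModel(
    options_file='options.json',
    weight_file='weights.hdf5',
    use_character_inputs=False,
    embedding_weight_file='elmo_token_embeddings.hdf5',
)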
Example #2
def get_elmo_embeddings(config):

    batcher = Batcher(config.filename_words, 50)

    token_ids = tf.placeholder('int32', shape=(None, None, 50))
    bilm = BidirectionalLanguageModel(
        config.filename_elmo_options,
        config.filename_elmo_weights,
    )

    elmo_embeddings_op = bilm(token_ids)
    elmo_context_input = weight_layers('input',
                                       elmo_embeddings_op,
                                       l2_coef=0.0)

    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.

        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        train = CoNLLDataset(config.filename_train)
        sents_train = [entry[0] for entry in train]
        sent_ids_train = batcher.batch_sentences(sents_train)

        # Compute ELMo representations (here for the input only, for simplicity).

        elmo_input = sess.run(elmo_context_input['weighted_op'],
                              feed_dict={token_ids: sent_ids_train[:1]})
        for batch in sent_ids_train[1:]:
            elmo_input_ = sess.run(elmo_context_input['weighted_op'],
                                   feed_dict={token_ids: batch[np.newaxis, ...]})
            elmo_input = np.vstack((elmo_input, elmo_input_))

        test = CoNLLDataset(config.filename_test)
        sents_test = [entry[0] for entry in test]
        sent_ids_test = batcher.batch_sentences(sents_test)

        elmo_output = sess.run(elmo_context_input['weighted_op'],
                               feed_dict={token_ids: sent_ids_test})

    return elmo_input, elmo_output
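A hedged usage sketch, assuming a config object exposing the attributes referenced above; the Config stand-in and every value in it are illustrative only:

# Illustrative only: this Config class and all its paths are assumptions.
class Config:
    filename_words = 'data/words.txt'
    filename_elmo_options = 'elmo/options.json'
    filename_elmo_weights = 'elmo/weights.hdf5'
    filename_train = 'data/train.conll'
    filename_test = 'data/test.conll'

train_elmo, test_elmo = get_elmo_embeddings(Config())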
Example #3
    def get_feed_dict(self,
                      words,
                      words_raw,
                      labels=None,
                      lr=None,
                      dropout=None):
        char_ids, word_ids = zip(*words)
        self.word = word_ids
        word_ids, sequence_lengths = pad_sequences(
            word_ids, self.config.vocab_words['$pad$'], self.max_word_lengths,
            self.max_sequence_lengths)
        char_ids, word_lengths = pad_sequences(
            char_ids,
            self.config.vocab_chars['$pad$'],
            self.max_word_lengths,
            self.max_sequence_lengths,
            nlevels=2)

        if self.config.use_emlo:
            batcher = Batcher("model_emlo/vocab.txt", 50)
            elmo_char_ids = batcher.batch_sentences(words_raw,
                                                    self.max_sequence_lengths)
        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.use_char_cnn or self.config.use_char_lstm:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths
        if self.config.use_emlo:
            feed[self.char_ids_elmo] = elmo_char_ids

        if labels is not None:
            labels, _ = pad_sequences(labels, 0, self.max_word_lengths,
                                      self.max_sequence_lengths)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
Example #4
class ElmoEmbedding:
    def __init__(self, model_path):
        vocab_file = os.path.join(model_path, 'vocabs.txt')
        options_file = os.path.join(model_path, 'options.json')
        weight_file = os.path.join(model_path, 'weights.hdf5')
        with open(options_file, "r") as fj:
            options = json.load(fj)
        self.max_characters_per_token = options['char_cnn']['max_characters_per_token']        

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, self.max_characters_per_token)
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(options_file, weight_file)


    def __call__(self, tokenized_sentences_lst):
        # Input placeholders to the biLM.
        context_character_ids = tf.placeholder('int32', shape=(None, None, self.max_characters_per_token))

        # Get ops to compute the LM embeddings.
        context_embeddings_op = self.bilm(context_character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
        elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)

        # Now we can compute embeddings.
        context_tokens  = [sentence.split() for sentence in tokenized_sentences_lst]

        with tf.Session() as sess:
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.global_variables_initializer())

            # Create batches of data.
            context_ids = self.batcher.batch_sentences(context_tokens)

            # Compute ELMo representations (here for the input only, for simplicity).
            elmo_context_vecs = sess.run(
                [elmo_context_input['weighted_op']],
                feed_dict={context_character_ids: context_ids}
            )

        return elmo_context_vecs[0]  #, context_tokens, context_ids
    def list_to_embeddings_with_dump(self,
                                     batch: List[List[str]],
                                     outfile_to_dump=None):
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        """
        document_embeddings = []

        if batch == [[]]:
            raise ValueError('Batch should not be empty')
        else:

            if self.word_embedding_file is None:
                batcher = Batcher(self.voc_file_path, self.max_word_length)
            else:
                batcher = TokenBatcher(self.voc_file_path)
            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                ids_list = batcher.batch_sentences(batch)
                with h5py.File(outfile_to_dump, 'w') as fout:
                    for i, ids in enumerate(tqdm(ids_list,
                                                 total=len(ids_list))):
                        _ops = sess.run(
                            self.ops, feed_dict={self.ids_placeholder: [ids]})
                        mask = _ops['mask']
                        lm_embeddings = _ops['lm_embeddings'][0, :]
                        token_embeddings = _ops['token_embeddings']
                        lengths = _ops['lengths']
                        length = int(mask.sum())
                        document_embeddings.append(lm_embeddings)
                        ds = fout.create_dataset('{}'.format(i),
                                                 lm_embeddings.shape,
                                                 dtype='float32',
                                                 data=lm_embeddings)
                document_embeddings = np.asarray(document_embeddings)
        return document_embeddings
    def list_to_embeddings(self, batch: List[List[str]], slice=None):
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        """
        elmo_embeddings = []

        if batch == [[]]:
            if slice is None:
                elmo_embeddings.append(empty_embedding(self.dims))
            else:
                if slice > 2:
                    raise ValueError('Slice cannot be larger than 2')
                elmo_embeddings.append(empty_embedding(self.dims, True))
        else:
            batcher = Batcher(self.voc_file_path, self.max_word_length)
            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                for i, _contents in enumerate(tqdm(batch, total=len(batch))):
                    char_ids = batcher.batch_sentences([_contents])
                    _ops = sess.run(self.ops,
                                    feed_dict={self.ids_placeholder: char_ids})
                    mask = _ops['mask']
                    lm_embeddings = _ops['lm_embeddings']
                    token_embeddings = _ops['token_embeddings']
                    lengths = _ops['lengths']
                    length = int(mask.sum())
                    if slice is None:
                        lm_embeddings_mean = np.apply_over_axes(
                            np.mean, lm_embeddings[0], (0, 1))
                    else:
                        lm_embeddings_mean = np.apply_over_axes(
                            np.mean, lm_embeddings[0][slice], (0))
                    elmo_embeddings.append(lm_embeddings_mean)

        return elmo_embeddings
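A hedged sketch of reading back a file written by list_to_embeddings_with_dump above; the path is a placeholder, and the dataset keys are the sentence indices written in that method:

import h5py

# 'dump.hdf5' stands in for whatever was passed as outfile_to_dump.
with h5py.File('dump.hdf5', 'r') as fin:
    sentence_embeddings = [fin[str(i)][...] for i in range(len(fin.keys()))]
# Each entry has shape (n_lm_layers, sentence_length, lm_dim) for one sentence.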
class ELMoRunner:
    def __init__(self, session, bilm_params):
        self.params = bilm_params

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(self.params.vocab_file,
                               self.params.max_char_len)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.placeholder(
            'int32', shape=(None, None, self.params.max_char_len))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            self.params.options_file,
            self.params.weights_file,
        )

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 l2_coef=0.0,
                                                 use_top_only=True)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())

    def preprocess(self, sentences_words):
        return self.batcher.batch_sentences(sentences_words)

    def __call__(self, batch_sentence_ids):
        (elmo_sentence_input_, ) = self.sess.run(
            [self.elmo_sentence_input['weighted_op']],
            feed_dict={self.sentence_character_ids: batch_sentence_ids})
        return elmo_sentence_input_
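A minimal usage sketch for ELMoRunner; the parameter container and all file paths are assumptions made for illustration, with field names matching those read in __init__:

from collections import namedtuple
import tensorflow as tf

# Hypothetical parameter holder; only the field names are taken from the code above.
BilmParams = namedtuple('BilmParams',
                        ['vocab_file', 'max_char_len', 'options_file', 'weights_file'])
params = BilmParams('elmo/vocab.txt', 50, 'elmo/options.json', 'elmo/weights.hdf5')

with tf.Session() as sess:
    runner = ELMoRunner(sess, params)
    batch_ids = runner.preprocess([['Pretrained', 'biLMs', 'are', 'useful', '.']])
    elmo_vecs = runner(batch_ids)  # shape: (n_sentences, max_sentence_length, lm_dim)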
Example #8
# Now we can compute embeddings.
raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_context = [sentence.split() for sentence in raw_context]
tokenized_question = [
    ['What', 'are', 'biLMs', 'useful', 'for', '?'],
]

with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    context_ids = batcher.batch_sentences(tokenized_context)
    question_ids = batcher.batch_sentences(tokenized_question)

    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_context_input_, elmo_question_input_ = sess.run(
        [
            elmo_context_input['weighted_op'],
            elmo_question_input['weighted_op']
        ],
        feed_dict={
            context_character_ids: context_ids,
            question_character_ids: question_ids
        })
    print(elmo_context_input_, elmo_context_input_.shape)
Example #9
#          set 2 if using elmo at both input and pre-output in a neural model.

# list of list of str. (i-th batch, j-th token, token's surface string)
# [1st_sentence = [1st word, 2nd word, ...],
#  2nd_sentence = [...]]
raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_context = [sentence.split() for sentence in raw_context]
tokenized_question = [
    ['What', 'are', 'biLMs', 'useful', 'for', '?'],
]

# Create batches of data.
context_ids = batcher.batch_sentences(tokenized_context, add_bos_eos=False)
question_ids = batcher.batch_sentences(tokenized_question, add_bos_eos=False)
# numpy.ndarray or cupy.ndarray
# with shape (batchsize, max_length, max_character_length)
# default max_character_length = 50

# gpu id
# if you want to use cpu, set gpu=-1
# gpu = 0
gpu = -1
if gpu >= 0:
    # transfer the model to the gpu
    chainer.cuda.get_device_from_id(gpu).use()
    elmo.to_gpu()
    # transfer input data to the gpu
    context_ids = elmo.xp.asarray(context_ids)
Example #10
class ELMo_Utils(object):
    """
    Impements Elmo functions used by downstream task
    Each tokenized sentence is a list of str, with a batch of sentences a list of tokenized sentences (List[List[str]]).

The Batcher packs these into a shape (n_sentences, max_sentence_length + 2, 50) numpy array of character ids, padding on the right with 0 ids for sentences less then the maximum length. The first and last tokens for each sentence are special begin and end of sentence ids added by the Batcher.

The input character id placeholder can be dimensioned (None, None, 50), with both the batch dimension (axis=0) and time dimension (axis=1) determined for each batch, up the the maximum batch size specified in the BidirectionalLanguageModel constructor.

After running inference with the batch, the return biLM embeddings are a numpy array with shape (n_sentences, 3, max_sentence_length, 1024), after removing the special begin/end tokens.
    """

    START_TOKEN = '<S>'
    END_TOKEN = '</S>'
    UNK_TOKEN = '<UNK>'
    PAD_SNT = '<S></S>'
    PAD_SNT_ID = 0

    def __init__(self,
                 elmo_vocab_file,
                 elmo_weight_file,
                 elmo_option_file,
                 use_character_elmo,
                 use_concat_p,
                 question_window,
                 utterance_cache_file='',
                 passage_cache_file='',
                 question_cache_file=''):
        self.logger = logging.getLogger("dial")
        self.utterance_cache = None
        self.passage_cache = None
        self.question_cache = None
        self.need_q_cache = (question_window > 1)
        self.need_p_cache = use_concat_p
        if os.path.exists(elmo_weight_file) and os.path.exists(
                elmo_option_file) and os.path.exists(elmo_vocab_file):
            # the vocab file exported from the corpus
            self.elmo_vocab_file = elmo_vocab_file
            # elmo weight file
            self.elmo_weight_file = elmo_weight_file
            # elmo option file
            self.elmo_option_file = elmo_option_file
            self.utterance_cache_file = utterance_cache_file
            self.passage_cache_file = passage_cache_file
            self.question_cache_file = question_cache_file
            self.use_character_elmo = use_character_elmo
            with open(self.elmo_option_file, 'r') as fin:
                options = json.load(fin)
            self.output_layers = options['lstm']['n_layers'] + 1
            self.output_dim = 2 * options['lstm']['projection_dim']
            self.logger.info("output_layers :{}, output_dim :{}".format(
                self.output_layers, self.output_dim))
            # by default, the bilm use the character_elmo
            if self.use_character_elmo:
                # max_num_char for characters for a token.
                self.elmo_max_num_char = options['char_cnn'][
                    'max_characters_per_token']
                # line 207 https://github.com/allenai/bilm-tf/blob/ebf52c6ec1012a3672247c2d14ff7bcad7fb812b/bilm/data.py
                # the mask for char id is 0
                self.PAD_TOKEN_CHAR_IDS = np.zeros((self.elmo_max_num_char),
                                                   dtype=np.int32).tolist()
                # use subword character first, which shows extra improvements beside the contextual information.
                self.elmo_char_batcher = Batcher(self.elmo_vocab_file,
                                                 self.elmo_max_num_char)
                # language mode with use_character_inputs = True
                self.elmo_bilm = BidirectionalLanguageModel(
                    self.elmo_option_file, self.elmo_weight_file)
            else:
                # use token batcher
                self.elmo_token_batcher = TokenBatcher(self.elmo_vocab_file)
                # use elmo_bilm with use_character_inputs = False
                self.elmo_bilm = BidirectionalLanguageModel(
                    self.elmo_option_file, self.elmo_weight_file)

            self.chk_load_utterance_cache()
            self.chk_load_passage_cache()
            self.chk_load_question_cache()
        else:
            self.logger.warning(
                "ELMo files not found: elmo_weight_file={}, elmo_option_file={}, elmo_vocab_file={}"
                .format(elmo_weight_file, elmo_option_file, elmo_vocab_file))

    def chk_load_utterance_cache(self):
        if self.utterance_cache_file and os.path.exists(
                self.utterance_cache_file):
            self.utterance_cache = h5py.File(self.utterance_cache_file, 'r')
            #self.utterance_cache_in_mem = {}
            #self.utterance_cache_in_mem['lm_embeddings'] = self.load_h5(self.utterance_cache['lm_embeddings'])
            #self.utterance_cache_in_mem['lengths'] = self.load_h5_lengths(self.utterance_cache['lengths'])
            #self.utterance_cache_in_mem['mask'] = self.load_h5(self.utterance_cache['mask'])
            self.logger.info(
                "Utterance cache loaded from {}, size = {}".format(
                    self.utterance_cache_file,
                    len(self.utterance_cache['lm_embeddings'].keys())))
        else:
            self.utterance_cache = None

    def load_h5(self, h5group):
        x = []
        for index in range(len(h5group.keys())):
            # https://stackoverflow.com/questions/10274476/how-to-export-hdf5-file-to-numpy-using-h5py
            x.append(h5group['{}'.format(index)][...].tolist())
        return x

    def load_h5_lengths(self, h5group):
        x = []
        for index in range(len(h5group.keys())):
            x.extend(h5group['{}'.format(index)][...].tolist())
        return x

    def chk_load_passage_cache(self):
        if self.need_p_cache:
            if self.passage_cache_file and os.path.exists(
                    self.passage_cache_file):
                self.passage_cache = h5py.File(self.passage_cache_file, 'r')
                self.logger.info("Passage cache loaded from {}".format(
                    self.passage_cache_file))
            else:
                self.passage_cache = None
                self.logger.info(
                    "Passage cache needed from {}, it will build soon.".format(
                        self.passage_cache_file))
        else:
            self.passage_cache = None
            self.logger.info("Passage cache not needed")

    def chk_load_question_cache(self):
        if self.need_q_cache:
            if self.question_cache_file and os.path.exists(
                    self.question_cache_file):
                self.question_cache = h5py.File(self.question_cache_file, 'r')
                self.logger.info("Question cache loaded from {}".format(
                    self.question_cache_file))
            else:
                self.question_cache = None
                self.logger.info(
                    "Question cache needed from {}, it will build soon.".
                    format(self.question_cache_file))
        else:
            self.question_cache = None
            self.logger.info("Question cache not needed")

    def need_build_passage_cache(self):
        return self.need_p_cache and self.passage_cache_file != '' and self.passage_cache is None

    def need_build_question_cache(self):
        return self.need_q_cache and self.question_cache_file != '' and self.question_cache is None

    def cleanup(self):
        if self.utterance_cache:
            self.utterance_cache.close()
        if self.passage_cache:
            self.passage_cache.close()
        if self.question_cache:
            self.question_cache.close()
        self.logger.info("Clean up elmo cahce")

    def get_elmo_char_ids(self, sentences):
        '''
        Given a nested list of tokens (with start and end tokens), return the character ids.
        Arguments:
            sentences: List[List[str]]

        Return: [sentence_num, token_num, max_char_num]
        '''
        return self.elmo_char_batcher.batch_sentences(sentences).tolist()

    def get_elmo_token_ids(self, sentences):
        '''
        Given a nested list of tokens (without start and end tokens), return the token ids.

        Arguments:
            sentences: List[List[str]]

        Return: [sentence_num, token_num]
        '''
        return self.elmo_token_batcher.batch_sentences(sentences).tolist()

    def get_elmo_emb_op(self, input_ids_place_holder):
        '''
        Given the input ids placeholder, return the ops for computing the language model
        {
         'lm_embeddings': embedding_op, (None, 3, None, 1024)
         'lengths': sequence_lengths_op, (None, )
         'mask': op to compute mask (None, None)
        }
        '''
        return self.elmo_bilm(input_ids_place_holder)

    def weight_layers(self,
                      name,
                      bilm_ops,
                      l2_coef=None,
                      use_top_only=False,
                      do_layer_norm=False):
        '''
        Weight the layers of a biLM with trainable scalar weights to compute ELMo representations.
        See more details on https://github.com/allenai/bilm-tf/blob/81a4b54937f4dfb93308f709c1cf34dbb37c553e/bilm/elmo.py
        {
           'weighted_op': op to compute weighted average for output,
           'regularization_op': op to compute regularization term
        }
        '''
        return weight_layers(name, bilm_ops, l2_coef, use_top_only,
                             do_layer_norm)

    @staticmethod
    def prepare_elmo_vocab_file(vocab, elmo_vocab_file):
        sorted_word = sorted(vocab.token_cnt,
                             key=vocab.token_cnt.get,
                             reverse=True)
        with open(elmo_vocab_file, 'w') as f:
            f.write('{}\n'.format(ELMo_Utils.START_TOKEN))
            f.write('{}\n'.format(ELMo_Utils.END_TOKEN))
            f.write('{}\n'.format(ELMo_Utils.UNK_TOKEN))
            for item in sorted_word:
                f.write('%s\n' % item)

    def build_elmo_char_cache(self, snt_dict_file, max_snt_length,
                              output_cache_file):
        """
        Go through all the sentences in the dataset and save their ELMo embeddings into the cache file.
        """
        self.logger.info(
            'Prepare ELMo character embeddings for {} with ELMo_Utils ...'.
            format(snt_dict_file))
        ids_placeholder = tf.placeholder('int32',
                                         shape=(None, max_snt_length,
                                                self.elmo_max_num_char))
        ops = self.elmo_bilm(ids_placeholder)
        config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            with open(snt_dict_file,
                      'r') as fin, h5py.File(output_cache_file, 'w') as fout:
                lm_embeddings_h5 = fout.create_group('lm_embeddings')
                lengths_h5 = fout.create_group('lengths')
                mask_h5 = fout.create_group('mask')
                batch_snts = []
                start_snt_id_in_batch = 0
                SNT_BATCH_SIZE = 10
                for line in tqdm(fin, total=get_num_lines(snt_dict_file)):
                    sentence = line.strip().split()
                    batch_snts.append(sentence)
                    length = len(batch_snts)
                    if length >= SNT_BATCH_SIZE:
                        start_snt_id_in_batch += self.consume_batch_snts(
                            sess, ids_placeholder, ops, batch_snts,
                            max_snt_length, start_snt_id_in_batch,
                            lm_embeddings_h5, lengths_h5, mask_h5)
                        batch_snts = []
                if len(batch_snts) > 0:
                    start_snt_id_in_batch += self.consume_batch_snts(
                        sess, ids_placeholder, ops, batch_snts, max_snt_length,
                        start_snt_id_in_batch, lm_embeddings_h5, lengths_h5,
                        mask_h5)
                    batch_snts = []
                self.logger.info(
                    "Finished ELMo embeddings for {} senencesm in {}".format(
                        start_snt_id_in_batch, output_cache_file))

    def consume_batch_snts(self, sess, ids_placeholder, ops, batch_snts,
                           max_snt_length, start_snt_id_in_batch,
                           lm_embeddings_h5, lengths_h5, mask_h5):
        char_ids = self.get_elmo_char_ids(batch_snts)
        char_ids = [(ids + [self.PAD_TOKEN_CHAR_IDS] *
                     (max_snt_length - len(ids)))[:max_snt_length]
                    for ids in char_ids]
        elmo_ops = sess.run(ops, feed_dict={ids_placeholder: char_ids})
        batch_size = len(batch_snts)
        for i in range(batch_size):
            sentence_id = start_snt_id_in_batch + i
            # self.logger.info("create lm for snt {}".format(sentence_id))
            lm_embeddings_h5.create_dataset(
                '{}'.format(sentence_id),
                elmo_ops['lm_embeddings'].shape[1:],
                dtype='float32',
                data=elmo_ops['lm_embeddings'][i, :, :, :],
                compression="gzip")
            lengths_h5.create_dataset('{}'.format(sentence_id), (1, ),
                                      dtype='int32',
                                      data=elmo_ops['lengths'][i])
            mask_h5.create_dataset('{}'.format(sentence_id),
                                   elmo_ops['mask'].shape[1:],
                                   dtype='int32',
                                   data=elmo_ops['mask'][i],
                                   compression="gzip")
        return batch_size

    # TODO for token level embedding.
    def build_elmo_token_cache(self, snt_dict_file, max_snt_length,
                               output_cache_file):
        pass

    def build_elmo_cache(self, snt_dict_file, max_snt_length,
                         output_cache_file):
        if self.use_character_elmo:
            self.build_elmo_char_cache(snt_dict_file, max_snt_length,
                                       output_cache_file)
        else:
            self.build_elmo_token_cache(snt_dict_file, max_snt_length,
                                        output_cache_file)

        self.logger.info(
            'Finished ELMo embeddings for utterance cache with ELMo_Utils')

    def build_elmo_cache_for_samples(self, dataset, max_p_len, max_q_len):
        if (not self.need_p_cache) and (not self.need_q_cache):
            self.logger.info(
                'No need for ELMo embeddings for concated passage and question with ELMo_Utils'
            )
        else:
            # build graph for getting forward elmo embedding.
            self.logger.info('Build ELMo embeddings for p = {}, q = {}'.format(
                self.need_p_cache, self.need_q_cache))
            self.build_pq_elmo_graph()
            if self.need_p_cache:
                p_out = h5py.File(self.passage_cache_file, 'w')
                p_lm_embeddings_h5 = p_out.create_group('lm_embeddings')
                p_lengths_h5 = p_out.create_group('lengths')
                p_mask_h5 = p_out.create_group('mask')

            if self.need_q_cache:
                q_out = h5py.File(self.question_cache_file, 'w')
                q_lm_embeddings_h5 = q_out.create_group('lm_embeddings')
                q_lengths_h5 = q_out.create_group('lengths')
                q_mask_h5 = q_out.create_group('mask')

            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                for set_name in ['train', 'dev', 'test']:
                    for batch_data in tqdm(
                            dataset.gen_mini_batches(set_name,
                                                     20,
                                                     shuffle=False)):
                        samples = batch_data['raw_data']
                        # batch_data is filled with elmo feed_dict
                        self.run_pq_ops(sess, batch_data, max_p_len, max_q_len)
                        for i in range(len(samples)):
                            e_id = '{}'.format(samples[i]['example-id'])
                            try:
                                if self.need_p_cache:
                                    p_lm_embeddings_h5.create_dataset(
                                        e_id,
                                        self.p_ops['lm_embeddings'].shape[1:],
                                        dtype='float32',
                                        data=self.p_ops['lm_embeddings'][
                                            i, :, :, :],
                                        compression="gzip")
                                    p_lengths_h5.create_dataset(
                                        e_id, (1, ),
                                        dtype='int32',
                                        data=self.p_ops['lengths'][i])
                                    p_mask_h5.create_dataset(
                                        e_id,
                                        self.p_ops['mask'].shape[1:],
                                        dtype='int32',
                                        data=self.p_ops['mask'][i, :],
                                        compression="gzip")
                                if self.need_q_cache:
                                    q_lm_embeddings_h5.create_dataset(
                                        e_id,
                                        self.q_ops['lm_embeddings'].shape[1:],
                                        dtype='float32',
                                        data=self.q_ops['lm_embeddings'][
                                            i, :, :, :],
                                        compression="gzip")
                                    q_lengths_h5.create_dataset(
                                        e_id, (1, ),
                                        dtype='int32',
                                        data=self.q_ops['lengths'][i])
                                    q_mask_h5.create_dataset(
                                        e_id,
                                        self.q_ops['mask'].shape[1:],
                                        dtype='int32',
                                        data=self.q_ops['mask'][i, :],
                                        compression="gzip")
                            except Exception:
                                # Skip samples whose ELMo embeddings cannot be written.
                                continue

        self.logger.info(
            'Finished ELMo embeddings for concated passage and question with ELMo_Utils'
        )

    def run_pq_ops(self, sess, batch_data, max_p_len, max_q_len):
        self._static_pq_padding(batch_data, max_p_len, max_q_len)

        if self.need_p_cache and self.need_q_cache:
            self.p_ops, self.q_ops = sess.run(
                [self.p_emb_elmo_op, self.q_emb_elmo_op],
                feed_dict={
                    self.elmo_p: batch_data['elmo_passage_char_ids'],
                    self.elmo_q: batch_data['elmo_question_char_ids']
                })
        elif self.need_p_cache:
            self.p_ops = sess.run(
                self.p_emb_elmo_op,
                feed_dict={self.elmo_p: batch_data['elmo_passage_char_ids']})
        else:
            self.q_ops = sess.run(self.q_emb_elmo_op,
                                  feed_dict={
                                      self.elmo_q:
                                      batch_data['elmo_question_char_ids'],
                                  })

    def build_pq_elmo_graph(self):
        """
        Given the batch_data, this will seperately run tensorflow get the elmo embedding for each batch, which will be cached into file
        Especially , for sample level cache, please make sure that the first dimension for any tensor is batch_size
        """
        start_t = time.time()
        self.logger.info(
            "Start building elmo graph for concatenated p and q ...")
        self.add_elmo_placeholders()
        with tf.device('/device:GPU:0'):
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                # get all elmo op with language mode
                # lm_embeddings : [batch_size, layers, max_length, hidden_dims * 2]
                # lengths : [batch_size]
                # mask : [batch_size, length]
                if self.need_p_cache:
                    self.p_emb_elmo_op = self.elmo_bilm(self.elmo_p)

                if self.need_q_cache:
                    # [batch_size, context_window, layers, max_u_length, hidden_dims * 2]
                    self.q_emb_elmo_op = self.elmo_bilm(self.elmo_q)

    def add_elmo_placeholders(self):
        """
        Add the ELMo placeholders; the logic corresponds to this specific application.
        """
        # for ELMo with character embedding
        # elmo passage character ids for each token in each concatenated passage
        # [batch_size, passage_length, char_length]

        if self.need_p_cache:
            self.elmo_p = tf.placeholder(tf.int32,
                                         [None, None, self.elmo_max_num_char],
                                         'elmo_p')
        # elmo character ids for the whole concatenated question
        # [batch_size, question_length, char_length]
        self.elmo_q = tf.placeholder(tf.int32,
                                     [None, None, self.elmo_max_num_char],
                                     'elmo_q')

    def _static_pq_padding(self, batch_data, max_p_len, max_q_len):
        """
        This is used for static padding, which is useful when the deep contextual embedding is saved with a mask of the whole static length.
        """
        # Also pad the ELMo matrices.
        # In ELMo, the character ids returned by batch_sentences include the start and
        # end tokens, so the char-id length is the token length + 2, while the final
        # embeddings do not contain those special tokens.
        # For compatibility, we keep the ELMo lengths separate from the regular lengths.
        pad_q_len_elmo = 2 + max_q_len
        padding(batch_data, 'elmo_question_char_ids', pad_q_len_elmo,
                self.PAD_TOKEN_CHAR_IDS)

        if self.need_p_cache:
            pad_p_len_elmo = 2 + max_p_len
            padding(batch_data, 'elmo_passage_char_ids', pad_p_len_elmo,
                    self.PAD_TOKEN_CHAR_IDS)

    def _prepare_passage_elmo_feed_dict(self, sample, batch_data,
                                        context_window, token_key_to_use):
        """
        add elmo feed_dict for passage
        """
        e_id_str = '{}'.format(sample['example-id'])
        passage_utterance_tokens_elmo = []
        passage_utterance_length_elmo = []
        passage_tokens_elmo = [ELMo_Utils.START_TOKEN]
        passage_snt_ids = []
        pruned_context_utterances_elmo = sample['messages-so-far'][
            -context_window:]
        for i in range(context_window):
            if i >= len(pruned_context_utterances_elmo):
                current_utterance_tokens_elmo = [
                    ELMo_Utils.START_TOKEN, ELMo_Utils.END_TOKEN
                ]
                passage_snt_ids.append(ELMo_Utils.PAD_SNT_ID)
                passage_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                passage_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
            else:
                utterance = pruned_context_utterances_elmo[i]
                if 'snt_id' in utterance:
                    passage_snt_ids.append(utterance['snt_id'])
                # split version of passages
                current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN]
                current_utterance_tokens_elmo.extend(
                    utterance[token_key_to_use])
                current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
                passage_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                passage_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
                # concatenated version of passages
                # append passages utterance tokens
                passage_tokens_elmo.extend(utterance[token_key_to_use])

        passage_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
        if self.need_build_passage_cache():
            # add into batch_data; no other batch data is added here
            # [batch_size, passage_length, max_char_num]
            batch_data['elmo_passage_char_ids'].append(
                self.get_elmo_char_ids([passage_tokens_elmo])[0])
        else:
            #TODO add passage and question elmo retrieve here.
            if self.need_p_cache:
                self.assemble_elmo_batch_data('p', batch_data, e_id_str,
                                              self.passage_cache)
            for snt_id in passage_snt_ids:
                # self.assemble_elmo_with_snt_ids('pu', batch_data, snt_id)
                # self.assemble_elmo_batch_data_with_mem('pu', batch_data, snt_id, self.utterance_cache_in_mem)
                self.assemble_elmo_batch_data('pu', batch_data, snt_id,
                                              self.utterance_cache)

    def _prepare_question_elmo_feed_dict(self, sample, batch_data,
                                         question_window, token_key_to_use):
        """
        add the question ELMo feed_dict, following the same style as the regular question feed_dict
        """
        e_id_str = '{}'.format(sample['example-id'])
        # for each utterance in question
        question_utterance_tokens_elmo = []
        # for the concatenated question
        # for question utterance length
        question_utterance_length_elmo = []
        question_snt_ids = []
        # add start token, which is also in the vocabulary
        # In the non-ELMo embedding we add self.vocab.sos and self.vocab.eos to the
        # sentence, which are then encoded by the downstream LSTM. ELMo, however,
        # uses the upper-case <S> and </S> tokens, so we must use the upper-case
        # forms here to get an ELMo embedding for them.
        question_tokens_elmo = [ELMo_Utils.START_TOKEN]
        pruned_question_utterance_elmo = sample['messages-so-far'][
            -question_window:]
        for i in range(question_window):
            if i >= len(pruned_question_utterance_elmo):
                current_utterance_tokens_elmo = [
                    ELMo_Utils.START_TOKEN, ELMo_Utils.END_TOKEN
                ]
                question_snt_ids.append(ELMo_Utils.PAD_SNT_ID)
                question_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                question_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
            else:
                utterance = pruned_question_utterance_elmo[i]
                # split version of question
                if 'snt_id' in utterance:
                    question_snt_ids.append(utterance['snt_id'])
                current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN]
                current_utterance_tokens_elmo.extend(
                    utterance[token_key_to_use])
                current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
                # add each utterance token_ids into a parental list
                question_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                question_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
                # concatenated version of question
                # append question utterance tokens
                question_tokens_elmo.extend(utterance[token_key_to_use])

        question_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
        if question_window == 0:
            # If the question is not used, this branch is incorrect;
            # known bug: keep question_window at least 1.
            pass
        else:
            # add elmo question tokenids into batch_data
            if self.need_build_question_cache():
                # add into batch_data
                # [batch_size, question_length, max_char_num]
                batch_data['elmo_question_char_ids'].append(
                    self.get_elmo_char_ids([question_tokens_elmo])[0])
            else:
                # if question_window == 1, just use the utterance cache
                if question_window == 1:
                    # self.assemble_elmo_with_snt_ids('q', batch_data, question_snt_ids[0])
                    # self.assemble_elmo_batch_data_with_mem('q', batch_data, question_snt_ids[0], self.utterance_cache_in_mem)
                    self.assemble_elmo_batch_data('q', batch_data,
                                                  question_snt_ids[0],
                                                  self.utterance_cache)
                else:
                    self.assemble_elmo_batch_data('q', batch_data, e_id_str,
                                                  self.question_cache)

    def _prepare_response_elmo_feed_dict(self, sample, batch_data,
                                         token_key_to_use):
        """
        add the response ELMo feed_dict, following the same style as the regular question feed_dict
        """
        if 'options-for-correct-answers' in sample:
            e_id_str = '{}'.format(sample['example-id'])
            utterance = sample['options-for-correct-answers'][0]
            # split version of question
            current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN]
            current_utterance_tokens_elmo.extend(utterance[token_key_to_use])
            current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
            if 'snt_id' in utterance:
                snt_id = utterance['snt_id']
                self.assemble_elmo_batch_data('r', batch_data, snt_id,
                                              self.utterance_cache)

    def init_elmo_batch_data_sntids(self, batch_data):
        if self.need_p_cache:
            # use elmo cache to retrieve batch_data
            batch_data['elmo_p_lm_embeddings'] = []
            batch_data['elmo_p_lengths'] = []
            batch_data['elmo_p_mask'] = []
        batch_data['elmo_pu_snt_ids'] = []
        batch_data['elmo_q_snt_ids'] = []
        batch_data['elmo_r_snt_ids'] = []

    def init_elmo_batch_data_emb(self, batch_data):
        if self.need_p_cache:
            # use elmo cache to retrieve batch_data
            batch_data['elmo_p_lm_embeddings'] = []
            batch_data['elmo_p_lengths'] = []
            batch_data['elmo_p_mask'] = []

        # for passage_utterance
        batch_data['elmo_pu_lm_embeddings'] = []
        batch_data['elmo_pu_lengths'] = []
        batch_data['elmo_pu_mask'] = []
        # for question
        batch_data['elmo_q_lm_embeddings'] = []
        batch_data['elmo_q_lengths'] = []
        batch_data['elmo_q_mask'] = []
        # for res
        batch_data['elmo_r_lm_embeddings'] = []
        batch_data['elmo_r_lengths'] = []
        batch_data['elmo_r_mask'] = []

    def add_elmo_placeholder_with_cache_sntids(self):
        """
        add placeholders for elmo ops, which will be used in the weight_layers
        """
        if self.need_p_cache:
            self.elmo_p_lm_embeddings = tf.placeholder(
                tf.float32, [None, self.output_layers, None, self.output_dim],
                name='elmo_p_lm_embeddings')
            self.elmo_p_lengths = tf.placeholder(tf.int32, [None],
                                                 name='elmo_p_lengths')
            self.elmo_p_mask = tf.placeholder(tf.int32, [None, None],
                                              name='elmo_p_mask')

        self.elmo_pu_snt_ids = tf.placeholder(tf.int32, [None],
                                              name='elmo_pu_snt_ids')
        self.elmo_q_snt_ids = tf.placeholder(tf.int32, [None],
                                             name='elmo_q_snt_ids')
        self.elmo_r_snt_ids = tf.placeholder(tf.int32, [None],
                                             name='elmo_r_snt_ids')

    def add_elmo_placeholder_with_cache_emb(self):
        """
        add placeholders for elmo ops, which will be used in the weight_layers
        """
        if self.need_p_cache:
            self.elmo_p_lm_embeddings = tf.placeholder(
                tf.float32, [None, self.output_layers, None, self.output_dim],
                name='elmo_p_lm_embeddings')
            self.elmo_p_lengths = tf.placeholder(tf.int32, [None],
                                                 name='elmo_p_lengths')
            self.elmo_p_mask = tf.placeholder(tf.int32, [None, None],
                                              name='elmo_p_mask')

        self.elmo_pu_lm_embeddings = tf.placeholder(
            tf.float32, [None, self.output_layers, None, self.output_dim],
            name='elmo_pu_lm_embeddings')
        self.elmo_pu_lengths = tf.placeholder(tf.int32, [None],
                                              name='elmo_pu_lengths')
        self.elmo_pu_mask = tf.placeholder(tf.int32, [None, None],
                                           name='elmo_pu_mask')
        self.elmo_q_lm_embeddings = tf.placeholder(
            tf.float32, [None, self.output_layers, None, self.output_dim],
            name='elmo_q_lm_embeddings')
        self.elmo_q_lengths = tf.placeholder(tf.int32, [None],
                                             name='elmo_q_lengths')
        self.elmo_q_mask = tf.placeholder(tf.int32, [None, None],
                                          name='elmo_q_mask')
        self.elmo_r_lm_embeddings = tf.placeholder(
            tf.float32, [None, self.output_layers, None, self.output_dim],
            name='elmo_r_lm_embeddings')
        self.elmo_r_lengths = tf.placeholder(tf.int32, [None],
                                             name='elmo_r_lengths')
        self.elmo_r_mask = tf.placeholder(tf.int32, [None, None],
                                          name='elmo_r_mask')

    def prepare_elmo_cache_feed_dict_sntids(self, feed_dict, batch):
        """
        Consistently feed the batch_data prepared in the passage, question, and answer ELMo preparation methods.
        """
        if self.need_p_cache:
            # for elmo_p
            feed_dict[
                self.elmo_p_lm_embeddings] = batch['elmo_p_lm_embeddings']
            feed_dict[self.elmo_p_lengths] = batch['elmo_p_lengths']
            feed_dict[self.elmo_p_mask] = batch['elmo_p_mask']

        # for elmo_q
        feed_dict[self.elmo_q_snt_ids] = batch['elmo_q_snt_ids']
        # for elmo_pu
        feed_dict[self.elmo_pu_snt_ids] = batch['elmo_pu_snt_ids']
        # for elmo_r
        feed_dict[self.elmo_r_snt_ids] = batch['elmo_r_snt_ids']

    def prepare_elmo_cache_feed_dict_emb(self, feed_dict, batch):
        """
        Consistently feed the batch_data prepared in the passage, question, and answer ELMo preparation methods.
        """
        if self.need_p_cache:
            # for elmo_p
            feed_dict[
                self.elmo_p_lm_embeddings] = batch['elmo_p_lm_embeddings']
            feed_dict[self.elmo_p_lengths] = batch['elmo_p_lengths']
            feed_dict[self.elmo_p_mask] = batch['elmo_p_mask']

        # for elmo_q
        feed_dict[self.elmo_q_lm_embeddings] = batch['elmo_q_lm_embeddings']
        feed_dict[self.elmo_q_lengths] = batch['elmo_q_lengths']
        feed_dict[self.elmo_q_mask] = batch['elmo_q_mask']

        # for elmo_pu
        feed_dict[self.elmo_pu_lm_embeddings] = batch['elmo_pu_lm_embeddings']
        feed_dict[self.elmo_pu_lengths] = batch['elmo_pu_lengths']
        feed_dict[self.elmo_pu_mask] = batch['elmo_pu_mask']

        # for elmo_r
        feed_dict[self.elmo_r_lm_embeddings] = batch['elmo_r_lm_embeddings']
        feed_dict[self.elmo_r_lengths] = batch['elmo_r_lengths']
        feed_dict[self.elmo_r_mask] = batch['elmo_r_mask']

    def elmo_embedding_layer_emb(self, elmo_emb_output):
        """
        elmo embedding layers, which will return embedding for p,q,a,pu,qu
        after projections, dim is elmo_emb_output
        if elmo_emb_output == self.output_dim, then no projection will be done
        """
        self.logger.info('build elmo embedding layer')
        if self.need_p_cache:
            p_emb_elmo_op = {
                'lm_embeddings': self.elmo_p_lm_embeddings,
                'lengths': self.elmo_p_lengths,
                'mask': self.elmo_p_mask
            }

        q_emb_elmo_op = {
            'lm_embeddings': self.elmo_q_lm_embeddings,
            'lengths': self.elmo_q_lengths,
            'mask': self.elmo_q_mask
        }

        pu_emb_elmo_op = {
            'lm_embeddings': self.elmo_pu_lm_embeddings,
            'lengths': self.elmo_pu_lengths,
            'mask': self.elmo_pu_mask
        }

        r_emb_elmo_op = {
            'lm_embeddings': self.elmo_r_lm_embeddings,
            'lengths': self.elmo_r_lengths,
            'mask': self.elmo_r_mask
        }

        with tf.device('/device:GPU:1'):
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                if self.need_p_cache:
                    self.p_elmo_emb = self.weight_layers(
                        'input', p_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.q_elmo_emb = self.weight_layers(
                    'input', q_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.pu_elmo_emb = self.weight_layers(
                    'input', pu_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.r_elmo_emb = self.weight_layers(
                    'input', r_emb_elmo_op, l2_coef=0.0)['weighted_op']
                # Project the ELMo embedding (e.g. down to 128 dims) so it can be concatenated with the word embedding.
                if elmo_emb_output == self.output_dim:
                    self.logger.info(
                        "Elmo_emb_output={} is just equal to the output_dim={}, no need to project with fully connected layers for passage and questions"
                        .format(elmo_emb_output, self.output_dim))
                else:
                    self.logger.info(
                        "Elmo_emb_output={}, output_dim={}, project with fully connected layers for question and passage"
                        .format(elmo_emb_output, self.output_dim))
                    if self.need_p_cache:
                        self.p_elmo_emb = tf.contrib.layers.fully_connected(
                            inputs=self.p_elmo_emb,
                            num_outputs=elmo_emb_output,
                            activation_fn=tf.nn.softmax)

                    self.q_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.q_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.pu_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.pu_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.r_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.r_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)

    def elmo_embedding_layer_sntids(self, elmo_emb_output):
        """
        elmo embedding layers, which will return embedding for p,q,a,pu,qu
        after projections, dim is elmo_emb_output
        if elmo_emb_output == self.output_dim, then no projection will be done
        """
        with tf.device('/cpu:0'), tf.variable_scope('elmo_embedding'):
            self.elmo_lm_embeddings_lookup = tf.get_variable(
                'lm_embeddings_lookup',
                shape=np.shape(self.utterance_cache_in_mem['lm_embeddings']),
                initializer=tf.constant_initializer(
                    self.utterance_cache_in_mem['lm_embeddings']),
                trainable=False)

            self.elmo_lengths_lookup = tf.get_variable(
                'lengths_lookup',
                shape=(np.shape(self.utterance_cache_in_mem['lengths'])),
                initializer=tf.constant_initializer(
                    self.utterance_cache_in_mem['lengths']),
                trainable=False)

            self.elmo_mask_lookup = tf.get_variable(
                'mask_lookup',
                shape=np.shape(self.utterance_cache_in_mem['mask']),
                initializer=tf.constant_initializer(
                    self.utterance_cache_in_mem['mask']),
                trainable=False)

        if self.need_p_cache:
            p_emb_elmo_op = {
                'lm_embeddings': self.elmo_p_lm_embeddings,
                'lengths': self.elmo_p_lengths,
                'mask': self.elmo_p_mask
            }

        q_emb_elmo_op = {
            'lm_embeddings':
            tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup,
                                   self.elmo_q_snt_ids),
            'lengths':
            tf.nn.embedding_lookup(self.elmo_lengths_lookup,
                                   self.elmo_q_snt_ids),
            'mask':
            tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_q_snt_ids)
        }

        pu_emb_elmo_op = {
            'lm_embeddings':
            tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup,
                                   self.elmo_pu_snt_ids),
            'lengths':
            tf.nn.embedding_lookup(self.elmo_lengths_lookup,
                                   self.elmo_pu_snt_ids),
            'mask':
            tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_pu_snt_ids)
        }

        r_emb_elmo_op = {
            'lm_embeddings':
            tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup,
                                   self.elmo_r_snt_ids),
            'lengths':
            tf.nn.embedding_lookup(self.elmo_lengths_lookup,
                                   self.elmo_r_snt_ids),
            'mask':
            tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_r_snt_ids)
        }

        with tf.device('/device:GPU:1'):
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                if self.need_p_cache:
                    self.p_elmo_emb = self.weight_layers(
                        'input', p_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.q_elmo_emb = self.weight_layers(
                    'input', q_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.pu_elmo_emb = self.weight_layers(
                    'input', pu_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.r_elmo_emb = self.weight_layers(
                    'input', r_emb_elmo_op, l2_coef=0.0)['weighted_op']
                # project the ELMo embedding down to elmo_emb_output dims (e.g. 128) so it can be concatenated with the word embedding
                if elmo_emb_output == self.output_dim:
                    self.logger.info(
                        "Elmo_emb_output={} is just equal to the output_dim={}, no need to project with fully connected layers for question and passage"
                        .format(elmo_emb_output, self.output_dim))
                else:
                    self.logger.info(
                        "Elmo_emb_output={}, output_dim={}, project with fully connected layers for question and passage"
                        .format(elmo_emb_output, self.output_dim))
                    if self.need_p_cache:
                        self.p_elmo_emb = tf.contrib.layers.fully_connected(
                            inputs=self.p_elmo_emb,
                            num_outputs=elmo_emb_output,
                            activation_fn=tf.nn.softmax)

                    self.q_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.q_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.pu_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.pu_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.r_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.r_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)

    def assemble_elmo_batch_data(self, name, batch_data, id_key, cache):
        lm_embeddings = cache['lm_embeddings']['{}'.format(id_key)][...]
        length = cache['lengths']['{}'.format(id_key)][0]
        mask = cache['mask']['{}'.format(id_key)][...]
        batch_data['elmo_{}_lm_embeddings'.format(name)].append(lm_embeddings)
        batch_data['elmo_{}_lengths'.format(name)].append(length)
        batch_data['elmo_{}_mask'.format(name)].append(mask)

    def assemble_elmo_batch_data_with_mem(self, name, batch_data, id_key,
                                          cache_in_mem):
        """
        id_key is int here, for the snt_id
        """
        lm_embeddings = cache_in_mem['lm_embeddings'][id_key]
        length = cache_in_mem['lengths'][id_key]
        mask = cache_in_mem['mask'][id_key]
        batch_data['elmo_{}_lm_embeddings'.format(name)].append(lm_embeddings)
        batch_data['elmo_{}_lengths'.format(name)].append(length)
        batch_data['elmo_{}_mask'.format(name)].append(mask)

    def assemble_elmo_with_snt_ids(self, name, batch_data, id_key):
        """
        id_key is int here, for the snt_id
        """
        batch_data['elmo_{}_snt_ids'.format(name)].append(id_key)
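    # Usage sketch (illustrative): when assembling a batch, one of the three
    # helpers above is called per sample depending on where the ELMo cache
    # lives: an h5py file (assemble_elmo_batch_data), an in-memory dict
    # (assemble_elmo_batch_data_with_mem), or a deferred lookup by sentence id
    # (assemble_elmo_with_snt_ids), e.g.
    #     self.assemble_elmo_with_snt_ids('q', batch_data, q_snt_id)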
Beispiel #11
0
class NERModel(BaseModel):
    """Specialized class of Model for NER"""

    def __init__(self, config):
        super(NERModel, self).__init__(config)
        self.idx_to_tag = {idx: tag for tag, idx in
                           self.config.vocab_tags.items()}
        self.batcher = Batcher("model_emlo/vocab.txt", 50)


    def add_placeholders(self):
        """Define placeholders = entries to computational graph"""
        # shape = (batch size, max length of sentence in batch)
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None],
                        name="word_ids")

        # shape = (batch size)
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None],
                        name="sequence_lengths")

        # shape = (batch size, max length of sentence, max length of word)
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None],
                        name="char_ids")

        # shape = (batch size, max length of sentence, max length of word)
        self.char_ids_elmo = tf.placeholder(tf.int32, shape=[None, None, 50])

        # shape = (batch_size, max_length of sentence)
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None],
                        name="word_lengths")

        # shape = (batch size, max length of sentence in batch)
        self.labels = tf.placeholder(tf.int32, shape=[None, None],
                        name="labels")

        # hyper parameters
        self.dropout = tf.placeholder(dtype=tf.float32, shape=[],
                        name="dropout")
        self.lr = tf.placeholder(dtype=tf.float32, shape=[],
                        name="lr")


    def get_feed_dict(self, words, words_raw, labels=None, lr=None, dropout=None):
        """Given some data, pad it and build a feed dictionary

        Args:
            words: list of sentences. A sentence is a list of words, where each
                word is either a word id or a tuple (list of char ids, word id)
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob

        Returns:
            dict {placeholder: value}

        """
        # perform padding of the given data
        if self.config.use_chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        #print(words_raw)
        if self.config.use_emlo:
            elmo_char_ids = self.batcher.batch_sentences(words_raw)
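            # batch_sentences returns char ids of shape
            # (batch, max_sentence_len + 2, 50); the two extra positions are
            # the <S>/</S> markers, which the biLM strips from its output again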
        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if self.config.use_emlo:
            feed[self.char_ids_elmo] = elmo_char_ids

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths


    def add_word_embeddings_op(self):
        """Defines self.word_embeddings

        If self.config.embeddings is not None and is a np array initialized
        with pre-trained word vectors, the word embeddings is just a look-up
        and we don't train the vectors. Otherwise, a random matrix with
        the correct shape is initialized.
        """
        with tf.variable_scope("words"):
            if self.config.embeddings is None:
                self.logger.info("WARNING: randomly initializing word vectors")
                _word_embeddings = tf.get_variable(
                        name="_word_embeddings",
                        dtype=tf.float32,
                        shape=[self.config.nwords, self.config.dim_word])
            else:
                _word_embeddings = tf.Variable(
                        self.config.embeddings,
                        name="_word_embeddings",
                        dtype=tf.float32,
                        trainable=self.config.train_embeddings)

            word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
                    self.word_ids, name="word_embeddings")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # get char embeddings matrix
                _char_embeddings = tf.get_variable(
                        name="_char_embeddings",
                        dtype=tf.float32,
                        shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
                        self.char_ids, name="char_embeddings")
                print(char_embeddings.shape)
                # put the time dimension on axis=1
                s = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(char_embeddings,
                        shape=[s[0]*s[1], s[-2], self.config.dim_char])
                word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])

                # bi lstm on chars
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                        state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                        state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, char_embeddings,
                        sequence_length=word_lengths, dtype=tf.float32)

                # read and concat output
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # shape = (batch size, max sentence length, char hidden size)
                lm_char_embedding = tf.reshape(output,
                        shape=[s[0], s[1], 2*self.config.hidden_size_char])
                #word_embeddings = tf.concat([word_embeddings, output], axis=-1)

        #with tf.variable_scope("emlo"):
            #if self.config.use_emlo:
        print("==================")
        # get ELMo embedding
        options_file = 'model_emlo/options.json'
        weight_file = 'model_emlo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        print(bilm)
        # compute the biLM embeddings for the ELMo char ids
        lm_embedding = bilm(self.char_ids_elmo)

        # get the ELMo weighted-average representation
        emlo_embedding = weight_layers('input', lm_embedding, l2_coef=0.0)["weighted_op"]
        print(emlo_embedding.shape)

        print("++++++")
        word_embeddings = tf.concat([word_embeddings, lm_char_embedding, emlo_embedding], axis=-1)
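        # word_embeddings now concatenates the word2vec lookup (dim_word), the
        # char BiLSTM output (2*hidden_size_char) and the ELMo weighted average
        # (1024-dim for the elmo_2x4096_512_2048cnn_2xhighway weights above)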
        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)


    def add_logits_op(self):
        """Defines self.logits

        For each word in each sentence of the batch, it corresponds to a vector
        of scores, of dimension equal to the number of tags.
        """
        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, self.word_embeddings,
                    sequence_length=self.sequence_lengths, dtype=tf.float32)
            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.dropout)

        with tf.variable_scope("proj"):
            W = tf.get_variable("W", dtype=tf.float32,
                    shape=[2*self.config.hidden_size_lstm, self.config.ntags])

            b = tf.get_variable("b", shape=[self.config.ntags],
                    dtype=tf.float32, initializer=tf.zeros_initializer())

            nsteps = tf.shape(output)[1]
            output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm])
            pred = tf.matmul(output, W) + b
            self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags])


    def add_pred_op(self):
        """Defines self.labels_pred

        This op is defined only in the case where we don't use a CRF, since
        then we can make the prediction "in the graph" (with tf functions,
        in other words). With the CRF, the inference is coded in Python and
        not in pure TensorFlow, so we have to make the prediction outside
        the graph.
        """
        if not self.config.use_crf:
            self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1),
                    tf.int32)


    def add_loss_op(self):
        """Defines the loss"""
        if self.config.use_crf:
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                    self.logits, self.labels, self.sequence_lengths)
            self.trans_params = trans_params # need to evaluate it for decoding
            self.loss = tf.reduce_mean(-log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        # for tensorboard
        tf.summary.scalar("loss", self.loss)


    def build(self):
        # NER specific functions
        self.add_placeholders()
        self.add_word_embeddings_op()
        self.add_logits_op()
        self.add_pred_op()
        self.add_loss_op()

        # Generic functions that add training op and initialize session
        self.add_train_op(self.config.lr_method, self.lr, self.loss,
                self.config.clip)
        self.initialize_session() # now self.sess is defined and vars are init


    def predict_batch(self, words, words_raw):
        """
        Args:
            words: list of sentences

        Returns:
            labels_pred: list of labels for each sentence
            sequence_length

        """
        fd, sequence_lengths = self.get_feed_dict(words, words_raw, dropout=1.0)

        if self.config.use_crf:
            # get tag scores and transition params of CRF
            viterbi_sequences = []
            logits, trans_params = self.sess.run(
                    [self.logits, self.trans_params], feed_dict=fd)

            # iterate over the sentences because there is no batching in viterbi_decode
            for logit, sequence_length in zip(logits, sequence_lengths):
                logit = logit[:sequence_length] # keep only the valid steps
                viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                        logit, trans_params)
                viterbi_sequences += [viterbi_seq]

            return viterbi_sequences, sequence_lengths

        else:
            labels_pred = self.sess.run(self.labels_pred, feed_dict=fd)

            return labels_pred, sequence_lengths


    def run_epoch(self, train, dev, epoch):
        """Performs one complete pass over the train set and evaluate on dev

        Args:
            train: dataset that yields tuple of sentences, tags
            dev: dataset
            epoch: (int) index of the current epoch

        Returns:
            f1: (python float), score to select model on, higher is better

        """
        # progbar stuff for logging
        batch_size = self.config.batch_size
        nbatches = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=nbatches)

        # iterate over dataset
        for i, (words, labels, words_raw) in enumerate(minibatches(train, batch_size)):
            fd, _ = self.get_feed_dict(words, words_raw, labels, self.config.lr,
                    self.config.dropout)

            _, train_loss, summary = self.sess.run(
                    [self.train_op, self.loss, self.merged], feed_dict=fd)

            prog.update(i + 1, [("train loss", train_loss)])

            # tensorboard
            if i % 10 == 0:
                self.file_writer.add_summary(summary, epoch*nbatches + i)

        metrics = self.run_evaluate(dev)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        self.logger.info(msg)

        return metrics["f1"]


    def run_evaluate(self, test):
        """Evaluates performance on test set

        Args:
            test: dataset that yields tuple of (sentences, tags)

        Returns:
            metrics: (dict) metrics["acc"] = 98.4, ...

        """
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels, words_raw in minibatches(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(words, words_raw)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                lab      = lab[:length]
                lab_pred = lab_pred[:length]
                accs    += [a==b for (a, b) in zip(lab, lab_pred)]

                lab_chunks      = set(get_chunks(lab, self.config.vocab_tags))
                lab_pred_chunks = set(get_chunks(lab_pred,
                                                 self.config.vocab_tags))

                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds   += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        p   = correct_preds / total_preds if correct_preds > 0 else 0
        r   = correct_preds / total_correct if correct_preds > 0 else 0
        f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)

        return {"acc": 100*acc, "f1": 100*f1}


    def predict(self, words_raw):
        """Returns list of tags

        Args:
            words_raw: list of words (string), just one sentence (no batch)

        Returns:
            preds: list of tags (string), one for each word in the sentence

        """
        words = [self.config.processing_word(w) for w in words_raw]
        if type(words[0]) == tuple:
            words = list(zip(*words))
        pred_ids, _ = self.predict_batch([words], [words_raw])
        preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]

        return preds
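    # Example usage (illustrative; assumes BaseModel provides restore_session
    # and that config.dir_model points at a trained model, both of which are
    # assumptions here):
    #     model = NERModel(config)
    #     model.build()
    #     model.restore_session(config.dir_model)
    #     print(model.predict(["John", "lives", "in", "New", "York"]))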
Beispiel #12
0
class NERModel(BaseModel):
    """Specialized class of Model for NER"""
    def __init__(self, config):
        super(NERModel, self).__init__(config)
        self.idx_to_tag = {
            idx: tag
            for tag, idx in list(self.config.vocab_tags.items())
        }
        if self.config.use_elmo:
            # self.elmo_inputs = []
            self.batcher = Batcher(self.config.filename_words, 50)
            self.bilm = BidirectionalLanguageModel(
                self.config.filename_elmo_options,
                self.config.filename_elmo_weights)
            self.elmo_token_ids = tf.placeholder('int32',
                                                 shape=(None, None, 50))
            self.elmo_embeddings_op = self.bilm(self.elmo_token_ids)
            self.elmo_embeddings_input = weight_layers('input',
                                                       self.elmo_embeddings_op,
                                                       l2_coef=0.0)

    def add_placeholders(self):
        """Define placeholders = entries to computational graph"""
        # shape = (batch size, max length of sentence in batch)
        self.word_ids = tf.placeholder(tf.int32, \
                    shape=[self.config.batch_size, self.config.max_length_words], name="word_ids")

        # shape = (batch size)
        self.sequence_lengths = tf.placeholder(tf.int32,
                                               shape=[self.config.batch_size],
                                               name="sequence_lengths")

        # shape = (batch size, max length of sentence, max length of word)
        self.char_ids = tf.placeholder(tf.int32,\
                    shape=[self.config.batch_size, self.config.max_length_words, self.config.max_length_chars],
                        name="char_ids")

        # shape = (batch size, max length sentences, 1024)
        self.elmo_embeddings = tf.placeholder(
            tf.float32,
            shape=(self.config.batch_size, self.config.max_length_words,
                   self.config.elmo_size))

        self.elmo_and_char_embeddings = tf.placeholder(
            tf.float32,
            shape=[
                self.config.batch_size, self.config.max_length_words,
                self.config.elmo_chars_size
            ])

        #shape = (batch_size, max_length sentences, Elmo emb size + Char emb size + word2vec emb size_
        self.word_elmo_char_embeddings = tf.placeholder(tf.float32, \
                                                        shape=[self.config.batch_size, self.config.max_length_words,
                                                               self.config.elmo_chars_size+self.config.dim_char])
        # shape = (batch_size, max_length of sentence)
        self.word_lengths = tf.placeholder(
            tf.int32,
            shape=[self.config.batch_size, self.config.max_length_words],
            name="word_lengths")

        # shape = (batch size, max length of sentence in batch)
        self.labels = tf.placeholder(
            tf.int32,
            shape=[self.config.batch_size, self.config.max_length_words],
            name="labels")

        #bool
        self.is_test = tf.placeholder(tf.bool)

        # hyper parameters
        self.dropout = tf.placeholder(dtype=tf.float32,
                                      shape=[],
                                      name="dropout")
        self.lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")

    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """Given some data, pad it and build a feed dictionary

        Args:
            words: list of sentences. A sentence is a list of words, where each
                word is either a word id or a tuple (list of char ids, word id)
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob

        Returns:
            dict {placeholder: value}

        """
        # perform padding of the given data
        # self.is_test = self.config.is_test

        if self.config.use_elmo:
            if self.config.use_chars and self.config.use_elmo_and_words:
                char_ids, words_embs, word_ids = list(zip(*words))
                char_ids, word_lengths = pad_sequences(char_ids,
                                                       pad_tok=0,
                                                       nlevels=2)
                word_ids, sequence_lengths = pad_sequences(word_ids, '_PAD_')

            elif self.config.use_chars:
                char_ids, word_ids = list(zip(*words))
                char_ids, word_lengths = pad_sequences(char_ids,
                                                       pad_tok=0,
                                                       nlevels=2)
                word_ids, sequence_lengths = pad_sequences(word_ids, '_PAD_')
            else:
                word_ids, sequence_lengths = pad_sequences(words, '_PAD_')
            elmo_ids = self.batcher.batch_sentences(word_ids)
            elmo_embeddings = self.sess.run(
                [self.elmo_embeddings_input['weighted_op']],
                feed_dict={self.elmo_token_ids: elmo_ids})
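            # the ELMo vectors are computed eagerly with a separate sess.run
            # and fed back through the self.elmo_embeddings placeholder, so
            # the biLM itself stays outside the tagger's training graph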

        elif self.config.use_chars:
            char_ids, word_ids = list(zip(*words))
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   pad_tok=0,
                                                   nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        if self.config.use_elmo:
            feed = {
                self.elmo_embeddings: elmo_embeddings[0],
                self.sequence_lengths: sequence_lengths
            }

        else:
            feed = {
                self.word_ids: word_ids,
                self.sequence_lengths: sequence_lengths
            }

        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if self.config.use_elmo_and_words:
            feed[self.word_ids] = words_embs

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        feed[self.is_test] = self.config.testing

        return feed, sequence_lengths

    def add_word_embeddings_op(self):
        """Defines self.word_embeddings

        If self.config.embeddings is not None and is a np array initialized
        with pre-trained word vectors, the word embeddings is just a look-up
        and we don't train the vectors. Otherwise, a random matrix with
        the correct shape is initialized.
        """
        with tf.variable_scope("words"):
            if self.config.embeddings is None:
                self.logger.info("WARNING: randomly initializing word vectors")
                _word_embeddings = tf.get_variable(
                    name="_word_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nwords, self.config.dim_word])
            else:
                _word_embeddings = tf.Variable(
                    self.config.embeddings,
                    name="_word_embeddings",
                    dtype=tf.float32,
                    trainable=self.config.train_embeddings)

            word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
                                                     self.word_ids,
                                                     name="word_embeddings")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # get char embeddings matrix
                _char_embeddings = tf.get_variable(
                    name="_char_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(
                    _char_embeddings, self.char_ids, name="char_embeddings")

                # put the time dimension on axis=1
                s = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(
                    char_embeddings,
                    shape=[s[0] * s[1], s[-2], self.config.dim_char])
                word_lengths = tf.reshape(self.word_lengths,
                                          shape=[s[0] * s[1]])

                # bi lstm on chars
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                                                  state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                                                  state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw,
                    cell_bw,
                    char_embeddings,
                    sequence_length=word_lengths,
                    dtype=tf.float32)

                # read and concat output
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # shape = (batch size, max sentence length, char hidden size)
                output = tf.reshape(
                    output,
                    shape=[s[0], s[1], 2 * self.config.hidden_size_char])
                word_embeddings = tf.concat([word_embeddings, output], axis=-1)

        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

    def add_chars_elmo_highway_op(self):
        elmo_embeddings = self.elmo_embeddings
        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # get char embeddings matrix
                _char_embeddings = tf.get_variable(
                    name="_char_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(
                    _char_embeddings, self.char_ids, name="char_embeddings")

            cnns_list = []
            for filter_size, kernel_size in zip(self.config.filters,
                                                self.config.kernels):
                cnn_2d = tf.layers.conv2d(inputs=char_embeddings,
                                          filters=filter_size,
                                          kernel_size=kernel_size,
                                          strides=self.config.strides,
                                          padding='same',
                                          name="kernel_%d" % kernel_size)
                cnn_2d_bn = tf.layers.batch_normalization(inputs=cnn_2d,
                                                          name="batchnorm_%d" %
                                                          kernel_size)
                cnn_2d_act = tf.nn.tanh(cnn_2d_bn,
                                        name="cnn_tanh_%d" % kernel_size)
                cnn_2d_mp = tf.layers.max_pooling2d(inputs=cnn_2d_act,
                                                    pool_size=1,
                                                    strides=1,
                                                    name="cnn_mp_%d" %
                                                    kernel_size)
                cnn_2d_rd = tf.reduce_mean(cnn_2d_mp, axis=[2])
                cnns_list.append(cnn_2d_rd)

            cnns = tf.concat(cnns_list, axis=-1)

            cnns_h = highway_layer(cnns,
                                   bias=self.config.highway_bias,
                                   bias_start=self.config.highway_bias_start,
                                   scope='highway_layer')
        elmo_embeddings = tf.concat([elmo_embeddings, cnns_h], axis=-1)
        self.elmo_and_char_embeddings = tf.nn.dropout(elmo_embeddings,
                                                      self.dropout)

    def add_word_char_highway_embeddings_op(self):
        with tf.variable_scope("words"):
            if self.config.embeddings is None:
                self.logger.info("WARNING: randomly initializing word vectors")
                _word_embeddings = tf.get_variable(
                    name="_word_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nwords, self.config.dim_word])
            else:
                _word_embeddings = tf.Variable(
                    self.config.embeddings,
                    name="_word_embeddings",
                    dtype=tf.float32,
                    trainable=self.config.train_embeddings)

            word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
                                                     self.word_ids,
                                                     name="word_embeddings")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                _char_embeddings = tf.get_variable(
                    name="_char_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(
                    _char_embeddings, self.char_ids, name="char_embeddings")

            cnns_list = []
            for filter_size, kernel_size in zip(self.config.filters,
                                                self.config.kernels):
                cnn_2d = tf.layers.conv2d(inputs=char_embeddings,
                                          filters=filter_size,
                                          kernel_size=kernel_size,
                                          strides=1,
                                          padding='same',
                                          name="kernel_%d" % kernel_size)
                cnn_2d_bn = tf.layers.batch_normalization(inputs=cnn_2d,
                                                          name="batchnorm_%d" %
                                                          kernel_size)
                cnn_2d_act = tf.nn.tanh(cnn_2d_bn,
                                        name="cnn_tanh_%d" % kernel_size)
                cnn_2d_mp = tf.layers.max_pooling2d(inputs=cnn_2d_act,
                                                    pool_size=1,
                                                    strides=1,
                                                    name="cnn_mp_%d" %
                                                    kernel_size)
                cnn_2d_rd = tf.reduce_mean(cnn_2d_mp, axis=[2])
                cnns_list.append(cnn_2d_rd)

            cnns = tf.concat(cnns_list, axis=-1)

            cnns_h = highway_layer(cnns,
                                   bias=self.config.highway_bias,
                                   bias_start=self.config.highway_bias_start,
                                   scope='highway_layer')

        word_embeddings = tf.concat([word_embeddings, cnns_h], axis=-1)
        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

    def add_word_char_1d_highway_embeddings_op(self):
        with tf.variable_scope("words"):
            if self.config.embeddings is None:
                self.logger.info("WARNING: randomly initializing word vectors")
                _word_embeddings = tf.get_variable(
                    name="_word_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nwords, self.config.dim_word])
            else:
                _word_embeddings = tf.Variable(
                    self.config.embeddings,
                    name="_word_embeddings",
                    dtype=tf.float32,
                    trainable=self.config.train_embeddings)

            word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
                                                     self.word_ids,
                                                     name="word_embeddings")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                _char_embeddings = tf.get_variable(
                    name="_char_embeddings",
                    dtype=tf.float32,
                    shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(
                    _char_embeddings, self.char_ids, name="char_embeddings")

                # put the time dimension on axis=1 for cnn1d
                s = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(
                    char_embeddings,
                    shape=[s[0] * s[1], s[-2], self.config.dim_char])
                # word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])

                cnns_list = []
                for filter_size, kernel_size in zip(self.config.filters,
                                                    self.config.kernels):
                    cnn_1d = tf.layers.conv1d(inputs=char_embeddings,
                                              filters=filter_size,
                                              kernel_size=kernel_size,
                                              strides=1,
                                              padding='same',
                                              name="kernel_%d" % kernel_size)
                    cnn_1d_bn = tf.layers.batch_normalization(
                        inputs=cnn_1d, name="batchnorm_%d" % kernel_size)
                    cnn_1d_act = tf.nn.tanh(cnn_1d_bn,
                                            name="cnn_tanh_%d" % kernel_size)
                    cnn_1d_mp = tf.layers.max_pooling1d(inputs=cnn_1d_act,
                                                        pool_size=1,
                                                        strides=1,
                                                        name="cnn_mp_%d" %
                                                        kernel_size)
                    cnn_f = tf.reshape(cnn_1d_mp,
                                       shape=[s[0], s[1], s[2] * filter_size])
                    cnns_list.append(cnn_f)

                cnns = tf.concat(cnns_list, axis=-1)

        word_embeddings = tf.concat([word_embeddings, cnns], axis=-1)
        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

    def add_logits_op_elmo_rdbilstm(self):
        if self.config.use_chars:
            input = self.elmo_and_char_embeddings
        else:
            input = self.elmo_embeddings

        # attention-global
        with tf.variable_scope('attention'):
            attn, alphas = attention(input)
            input = tf.expand_dims(attn, axis=1) * input

        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            (output_fw,
             output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                             cell_bw,
                                                             input,
                                                             dtype=tf.float32)
            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.dropout)

        input_h = highway_layer(input,
                                bias=self.config.highway_bias,
                                bias_start=self.config.highway_bias_start,
                                scope='highway_layer')

        output_h = tf.math.add(input_h, output)

        with tf.variable_scope("rdbilstm"):
            cell_fw_rd = tf.contrib.rnn.LSTMCell(self.config.rdsize)
            cell_bw_rd = tf.contrib.rnn.LSTMCell(self.config.rdsize)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw_rd,
                cell_bw_rd,
                output_h,
                sequence_length=self.sequence_lengths,
                dtype=tf.float32)
            output_rd = tf.concat([output_fw, output_bw], axis=-1)
            output_rd = tf.nn.dropout(output_rd, self.dropout)

        with tf.variable_scope("projrd"):
            W = tf.get_variable(
                "W",
                dtype=tf.float32,
                shape=[2 * self.config.rdsize, self.config.ntags])

            b = tf.get_variable("b",
                                shape=[self.config.ntags],
                                dtype=tf.float32,
                                initializer=tf.glorot_uniform_initializer())

            nsteps = tf.shape(output_rd)[1]
            output = tf.reshape(output_rd, [-1, 2 * self.config.rdsize])

            pred = tf.matmul(output, W) + b
            self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags])

    def add_logits_op(self):
        """Defines self.logits

        For each word in each sentence of the batch, it corresponds to a vector
        of scores, of dimension equal to the number of tags.
        """
        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                self.word_embeddings,
                sequence_length=self.sequence_lengths,
                dtype=tf.float32)
            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.dropout)

        with tf.variable_scope("proj"):
            W = tf.get_variable(
                "W",
                dtype=tf.float32,
                shape=[2 * self.config.hidden_size_lstm, self.config.ntags])

            b = tf.get_variable("b",
                                shape=[self.config.ntags],
                                dtype=tf.float32,
                                initializer=tf.zeros_initializer())

            nsteps = tf.shape(output)[1]
            output = tf.reshape(output, [-1, 2 * self.config.hidden_size_lstm])

            pred = tf.matmul(output, W) + b
            self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags])

    def add_pred_op(self):
        """Defines self.labels_pred

        This op is defined only in the case where we don't use a CRF, since
        then we can make the prediction "in the graph" (with tf functions,
        in other words). With the CRF, the inference is coded in Python and
        not in pure TensorFlow, so we have to make the prediction outside
        the graph.
        """
        if not self.config.use_crf:
            self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1),
                                       tf.int32)

    def add_loss_op(self):
        """Defines the loss"""
        if self.config.use_crf:
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                self.logits, self.labels, self.sequence_lengths)
            self.trans_params = trans_params  # need to evaluate it for decoding
            self.loss = tf.reduce_mean(-log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        # for tensorboard
        tf.summary.scalar("loss", self.loss)

    def build(self):
        self.add_placeholders()
        if self.config.use_elmo:
            self.add_chars_elmo_highway_op()
            self.add_logits_op_elmo_rdbilstm()
        elif self.config.char_max:
            self.add_word_embeddings_op()
            self.add_word_char_cnn_bilstm_embeddings_op()
            self.add_logits_highway_op()
        else:
            self.add_word_char_highway_embeddings_op()  #original -lstm+cnn2d
            self.add_logits_op()
        self.add_pred_op()
        self.add_loss_op()

        # Generic functions that add training op and initialize session
        self.add_train_op(self.config.lr_method, self.lr, self.loss,
                          self.config.clip)
        self.initialize_session()  # now self.sess is defined and vars are init

    def predict_batch(self, words):
        """
        Args:
            words: list of sentences

        Returns:
            labels_pred: list of labels for each sentence
            sequence_length

        """
        fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0)

        if self.config.use_crf:
            # get tag scores and transition params of CRF
            viterbi_sequences = []
            logits, trans_params = self.sess.run(
                [self.logits, self.trans_params], feed_dict=fd)

            # iterate over the sentences because there is no batching in viterbi_decode
            for logit, sequence_length in zip(logits, sequence_lengths):
                logit = logit[:sequence_length]  # keep only the valid steps
                viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                    logit, trans_params)
                viterbi_sequences += [viterbi_seq]

            return viterbi_sequences, sequence_lengths

        else:
            labels_pred = self.sess.run(self.labels_pred, feed_dict=fd)

            return labels_pred, sequence_lengths

    def batch_to_batchsize(self, batch_size, words, labels):
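        """Pad a batch that has fewer than batch_size entries with dummy
        samples so it matches the fixed batch_size expected by the
        placeholders."""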
        import numpy as np
        # size = 5
        size = len(words)
        if batch_size > size:
            diff = batch_size - size
            if not self.config.use_elmo:
                pad_id = 0
                pad_char = [0] * self.config.max_length_chars
            else:
                pad_id = '_PAD_'
                pad_char = [self.config.vocab_chars[pad_id]
                            ] * self.config.max_length_chars
            pad_label = self.config.vocab_tags[NONE]
            pad_char_entries = ([pad_char] * self.config.max_length_words)
            pad_word_entries = [pad_id] * self.config.max_length_words
            if self.config.use_elmo_and_words:
                pad_word_id = [self.config.vocab_words[pad_id]
                               ] * self.config.max_length_words
                pad_entry_words = (pad_char_entries, pad_word_id,
                                   pad_word_entries)
            else:
                pad_entry_words = (pad_char_entries, pad_word_entries)
            pad_entry_labels = np.array([pad_label] *
                                        self.config.max_length_words,
                                        dtype=object)
            for _ in range(0, diff):
                words.append(pad_entry_words)
                labels.append(pad_entry_labels)
            return words, labels
        else:
            return words, labels

    def run_epoch(self, train, dev, epoch):
        """Performs one complete pass over the train set and evaluate on dev

        Args:
            train: dataset that yields tuple of sentences, tags
            dev: dataset
            epoch: (int) index of the current epoch

        Returns:
            f1: (python float), score to select model on, higher is better

        """
        # progbar stuff for logging
        batch_size = self.config.batch_size
        nbatches = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=nbatches)
        # self.is_test = self.config.testing
        # iterate over dataset
        for i, (words, labels) in enumerate(minibatches(train, batch_size)):
            self.config.iter = i
            # self.is_test = self.config.testing
            words, labels = self.batch_to_batchsize(batch_size, words, labels)
            fd, _ = self.get_feed_dict(words, labels, self.config.lr,
                                       self.config.dropout)

            _, train_loss, summary = self.sess.run(
                [self.train_op, self.loss, self.merged], feed_dict=fd)

            prog.update(i + 1, [("train loss", train_loss)])

            # tensorboard
            if i % 10 == 0:
                self.file_writer.add_summary(summary, epoch * nbatches + i)

        # metrics = self.run_evaluate(dev)
        metrics = self.run_eval_pad(dev)
        msg = " - ".join(
            ["{} {:04.2f}".format(k, v) for k, v in list(metrics.items())])
        self.logger.info(msg)

        return metrics["f1"]

    def run_epoch_eval(self, train, dev, epoch):
        """Performs one complete pass over the train set and evaluate on dev

        Args:
            train: dataset that yields tuple of sentences, tags
            dev: dataset
            epoch: (int) index of the current epoch

        Returns:
            f1: (python float), score to select model on, higher is better

        """
        # progbar stuff for logging
        import itertools
        batch_size = self.config.batch_size
        nbatches = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=nbatches)
        # self.is_test = self.config.testing
        # iterate over dataset
        for (i, (words, labels)), (j, (eval_words, eval_labels)) in zip(
                enumerate(minibatches(train, batch_size)),
                # itertools.cycle(enumerate(minibatches(dev, batch_size)))):
                enumerate(minibatches(itertools.cycle(dev), batch_size))):
            self.config.iter = i
            words, labels = self.batch_to_batchsize(batch_size, words, labels)
            fd, _ = self.get_feed_dict(words, labels, self.config.lr,
                                       self.config.dropout)

            _, train_loss, summary = self.sess.run(
                [self.train_op, self.loss, self.merged], feed_dict=fd)

            eval_words, eval_labels = self.batch_to_batchsize(
                batch_size, eval_words, eval_labels)
            eval_fd, _ = self.get_feed_dict(eval_words, eval_labels,
                                            self.config.lr,
                                            self.config.dropout)
            # only evaluate the loss on the dev batch; do not run train_op here
            val_loss = self.sess.run(self.loss, feed_dict=eval_fd)

            prog.update(i + 1, [("train loss", train_loss),
                                ("val loss", val_loss)])

            # tensorboard
            if i % 10 == 0:
                self.file_writer.add_summary(summary, epoch * nbatches + i)
                # loss tracking: write plain scalar summaries (tf.summary ops created inside the loop would never be merged or written)
                self.file_writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="train_loss", simple_value=float(train_loss)),
                        tf.Summary.Value(tag="val_loss", simple_value=float(val_loss))]),
                    epoch * nbatches + i)

        # metrics = self.run_evaluate(dev)
        metrics = self.run_eval_pad(dev)
        msg = " - ".join(
            ["{} {:04.2f}".format(k, v) for k, v in list(metrics.items())])
        self.logger.info(msg)
        # eval loss

        return metrics["f1"]

    def run_evaluate(self, test):
        """Evaluates performance on test set

        Args:
            test: dataset that yields tuple of (sentences, tags)

        Returns:
            metrics: (dict) metrics["acc"] = 98.4, ...

        """
        def div_or_zero(num, den):
            return num / den if den else 0.0

        l_true = []
        l_pred = []

        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in minibatches(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(words)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]

                l_true += lab
                l_pred += lab_pred

        # Token stats
        print('Passing LSTM-CRF tags to eval func:')
        print('\t', self.idx_to_tag.items())
        tags = [idx for idx, tag in self.idx_to_tag.items() if tag != NONE]
        return eval.token_f1(true=l_true, pred=l_pred, labels=tags)

    def run_eval_pad(self, test):
        def div_or_zero(num, den):
            return num / den if den else 0.0

        l_true = []
        l_pred = []

        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in minibatches(test, self.config.batch_size):

            words, labels = self.batch_to_batchsize(self.config.batch_size,
                                                    words, labels)
            labels_pred, sequence_lengths = self.predict_batch(words)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                lab = list(lab[:length])
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]

                l_true += lab
                l_pred += lab_pred
        # l_true = [list(lab) for lab in labels]
        # l_pred = labels_pred
        # Token stats
        print('Passing LSTM-CRF tags to eval func:')
        print('\t', self.idx_to_tag.items())
        tags = [idx for idx, tag in self.idx_to_tag.items() if tag != NONE]
        return eval.token_f1(true=l_true, pred=l_pred, labels=tags)

    def predict(self, words_raw):
        """Returns list of tags

        Args:
            words_raw: list of words (string), just one sentence (no batch)

        Returns:
            preds: list of tags (string), one for each word in the sentence

        """
        words = [self.config.processing_word(w) for w in words_raw]
        if type(words[0]) == tuple:
            words = list(zip(*words))
        pred_ids, _ = self.predict_batch([words])
        preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]

        return preds

    def predict_elmo(self, sentence):
        words = [self.config.processing_word_elmo(w) for w in sentence]
        if type(words[0]) == tuple:
            words = list(zip(*words))
        pred_ids, _ = self.predict_batch([words])
        preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]

        return preds

    def predict_abstract(self, pred_entry):
        all_pred_labels = []
        for words, labels in minibatches(pred_entry, self.config.batch_size):
            words, labels = self.batch_to_batchsize(self.config.batch_size,
                                                    words, labels)
            labels_pred, _ = self.predict_batch(words)
            all_pred_labels += labels_pred
        return all_pred_labels
Beispiel #13
0
elmo_sentence_input = weight_layers('input',
                                    sentence_embeddings_op,
                                    use_top_only=True)

# elmo_sentence_output = weight_layers('output', sentence_embeddings_op, l2_coef=0.0)

with tf.compat.v1.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.compat.v1.global_variables_initializer())

    # model_vars = tf.global_variables()
    # a = slim.model_analyzer.analyze_vars(model_vars, print_info=True)
    # print(a)

    # Create batches of data.
    sentence_ids = batcher.batch_sentences(tokenized_sentences)

    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_sentence_input_ = sess.run(
        elmo_sentence_input['weighted_op'],
        feed_dict={sentence_character_ids: sentence_ids})
    print(elmo_sentence_input_.shape)

    query_nr = 5

    query_word = tokenized_sentences[0][query_nr]
    print('Query:', query_word)
    query_vec = elmo_sentence_input_[0][query_nr, :]
    query_vec = preprocessing.normalize(query_vec.reshape(1, -1), norm='l2')[0]  # normalize expects a 2-D array
    print(query_vec.shape)
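    # Illustrative follow-up: rank the other tokens of the sentence by cosine
    # similarity to the L2-normalized query vector.
    sentence_vecs = preprocessing.normalize(elmo_sentence_input_[0], norm='l2')
    similarities = sentence_vecs.dot(query_vec)
    ranked = sorted(zip(tokenized_sentences[0], similarities), key=lambda x: -x[1])
    print(ranked[:5])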
Beispiel #14
0
    vocab_file = os.path.join(datadir, 'vocab_test.txt')
    options_file = os.path.join(datadir, 'options.json')
    weight_file = os.path.join(datadir, 'lm_weights.hdf5')

    from keras.layers import Input, GlobalMaxPooling1D, Dense
    from keras.models import Model

    inp = Input((None, 50), dtype=tf.int32)
    elmo = ELMoEmbedding(options_file, weight_file, 32)(inp)
    layer = GlobalMaxPooling1D()(elmo)
    pre = Dense(1)(layer)

    model = Model(inputs=inp, outputs=pre)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.summary()

    raw_context = [
        'Pretrained biLMs compute representations useful for NLP tasks .',
        'They give state of the art performance for many tasks .'
    ]

    tokenized_context = [sentence.split() for sentence in raw_context]

    from bilm import Batcher

    batcher = Batcher(vocab_file, 50)

    context_ids = batcher.batch_sentences(tokenized_context)

    model.fit(x=context_ids, y=np.array([1, 0]), batch_size=1)
Beispiel #15
0
class ELMoEmbedderBilmTf(Component, metaclass=TfModelMeta):
    def __init__(self, spec: str, vocab_file='./datar/vocab/vocab.txt', max_word_length=50,
                 elmo_output_names: Optional[List] = None,
                 dim: Optional[int] = None, pad_zero: bool = False,
                 concat_last_axis: bool = True, max_token: Optional[int] = None,
                 mini_batch_size: int = 32, **kwargs) -> None:

        self.spec = spec if '://' in spec else str(expand_path(spec))
        self.max_word_length = max_word_length
        self.vocab_file = vocab_file 
        self.batcher = Batcher(self.vocab_file, self.max_word_length)
        self.pad_zero = pad_zero
        self.concat_last_axis = concat_last_axis
        self.max_token = max_token
        self.mini_batch_size = mini_batch_size
        self.elmo_outputs, self.sess, self.ids_placeholder = self._load()



    def _load(self):
 
        elmo_module = hub.Module(self.spec, trainable=False)

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess = tf.Session(config=sess_config)

        ids_placeholder = tf.placeholder('int32',shape=(None, None, self.max_word_length))
        
        elmo_outputs = elmo_module(inputs={'default': ids_placeholder}, as_dict=True)

        sess.run(tf.global_variables_initializer())

        return elmo_outputs, sess, ids_placeholder

    def _mini_batch_fit(self, batch: List[List[str]],
                        *args, **kwargs) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed sentences from a batch.

        Args:
            batch: A list of tokenized text samples.

        Returns:
            A batch of ELMo embeddings.
        """
        char_ids = self.batcher.batch_sentences(batch)

        elmo_outputs = self.sess.run(self.elmo_outputs,
                                     feed_dict={self.ids_placeholder: char_ids})['lm_embeddings']

        return elmo_outputs

    @overrides
    def __call__(self, batch: List[List[str]],
                 *args, **kwargs) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed sentences from a batch.

        Args:
            batch: A list of tokenized text samples.

        Returns:
            A batch of ELMo embeddings.
        """
        if len(batch) > self.mini_batch_size:
            batch_gen = chunk_generator(batch, self.mini_batch_size)
            elmo_output_values = []
            for mini_batch in batch_gen:
                mini_batch_out = self._mini_batch_fit(mini_batch, *args, **kwargs)
                elmo_output_values.extend(mini_batch_out)
        else:
            elmo_output_values = self._mini_batch_fit(batch, *args, **kwargs)

        return elmo_output_values

    def __iter__(self) -> Iterator:
        """
        Iterate over all words from the ELMo model vocabulary.
        The ELMo model vocabulary consists of ``['<S>', '</S>', '<UNK>']``.

        Returns:
            An iterator of three elements ``['<S>', '</S>', '<UNK>']``.
        """

        yield from ['<S>', '</S>', '<UNK>']

    def destroy(self):
        self.sess.close()
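
# chunk_generator is imported elsewhere in the original code base; a minimal
# sketch of the behaviour __call__ above relies on (assumption: it simply yields
# consecutive mini-batches of a fixed size):
def chunk_generator(items, chunk_size):
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]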
Beispiel #16
0
# Get ops to compute the LM embeddings.
question_embeddings_op = bilm(question_character_ids)
# Get an op to compute ELMo (weighted average of the internal biLM layers)
elmo_question_input = weight_layers('input',
                                    question_embeddings_op,
                                    l2_coef=0.0)
elmo_question_output = weight_layers('output',
                                     question_embeddings_op,
                                     l2_coef=0.0)
print(elmo_question_input['weighted_op'].get_shape())
"""
Prepare input
"""
tokenized_question = [['What', 'are', 'biLMs', 'useful', 'for', '?']]
# Create batches of data.
question_ids = batcher.batch_sentences(
    tokenized_question)  # (batch_size, sentence_length, word_length)

# padding
question_ids = question_ids.tolist()
print('length = ', len(question_ids[0]))
print(question_ids)
max_sentence_length = 10
# +2 presumably accounts for the <S>/</S> tokens the Batcher adds around each sentence
for i in range(max_sentence_length - len(question_ids[0]) + 2):
    question_ids[0].append([0] * 50)
print('length = ', len(question_ids[0]))
print(question_ids)
"""
Compute ELMO embedding
"""
with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
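    # (The original snippet is truncated here; a hedged completion following the
    # bilm usage pattern shown elsewhere in this file, assuming
    # question_character_ids is the placeholder defined earlier in the full example.)
    sess.run(tf.global_variables_initializer())
    elmo_question_input_ = sess.run(
        elmo_question_input['weighted_op'],
        feed_dict={question_character_ids: question_ids})
    print(elmo_question_input_.shape)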
Beispiel #17
0
def build_features(config, examples, data_type, out_file, word2idx_dict, char2idx_dict, is_test=False):

    """
        Converting tokens to np.arrays of indices
    """
    def get_cand_position(candidates, context_tokens):
        cand_positions = np.zeros([cand_limit, para_limit], dtype=np.float32)

        context = ' '.join(context_tokens)
        for i, token in enumerate(candidates):
            token = token+' '
            char_start = context.find(token)
            if char_start > -1:
                l = len(token.split())
                pretext = context[:char_start].strip()
                token_start = len(pretext.split())
                for j in range(token_start, token_start+l):
                    if j < para_limit:
                        cand_positions[i][j] = 1.0
        # DEBUG
        # for i, (_, c) in enumerate(zip(cand_positions, candidates)):
        #     print c
        #     for j, t in enumerate(cand_positions[i]):
        #         if cand_positions[i][j] > 0:
        #             print context_tokens[j]

        return cand_positions

    para_limit = config.test_para_limit if is_test else config.para_limit
    ques_limit = config.test_ques_limit if is_test else config.ques_limit
    cand_limit = config.cand_limit

    #TODO: change hard-coded number to config parameter
    ans_limit = 100 if is_test else config.ans_limit
    char_limit = config.char_limit

    # def filter_func(example, is_test=False):
    #     return len(example["context_tokens"]) > para_limit or \
    #            len(example["ques_tokens"]) > ques_limit or \
    #            (example["y2s"][0] - example["y1s"][0]) > ans_limit

    print("Processing {} examples...".format(data_type))
    writer = tf.python_io.TFRecordWriter(out_file)
    total = 0
    total_ = 0
    meta = {}
    batcher = Batcher(vocab_file, 50)
    for example in tqdm(examples):
        '''
            example = {"context_tokens": context_tokens, "context_chars": context_chars,
                                   "ques_tokens": ques_tokens,"ques_chars": ques_chars,
                                   "y1s": y1s, "y2s": y2s, "candidates": candidates,
                                   "answer_position": answer_position, "id": total}
        '''
        total_ += 1

        # TODO: comment this out as we do not need filtering
        # if filter_func(example, is_test): continue

        total += 1

        line_zeros = np.zeros([50], dtype=np.int32)
        # construct the ELMo character-id array for the context
        new_context_tokens = []
        new_list_list_context_tokens = []        
        new_context_elmo_ids = []
        if len(example["context_tokens"]) > para_limit:
            new_context_tokens = example["context_tokens"][:(para_limit)]
        #if len(example["context_tokens"]) > para_limit - 2:
        #    new_context_tokens = example["context_tokens"][:(para_limit - 2)]
            new_list_list_context_tokens.append(new_context_tokens)
            context_elmo_ids = batcher.batch_sentences(new_list_list_context_tokens)
            new_context_elmo_ids = context_elmo_ids[0]
        else:
            new_context_tokens = example["context_tokens"]
            new_list_list_context_tokens.append(new_context_tokens)
            context_elmo_ids = batcher.batch_sentences(new_list_list_context_tokens)
            new_context_elmo_ids = context_elmo_ids[0]
            remain_length = para_limit - len(example["context_tokens"])
            for i in range(remain_length):
                new_context_elmo_ids = np.row_stack((new_context_elmo_ids, line_zeros))
                
        # construct the ELMo character-id array for the question
        new_ques_tokens = []
        new_list_list_ques_tokens = []        
        new_question_elmo_ids = []
        if len(example["ques_tokens"]) >= ques_limit:
            new_ques_tokens = example["ques_tokens"][:(ques_limit)]
        #if len(example["ques_tokens"]) >= ques_limit - 2:
        #    new_ques_tokens = example["ques_tokens"][:(ques_limit - 2)]
            new_list_list_ques_tokens.append(new_ques_tokens)
            question_elmo_ids = batcher.batch_sentences(new_list_list_ques_tokens)
            new_question_elmo_ids = question_elmo_ids[0]
        else:
            new_ques_tokens = example["ques_tokens"]
            new_list_list_ques_tokens.append(new_ques_tokens)
            question_elmo_ids = batcher.batch_sentences(new_list_list_ques_tokens)
            new_question_elmo_ids = question_elmo_ids[0]
            remain_length = ques_limit - len(example["ques_tokens"])
            for i in range(remain_length):
                new_question_elmo_ids = np.row_stack((new_question_elmo_ids, line_zeros))
        
        # construct the ELMo character-id array for the candidates
        new_candidate_tokens = []
        new_list_list_candidate_tokens = []   
        new_candidate_elmo_ids = []
        if len(example["candidates"]) > cand_limit:
            new_candidate_tokens = example["candidates"][:(cand_limit)]
        #if len(example["candidates"]) > cand_limit - 2:
        #    new_candidate_tokens = example["candidates"][:(cand_limit - 2)]
            new_list_list_candidate_tokens.append(new_candidate_tokens)
            candidate_elmo_ids = batcher.batch_sentences(new_list_list_candidate_tokens)
            new_candidate_elmo_ids = candidate_elmo_ids[0]
        else:
            new_candidate_tokens = example["candidates"]
            new_list_list_candidate_tokens.append(new_candidate_tokens)
            candidate_elmo_ids = batcher.batch_sentences(new_list_list_candidate_tokens)
            new_candidate_elmo_ids = candidate_elmo_ids[0]
            remain_length = cand_limit - len(example["candidates"])
            for i in range(remain_length):
                new_candidate_elmo_ids = np.row_stack((new_candidate_elmo_ids, line_zeros))
        

        context_idxs = np.zeros([para_limit], dtype=np.int32)
        context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
        ques_idxs = np.zeros([ques_limit], dtype=np.int32)
        ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)
        cand_idxs = np.zeros([cand_limit], dtype=np.int32)
        cand_char_idxs = np.zeros([cand_limit, char_limit], dtype=np.int32)
        cand_label = np.zeros([cand_limit], dtype=np.float32)
        y1 = np.zeros([para_limit], dtype=np.float32)
        y2 = np.zeros([para_limit], dtype=np.float32)

        def _get_word_idx(word):
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in word2idx_dict:
                    return word2idx_dict[each]
            return 1 # OOV

        def _get_char_idx(char):
            if char in char2idx_dict:
                return char2idx_dict[char]
            return 1 # OOV

        for i, token in enumerate(example["context_tokens"]):
            context_idxs[i] = _get_word_idx(token)

        for i, token in enumerate(example["ques_tokens"]):
            ques_idxs[i] = _get_word_idx(token)

        for i, token in enumerate(example["candidates"]):
            cand_idxs[i] = _get_word_idx(token)

        for i, token in enumerate(example["context_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break # discard chars beyond limit
                context_char_idxs[i, j] = _get_char_idx(char)

        for i, token in enumerate(example["ques_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                ques_char_idxs[i, j] = _get_char_idx(char)

        for i, token in enumerate(example["cand_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                cand_char_idxs[i, j] = _get_char_idx(char)

        if len(example["y1s"]) > 0:
            start = example["y1s"][-1]
            y1[start] = 1.0

        if len(example["y2s"]) > 0:
            end = example["y2s"][-1]
            y2[end] = 1.0

        cand_label[example["answer_position"]] = 1.0
        cand_positions = get_cand_position(example["candidates"], example["context_tokens"])

        '''
            tf.train.Example is not a Python class, but a protocol buffer for structuring a TFRecord. 
                A tf.train.Example stores features in a single attribute, features, of type tf.train.Features.

            tf.train.Features is a collection of named features.
            
            tf.train.Feature wraps a list of data of a specific type: tf.train.BytesList (attribute name bytes_list),
                tf.train.FloatList (attribute name float_list), or tf.train.Int64List (attribute name int64_list).
            
            tf.python_io.TFRecordWriter.write() accepts a string as parameter and writes it to disk, 
                meaning that structured data must be serialized first --> tf.train.Example.SerializeToString()
        '''

        record = tf.train.Example(features=tf.train.Features(feature={
            "context_elmo_idxs":      tf.train.Feature(bytes_list=tf.train.BytesList(value=[new_context_elmo_ids.tostring()])),
            "question_elmo_idxs":      tf.train.Feature(bytes_list=tf.train.BytesList(value=[new_question_elmo_ids.tostring()])),
            "candidate_elmo_idxs":      tf.train.Feature(bytes_list=tf.train.BytesList(value=[new_candidate_elmo_ids.tostring()])),
            "context_idxs":      tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_idxs.tostring()])),
            "ques_idxs":         tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_idxs.tostring()])),
            "cand_idxs":         tf.train.Feature(bytes_list=tf.train.BytesList(value=[cand_idxs.tostring()])),
            "context_char_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_char_idxs.tostring()])),
            "ques_char_idxs":    tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_idxs.tostring()])),
            "cand_char_idxs":    tf.train.Feature(bytes_list=tf.train.BytesList(value=[cand_char_idxs.tostring()])),
            "cand_label":        tf.train.Feature(bytes_list=tf.train.BytesList(value=[cand_label.tostring()])),
            "cand_positions":    tf.train.Feature(bytes_list=tf.train.BytesList(value=[cand_positions.tostring()])),
            "y1":                tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])),
            "y2":                tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])),
            "id":                tf.train.Feature(int64_list=tf.train.Int64List(value=[example["id"]]))}))

        writer.write(record.SerializeToString())
    print("Built {} / {} instances of features in total".format(total, total_))
    meta["total"] = total
    writer.close()
    return meta
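
# Hedged companion sketch (not part of the original): how the records written above
# could be parsed back with the TF 1.x input pipeline. Only a subset of the feature
# keys is shown; para_limit must match the value used when the records were built.
def parse_record(serialized_example, para_limit):
    features = tf.parse_single_example(
        serialized_example,
        features={
            "context_elmo_idxs": tf.FixedLenFeature([], tf.string),
            "context_idxs": tf.FixedLenFeature([], tf.string),
            "y1": tf.FixedLenFeature([], tf.string),
            "id": tf.FixedLenFeature([], tf.int64),
        })
    # The arrays were serialized with .tostring(), so decode the raw bytes back
    # into their original dtypes and shapes.
    context_elmo_idxs = tf.reshape(
        tf.decode_raw(features["context_elmo_idxs"], tf.int32), [para_limit, 50])
    context_idxs = tf.decode_raw(features["context_idxs"], tf.int32)
    y1 = tf.decode_raw(features["y1"], tf.float32)
    return context_elmo_idxs, context_idxs, y1, features["id"]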
Beispiel #18
0
class Tokenizer(object):
    def __init__(self,
                 vocab_file,
                 max_seq_length,
                 max_token_length=None,
                 stroke_vocab_file=None,
                 tran2sim=False,
                 sim2tran=False):
        self.vocab_file = vocab_file
        self.max_seq_length = max_seq_length
        self.max_token_length = max_token_length

        max_seq_length = self.max_seq_length - 2  # reserve 2 positions for <bos> and <eos>
        self.token_batcher = TokenBatcher(self.vocab_file, max_seq_length)
        if max_token_length:
            self.batcher = Batcher(self.vocab_file, self.max_token_length,
                                   max_seq_length, stroke_vocab_file)

        self.convert_config = None
        if tran2sim and sim2tran:
            assert tran2sim != sim2tran
        elif tran2sim:
            self.convert_config = "t2s.json"
        elif sim2tran:
            self.convert_config = "s2t.json"

    def convert(self, text):
        """
    未轉簡繁、轉簡體、轉繁體
    很慢,不建議使用
    """
        if self.convert_config is None:
            return text
        return opencc.convert(text, config=self.convert_config)

    def tokenize(self, text):
        """
    text to token, for example:
    text=‘Pretrained biLMs compute representations useful for NLP tasks.’
    token=['Pretrained', 'biLMs', 'compute', 'representations', 'useful', 'for', 'NLP', 'tasks', '.']
    """
        text = self.convert(text)
        text = tokenize_chinese_chars(text)
        text = text.strip()
        tokens = []
        for word in text.split():
            tokens.extend(self._run_split_on_punc(word))
        return tokens

    def convert_tokens_to_ids(self, tokens):
        return self.token_batcher.batch_sentences([tokens])[0]

    def convert_tokens_to_char_ids(self, tokens):
        """
    tokens: tokenize(text)
    return: shape [max_seq_length * max_token_length]
    """
        # char_ids [max_seq_length, max_token_length]
        char_ids = self.batcher.batch_sentences([tokens])[0]
        # flat_char_ids [max_seq_length * max_token_length]
        flat_char_ids = [
            char_id for sublist in char_ids for char_id in sublist
        ]
        return flat_char_ids

    def _is_punctuation(self, char):
        """Checks whether `chars` is a punctuation character."""
        cp = ord(char)
        # We treat all non-letter/number ASCII as punctuation.
        # Characters such as "^", "$", and "`" are not in the Unicode
        # Punctuation class but we treat them as punctuation anyways, for
        # consistency.
        if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
                or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
            return True
        cat = unicodedata.category(char)
        if cat.startswith("P"):
            return True
        return False

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if self._is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]
import chainer
from bilm import Batcher
from bilm import Elmo
vocab_file = 'vocab-2016-09-10.txt'
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

batcher = Batcher(vocab_file, 50)
elmo = Elmo(options_file,
            weight_file,
            num_output_representations=1,
            requires_grad=False,
            do_layer_norm=False,
            dropout=0.)

raw_sents = []
raw_sents = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_sents = [sentence.split() for sentence in raw_sents]
batched_ids = batcher.batch_sentences(tokenized_sents, add_bos_eos=False)
embeddings = elmo.forward(batched_ids)

print(type(embeddings['elmo_representations'][0]))
# <class 'chainer.variable.Variable'>
print(embeddings['elmo_representations'][0].shape)
# (2, 11, 1024) = (batchsize, max_sentence_length, dim)
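# Hedged follow-up: the chainer Variable can be converted to a plain numpy array
# via its .array attribute (.data works as well) for downstream use.
first_layer = embeddings['elmo_representations'][0]
print(first_layer.array.shape)  # same (2, 11, 1024) values, now as a numpy array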
class QAModel(object):
    """
    add_placeholders():
    add_char_embedding_layer
    add_embedding_layer
    build_graph: the main part of the model
    add_loss

    """
    def __init__(self, FLAGS, id2word, word2id, emb_matrix, id2char, char2id):
        self.FLAGS = FLAGS
        self.id2word = id2word
        self.word2id = word2id
        self.emb = emb_matrix
        self.id2char = id2char
        self.char2id = char2id

        #This batcher is used for feed_dict in placeholder context_elmo & qn_elmo
        self.batcher = Batcher(
            os.path.join(self.FLAGS.elmo_dir, "elmo_vocab.txt"), 50)
        self.filters = [(1, 124)]

        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.add_placeholders()
            self.add_embedding_layer(emb_matrix.shape)
        self.add_elmo_embedding_layer(
            os.path.join(self.FLAGS.elmo_dir, "elmo.json"),
            os.path.join(self.FLAGS.elmo_dir, "lm_weight.hdf5"))
        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.build_graph()
            self.add_loss()

        # Define trainable parameters, gradient, gradient norm, and clip by gradient norm
        params = tf.trainable_variables(
        )  # also fine tune elmo (original since only one scope "QAModel")
        gradients = tf.gradients(
            self.loss,
            params)  # d(loss)/d(params) return list of (length len(params))
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients, FLAGS.max_gradient_norm
        )  #return list_clipped, global_norm(here we don't need this)
        self.param_norm = tf.global_norm(params)

        # Define optimizer and updates
        # (updates is what you need to fetch in session.run to do a gradient update)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        #This will increment the global step if global_step is not None
        opt = tf.train.AdamOptimizer(
            learning_rate=FLAGS.learning_rate)  # you can try other optimizers
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

        # Define savers (for checkpointing) and summaries (for tensorboard)
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=FLAGS.keep)
        self.bestmodel_saver = tf.train.Saver(tf.global_variables(),
                                              max_to_keep=1)

        # NOTE: Added exponential moving average for trainable parameters
        #with tf.variable_scope("EMA"):
        #self.ema = tf.train.ExponentialMovingAverage(decay=0.999, zero_debias=True)
        #self.ema_ops = self.ema.apply(tf.trainable_variables())

        #self.ema_saver = tf.train.Saver(max_to_keep=FLAGS.keep)
        #self.ema_bestmodel_saver = tf.train.Saver(max_to_keep=1)

        # Collect all summaries defined in the graph above (e.g. in the add_loss() function)
        # so that the summary writer can write them to TensorBoard.
        self.summaries = tf.summary.merge_all()

    def add_placeholders(self):
        #if shape is not specified, we can pass any shape
        self.context_ids = tf.placeholder(tf.int32)
        self.context_mask = tf.placeholder(tf.int32)
        self.qn_ids = tf.placeholder(tf.int32)
        self.qn_mask = tf.placeholder(tf.int32)
        self.ans_span = tf.placeholder(tf.int32, shape=[None, 2])

        #NOTE:Added char and elmo
        self.context_char = tf.placeholder(
            tf.int32, shape=[None, None, self.FLAGS.max_word_len])
        self.qn_char = tf.placeholder(
            tf.int32, shape=[None, None, self.FLAGS.max_word_len])
        self.context_elmo = tf.placeholder(tf.int32, shape=[None, None, 50])
        self.qn_elmo = tf.placeholder(tf.int32, shape=[None, None, 50])

        # Add a placeholder to feed in the keep probability (for dropout).
        # This is necessary so that we can instruct the model to use dropout when training, but not when testing
        self.keep_prob = tf.placeholder_with_default(1.0, shape=())

        self.emb_matrix = tf.placeholder(tf.float32, shape=self.emb.shape)

    def feed_embedding(self, session):
        set_emb = self.embedding_matrix.assign(self.emb_matrix)
        session.run(set_emb, feed_dict={self.emb_matrix: self.emb})

    def add_elmo_embedding_layer(self,
                                 options_file,
                                 weight_file,
                                 output_use=False):
        """
        Adds ELMo lstm embeddings to the graph.
        1. self.elmo_context_input (batch size, max_context_len among the batch, 1024)
        2. self.elmo_question_input (batch size, max_qn_len among the batch, 1024)
        If output_use is true:
            add the output to the graph either

        Inputs:
            options_file: json_file for the pretrained model
            weight_file: weights hdf5 file for the pretrained model
            output_use: determine if use elmo in output of biRNN (default False)

        """
        #Build biLM graph
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        context_embeddings_op = bilm(self.context_elmo)
        question_embeddings_op = bilm(self.qn_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        #compute the final ELMo representations.
        self.elmo_context_input = weight_layers(
            'input', context_embeddings_op, l2_coef=0.001
        )['weighted_op']  #(batch size, max_context_len among the batch, 1024)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.001)['weighted_op']

        if output_use:
            self.elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.001)['weighted_op']
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                self.elmo_question_output = weight_layers(
                    'output', question_embeddings_op,
                    l2_coef=0.001)['weighted_op']

    def add_char_embedding_layer(self):
        #NOTE: ADD character embeddings
        with tf.variable_scope("embeddings_char"):
            char_embed_matrix = tf.get_variable(
                name='char_embed_matrix',
                shape=[self.FLAGS.num_of_char, self.FLAGS.char_size],
                initializer=tf.initializers.random_uniform(
                    minval=-0.5, maxval=0.5, dtype=tf.float32))  #(71, 20)
            #context_char is placeholder for context char ids
            context_char_emb = tf.nn.embedding_lookup(
                char_embed_matrix, self.context_char
            )  #shape(batch_size, context_len, max_word_len, char_size)
            #qn_char is placeholder for questionchar ids
            qn_char_emb = tf.nn.embedding_lookup(
                char_embed_matrix, self.qn_char
            )  #shape(batch_size, question_len, max_word_len, char_size)

            def make_conv(embedding, filters):
                pooled_cnn = []
                for i, (window_size, num_filter) in enumerate(filters):
                    filter_shape = [
                        1, window_size, self.FLAGS.char_size, num_filter
                    ]
                    w = tf.get_variable('W_f%s' % i, shape=filter_shape)
                    b = tf.get_variable('b_f%s' % i, shape=[num_filter])
                    conv = tf.nn.conv2d(
                        embedding,
                        filter=w,
                        strides=[1, 1, 1, 1],
                        padding="VALID"
                    ) + b  #shape(batch_size, context_len, max_word_len-window_size+1, num_filter)
                    conv = tf.nn.relu(conv)

                    h = tf.nn.max_pool(
                        conv,
                        ksize=[
                            1, 1, self.FLAGS.max_word_len - window_size + 1, 1
                        ],
                        strides=[1, 1, 1, 1],
                        padding="VALID"
                    )  #(batch_size, context_len, 1, num_filter)
                    h = tf.squeeze(
                        h,
                        axis=2)  # shape (batch_size, context_len, num_filter)
                    pooled_cnn.append(h)

                return tf.concat(
                    pooled_cnn, axis=2
                )  #shape (batch_size, context_len, sum all num_filter)

            self.context_char_embs = make_conv(
                context_char_emb, self.filters
            )  #shape (batch_size, context_len, sum all num_filter)
            #question and context char uses the same embeddings
            tf.get_variable_scope().reuse_variables()
            self.qn_char_embs = make_conv(
                qn_char_emb, self.filters
            )  #shape (batch_size, context_len, sum all num_filter)

    def add_embedding_layer(self, emb_matrix_shape):
        with tf.variable_scope("embeddings"):
            with tf.device('/cpu:0'):
                #set to constant so its untrainable
                #embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix") # shape (400002, embedding_size)
                self.embedding_matrix = tf.Variable(tf.zeros(emb_matrix_shape),
                                                    trainable=False,
                                                    name="embedding")
                # Get the word embeddings for the context and question,
                self.context_embs = tf.nn.embedding_lookup(
                    self.embedding_matrix,
                    self.context_ids)  #(batch_size, context_len, glove_dim)
                self.qn_embs = tf.nn.embedding_lookup(
                    self.embedding_matrix,
                    self.qn_ids)  #(batch_size, qn_len, glove_dim)

        #self.add_char_embedding_layer()

    def build_graph(self):
        """
        Builds the main part of the graph for the model
        
         Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # NOTE CHANGE: concatenate the GloVe and ELMo embeddings.
        # How to handle the mismatch between the ELMo and GloVe context_len?
        # Simply build context_ids without a fixed max context_len.
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  #(batch_size, qn_len, 1024+self.FLAGS.embedding_size)
        qn_embs_concat = tf.concat(
            [self.elmo_question_input, self.qn_embs],
            2)  #(batch_size, qn_len, 1024+self.FLAGS.embedding_size)

        #set shape so that it can pass to dynamic lstm
        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        qn_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("biLSTM"):
            Encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 keep_prob=self.keep_prob,
                                 cell_type="lstm",
                                 input_size=1024 + self.FLAGS.embedding_size)
            #shared weights (same scope)
            context_hiddens = Encoder.build_graph(
                context_embs_concat,
                self.context_mask,
                scope="context_question_encoder"
            )  #(batch_size, context_len, hidden_size*2)
            question_hiddens = Encoder.build_graph(
                qn_embs_concat, self.qn_mask, scope="context_question_encoder"
            )  #(batch_size, question_len, hidden_size*2)

        with tf.variable_scope("bidaf"):
            bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob)
            b = bidaf_object.build_graph(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask)  #(batch_size, context_len, hidden_size*8)

        with tf.variable_scope("self_attn_layer"):
            SelfAttn_object = SelfAttn(self.FLAGS.hidden_size,
                                       self.FLAGS.hidden_size * 2,
                                       self.keep_prob,
                                       input_size=self.FLAGS.hidden_size * 2)
            M = SelfAttn_object.build_graph(
                b, self.context_mask,
                cell_type="lstm")  #(batch_size, context_len, hidden_size*2)

        #Make prediction
        with tf.variable_scope('prediction_layer'):
            #Encode the self-attended context first
            with tf.variable_scope("final_lstm_layer"):
                final_lstm_object = RNNEncoder(
                    self.FLAGS.hidden_size,
                    keep_prob=self.keep_prob,
                    cell_type="lstm",
                    input_size=self.FLAGS.hidden_size * 2)
                M_prime = final_lstm_object.build_graph(
                    M, self.context_mask,
                    scope="final_lstm")  #(batch_size, context_len, h*2)

            #Get start distribution
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    M_prime,
                    self.context_mask)  #both are (batch_size, context_len)

            with tf.variable_scope("EndDist"):
                logit_start_expand = tf.expand_dims(
                    self.logits_start, axis=2)  #(batch_size, context_len, 1)
                blended_end_rnn_input = tf.concat(
                    [logit_start_expand, M_prime],
                    axis=2)  #(batch_size, context_len, hidden_size*2)
                end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size,
                                          keep_prob=self.keep_prob,
                                          direction="unidirectional")
                end_rnn_output = end_dist_rnn.build_graph(
                    blended_end_rnn_input,
                    self.context_mask,
                    scope="end_dist_rnn")

                # Get the end dist
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    end_rnn_output, self.context_mask)

    def add_loss(self):
        """
        Add loss computation to the graph.

        Uses:
          self.logits_start: shape (batch_size, context_len)
            IMPORTANT: Assumes that self.logits_start is masked (i.e. has -large in masked locations).
            That's because the tf.nn.sparse_softmax_cross_entropy_with_logits
            function applies softmax and then computes cross-entropy loss.
            So you need to apply masking to the logits (by subtracting large
            number in the padding location) BEFORE you pass to the
            sparse_softmax_cross_entropy_with_logits function.

          self.ans_span: shape (batch_size, 2)
            Contains the gold start and end locations

        Defines:
          self.loss_start, self.loss_end, self.loss: all scalar tensors
        """

        with tf.variable_scope("loss"):
            # Calculate loss for prediction of start position
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_start,
                labels=self.ans_span[:,
                                     0])  # loss_start has shape (batch_size)
            self.loss_start = tf.reduce_mean(loss_start)
            tf.summary.scalar('loss_start', self.loss_start)

            # Calculate loss for prediction of end position
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_end, labels=self.ans_span[:, 1])
            self.loss_end = tf.reduce_mean(loss_end)
            tf.summary.scalar('loss_end', self.loss_end)

            # Add two losses
            self.loss = self.loss_start + self.loss_end
            tf.summary.scalar('loss', self.loss)
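
    # Hedged illustration (not part of the original model): the "masked" logits the
    # docstrings above refer to are obtained by adding a very large negative number
    # to the padded positions, so that softmax assigns them ~0 probability.
    @staticmethod
    def _masked_softmax_sketch(logits, mask):
        exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)  # -large where mask == 0
        masked_logits = logits + exp_mask
        return masked_logits, tf.nn.softmax(masked_logits)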

    def run_train_iter(self, session, batch, summary_writer):
        """
        This performs a single training iteration (forward pass, loss computation, backprop, parameter update)

        Inputs:
          session: TensorFlow session
          batch: a Batch object
          summary_writer: for Tensorboard

        Returns:
          loss: The loss (averaged across the batch) for this batch.
          global_step: The current number of training iterations we've done
          param_norm: Global norm of the parameters
          gradient_norm: Global norm of the gradients
        """
        # Match up our input data with the placeholders
        input_feed = {}
        input_feed[self.context_ids] = batch.context_ids
        input_feed[self.context_mask] = batch.context_mask
        input_feed[self.context_elmo] = self.batcher.batch_sentences(
            batch.context_tokens)
        #NOTE: CHANGE added context_char
        #input_feed[self.context_char] = batch.context_char

        input_feed[self.qn_ids] = batch.qn_ids
        input_feed[self.qn_mask] = batch.qn_mask
        input_feed[self.qn_elmo] = self.batcher.batch_sentences(
            batch.qn_tokens)
        #NOTE: CHANGE added qn_char
        #input_feed[self.qn_char] = batch.qn_char

        input_feed[self.ans_span] = batch.ans_span
        input_feed[self.keep_prob] = 1.0 - self.FLAGS.dropout  # apply dropout

        # output_feed contains the things we want to fetch.
        output_feed = [
            self.updates, self.summaries, self.loss, self.global_step,
            self.param_norm, self.gradient_norm
        ]

        # Run the model
        [_, summaries, loss, global_step, param_norm,
         gradient_norm] = session.run(output_feed, input_feed)

        # All summaries in the graph are added to Tensorboard
        summary_writer.add_summary(summaries, global_step)

        return loss, global_step, param_norm, gradient_norm

    def get_loss(self, session, batch):
        """
        Run forward-pass only; get loss.

        Inputs:
          session: TensorFlow session
          batch: a Batch object

        Returns:
          loss: The loss (averaged across the batch) for this batch
        """

        input_feed = {}
        input_feed[self.context_ids] = batch.context_ids
        input_feed[self.context_mask] = batch.context_mask

        padded_batch_context_tokens = []
        for context in batch.context_tokens:
            if len(context) > self.FLAGS.context_len:
                padded_batch_context_tokens.append(
                    context[:self.FLAGS.context_len])
            else:
                padded_batch_context_tokens.append(context)
        input_feed[self.context_elmo] = self.batcher.batch_sentences(
            padded_batch_context_tokens)

        #NOTE: CHANGE added context_char
        #input_feed[self.context_char] = batch.context_char
        input_feed[self.qn_ids] = batch.qn_ids
        input_feed[self.qn_mask] = batch.qn_mask

        padded_batch_qn_tokens = []
        for qn in batch.qn_tokens:
            if len(qn) > self.FLAGS.question_len:
                padded_batch_qn_tokens.append(qn[:self.FLAGS.question_len])
            else:
                padded_batch_qn_tokens.append(qn)
        input_feed[self.qn_elmo] = self.batcher.batch_sentences(
            padded_batch_qn_tokens)

        #NOTE: CHANGE added qn_char
        #input_feed[self.qn_char] = batch.qn_char
        input_feed[self.ans_span] = batch.ans_span
        # note you don't supply keep_prob here, so it will default to 1 i.e. no dropout

        output_feed = [self.loss]

        [loss] = session.run(output_feed, input_feed)

        return loss

    def get_dev_loss(self, session, dev_context_path, dev_qn_path,
                     dev_ans_path):
        """
        Get loss for entire dev set.

        Inputs:
          session: TensorFlow session
          dev_qn_path, dev_context_path, dev_ans_path: paths to the dev.{context/question/answer} data files

        Outputs:
          dev_loss: float. Average loss across the dev set.
        """
        logging.info("Calculating dev loss...")
        tic = time.time()
        loss_per_batch, batch_lengths = [], []

        # Iterate over dev set batches
        # Note: here we set discard_long=True, meaning we discard any examples
        # which are longer than our context_len or question_len.
        # We need to do this because if, for example, the true answer is cut
        # off the context, then the loss function is undefined.

        for batch in get_batch_generator(self.word2id,
                                         self.char2id,
                                         dev_context_path,
                                         dev_qn_path,
                                         dev_ans_path,
                                         self.FLAGS.batch_size,
                                         self.FLAGS.context_len,
                                         self.FLAGS.question_len,
                                         self.FLAGS.max_word_len,
                                         discard_long=True):
            # Get loss for this batch
            loss = self.get_loss(session, batch)
            curr_batch_size = batch.batch_size
            loss_per_batch.append(
                loss * curr_batch_size
            )  # multiply by curr_batch_size since the loss is already averaged over the batch
            batch_lengths.append(curr_batch_size)

        # Calculate average loss
        total_num_examples = sum(batch_lengths)
        toc = time.time()
        print("Computed dev loss over %i examples in %.2f seconds" %
              (total_num_examples, toc - tic))

        # Overall loss is total loss divided by total number of examples
        dev_loss = sum(loss_per_batch) / float(total_num_examples)
        return dev_loss

    def get_prob_dists(self, session, batch):
        """
        Run forward-pass only; get probability distributions for start and end positions.

        Inputs:
          session: TensorFlow session
          batch: Batch object

        Returns:
          probdist_start and probdist_end: both shape (batch_size, context_len)
        """
        input_feed = {}
        input_feed[self.context_ids] = batch.context_ids
        input_feed[self.context_mask] = batch.context_mask
        padded_batch_context_tokens = []
        for context in batch.context_tokens:
            if len(context) > self.FLAGS.context_len:
                padded_batch_context_tokens.append(
                    context[:self.FLAGS.context_len])
            else:
                padded_batch_context_tokens.append(context)
        input_feed[self.context_elmo] = self.batcher.batch_sentences(
            padded_batch_context_tokens)
        #input_feed[self.context_elmo] = self.batcher.batch_sentences(batch.context_tokens)

        #NOTE: CHANGE added context_char
        #input_feed[self.context_char] = batch.context_char

        input_feed[self.qn_ids] = batch.qn_ids
        input_feed[self.qn_mask] = batch.qn_mask
        padded_batch_qn_tokens = []
        for qn in batch.qn_tokens:
            if len(qn) > self.FLAGS.question_len:
                padded_batch_qn_tokens.append(qn[:self.FLAGS.question_len])
            else:
                padded_batch_qn_tokens.append(qn)
        input_feed[self.qn_elmo] = self.batcher.batch_sentences(
            padded_batch_qn_tokens)
        #input_feed[self.qn_elmo] = self.batcher.batch_sentences(batch.qn_tokens)
        #NOTE: CHANGE added qn_char
        #input_feed[self.qn_char] = batch.qn_char

        # note you don't supply keep_prob here, so it will default to 1 i.e. no dropout

        output_feed = [self.probdist_start,
                       self.probdist_end]  #defined in the end of build_graph()
        [probdist_start, probdist_end] = session.run(output_feed, input_feed)
        return probdist_start, probdist_end

    def get_start_end_pos(self, session, batch, span="dp"):
        """
        Run forward-pass only; get the most likely answer span.

        Inputs:
          session: TensorFlow session
          batch: Batch object

        Returns:
          start_pos, end_pos: both numpy arrays shape (batch_size).
            The most likely start and end positions for each example in the batch.
        """
        start_dist, end_dist = self.get_prob_dists(session,
                                                   batch)  #numpy array

        if span == "original":
            start_pos = np.argmax(start_dist, axis=1)
            end_pos = np.argmax(end_dist, axis=1)
        elif span == "dp":
            """
            The thoery is add more weights to early start_pos context
            Then for the words equal or after that start_pos, find the word that has max dist
            The words in between (inclusive) are the answer
            """

            # end_dp[:, i] = max over j >= i of end_dist[:, j] (suffix maximum),
            # so it is non-increasing in i and gives earlier starts more weight.
            end_dp = np.zeros(end_dist.shape)
            end_dp[:, -1] = end_dist[:, -1]
            for i in range(len(end_dist[0]) - 2, -1, -1):
                end_dp[:, i] = np.amax([end_dist[:, i], end_dp[:, i + 1]],
                                       axis=0)
            start_pos = np.argmax(start_dist * end_dp, axis=1)
            end_pos = np.array([
                start_pos[i] + np.argmax(end_dist[i, start_pos[i]:])
                for i in range(len(end_dist))
            ])  # for each example, the best end at or after its chosen start

        #print(start_dist)
        #print(start_dist.shape)
        #print(end_dist)
        #print(end_dist.shape)
        return start_pos, end_pos

    def check_f1_em(self,
                    session,
                    context_path,
                    qn_path,
                    ans_path,
                    dataset,
                    num_samples=100,
                    print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Note: This function is not quite the same as the F1/EM numbers you get from "official_eval" mode.
        This function uses the pre-processed version of the e.g. dev set for speed,
        whereas "official_eval" mode uses the original JSON. Therefore:
          1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
            whereas this function compares to just the first answer (which is what's saved in the preprocessed data)
          2. Our preprocessed version of the dev set is missing some examples
            due to tokenization issues (see squad_preprocess.py).
            "official_eval" includes all examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info(
            "Calculating F1/EM for %s examples in %s set..." %
            (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        for batch in get_batch_generator(self.word2id,
                                         self.char2id,
                                         context_path,
                                         qn_path,
                                         ans_path,
                                         self.FLAGS.batch_size,
                                         self.FLAGS.context_len,
                                         self.FLAGS.question_len,
                                         self.FLAGS.max_word_len,
                                         discard_long=False):
            pred_start_pos, pred_end_pos = self.get_start_end_pos(
                session, batch)  #numpy arrays shape (batch_size)

            # Convert the start and end positions to lists (length batch_size)
            try:
                pred_start_pos = pred_start_pos.tolist()
                pred_end_pos = pred_end_pos.tolist()
            except:
                pred_start_pos = [pos for pos in pred_start_pos]
                pred_end_pos = [pos for pos in pred_end_pos]

            for ex_idx, (pred_ans_start, pred_ans_end,
                         true_ans_tokens) in enumerate(
                             zip(pred_start_pos, pred_end_pos,
                                 batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][
                    pred_ans_start:pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                #Get the true answer (No UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calculate F1, EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx],
                                  batch.qn_tokens[ex_idx],
                                  batch.ans_span[ex_idx,
                                                 0], batch.ans_span[ex_idx, 1],
                                  pred_ans_start, pred_ans_end, true_answer,
                                  pred_answer, f1, em)

                # Use either all examples or only num_samples for the calculation
                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info(
            "Calculating F1/EM for %i examples in %s set took %.2f seconds" %
            (example_num, dataset, toc - tic))
        return f1_total, em_total

    def train(self, session, train_context_path, train_qn_path, train_ans_path,
              dev_qn_path, dev_context_path, dev_ans_path):
        """
        Main training loop.
        Note: all the inputs of this function are defined in main.py
        This function will be run in main.py

        Inputs:
          session: TensorFlow session
          {train/dev}_{qn/context/ans}_path: paths to {train/dev}.{context/question/answer} data files
        """

        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(
            map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
        toc = time.time()
        logging.info("Number of params: %d (retrieval took %f secs)" %
                     (num_params, toc - tic))

        # We will keep track of exponentially-smoothed loss
        exp_loss = None

        # Checkpoint management.
        # We keep one latest checkpoint, and one best checkpoint (early stopping)
        checkpoint_path = os.path.join(self.FLAGS.train_dir, "qa.ckpt")
        bestmodel_dir = os.path.join(self.FLAGS.train_dir, "best_checkpoint")
        bestmodel_ckpt_path = os.path.join(bestmodel_dir, "qa_best.ckpt")

        #ema_checkpoint_path = os.path.join(self.FLAGS.train_dir, "ema_qa.ckpt")
        #ema_bestmodel_dir = os.path.join(self.FLAGS.train_dir, "ema_best_checkpoint")
        #ema_bestmodel_ckpt_path = os.path.join(ema_bestmodel_dir, "qa_best.ckpt")

        best_dev_f1 = None
        best_dev_em = None

        # for TensorBoard
        summary_writer = tf.summary.FileWriter(self.FLAGS.train_dir,
                                               session.graph)

        epoch = 0
        logging.info("Beginning training loop...")
        #Note if self.FLAGS.num_epochs == 0, then train infinitely
        while self.FLAGS.num_epochs == 0 or epoch < self.FLAGS.num_epochs:
            epoch += 1
            epoch_tic = time.time()

            # Loop over batches
            for batch in get_batch_generator(self.word2id,
                                             self.char2id,
                                             train_context_path,
                                             train_qn_path,
                                             train_ans_path,
                                             self.FLAGS.batch_size,
                                             self.FLAGS.context_len,
                                             self.FLAGS.question_len,
                                             self.FLAGS.max_word_len,
                                             discard_long=True):
                # Run training iteration
                iter_tic = time.time()
                loss, global_step, param_norm, grad_norm = self.run_train_iter(
                    session, batch, summary_writer)
                iter_toc = time.time()
                iter_time = iter_toc - iter_tic

                # Update exponentially-smoothed loss
                if not exp_loss:  # first iter
                    exp_loss = loss
                else:
                    exp_loss = 0.99 * exp_loss + 0.01 * loss

                # Sometimes print info to screen
                if global_step % self.FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, loss %.5f, smoothed loss %.5f, grad norm %.5f, param norm %.5f, batch time %.3f'
                        % (epoch, global_step, loss, exp_loss, grad_norm,
                           param_norm, iter_time))

                # Sometimes save model
                if global_step % self.FLAGS.save_every == 0:
                    logging.info("Saving to %s..." % checkpoint_path)
                    self.saver.save(session,
                                    checkpoint_path,
                                    global_step=global_step)

                    # NOTE: CHANGE
                    #logging.info("Saving to %s..." % ema_checkpoint_path)
                    #self.ema_saver.save(session, ema_checkpoint_path, global_step=global_step)

                # Sometimes evaluate model on dev loss, train F1/EM and dev F1/EM
                if global_step % self.FLAGS.eval_every == 0:

                    # Get loss for entire dev set and log to tensorboard
                    dev_loss = self.get_dev_loss(session, dev_context_path,
                                                 dev_qn_path, dev_ans_path)
                    logging.info("Epoch %d, Iter %d, dev loss: %f" %
                                 (epoch, global_step, dev_loss))
                    write_summary(dev_loss, "dev/loss", summary_writer,
                                  global_step)

                    # Get F1/EM on train set and log to tensorboard
                    train_f1, train_em = self.check_f1_em(session,
                                                          train_context_path,
                                                          train_qn_path,
                                                          train_ans_path,
                                                          "train",
                                                          num_samples=1000)
                    logging.info(
                        "Epoch %d, Iter %d, Train F1 score: %f, Train EM score: %f"
                        % (epoch, global_step, train_f1, train_em))
                    write_summary(train_f1, "train/F1", summary_writer,
                                  global_step)
                    write_summary(train_em, "train/EM", summary_writer,
                                  global_step)

                    # Get F1/EM on dev set and log to tensorboard
                    dev_f1, dev_em = self.check_f1_em(session,
                                                      dev_context_path,
                                                      dev_qn_path,
                                                      dev_ans_path,
                                                      "dev",
                                                      num_samples=0)
                    logging.info(
                        "Epoch %d, Iter %d, Dev F1 score: %f, Dev EM score: %f"
                        % (epoch, global_step, dev_f1, dev_em))
                    write_summary(dev_f1, "dev/F1", summary_writer,
                                  global_step)
                    write_summary(dev_em, "dev/EM", summary_writer,
                                  global_step)

                    # Early stopping based on dev EM. You could switch this to use F1 instead.
                    if best_dev_em is None or dev_em > best_dev_em:
                        best_dev_em = dev_em
                        logging.info("Saving to %s..." % bestmodel_ckpt_path)
                        self.bestmodel_saver.save(session,
                                                  bestmodel_ckpt_path,
                                                  global_step=global_step)

                        # NOTE: CHANGE
                        #logging.info("Saving to %s..." % ema_bestmodel_ckpt_path)
                        #self.ema_bestmodel_saver.save(session, ema_bestmodel_ckpt_path, global_step=global_step)

            epoch_toc = time.time()
            logging.info("End of epoch %i. Time for epoch: %f" %
                         (epoch, epoch_toc - epoch_tic))

        sys.stdout.flush()
Beispiel #21
0
class ELMoTuner(Tuner):

    def __init__(self, train_corpus_fname, test_corpus_fname,
                 vocab_fname, options_fname, pretrain_model_fname,
                 model_save_path, max_characters_per_token=30,
                 batch_size=32, num_labels=2):
        # Load a corpus.
        super().__init__(train_corpus_fname=train_corpus_fname,
                         tokenized_train_corpus_fname=train_corpus_fname + ".elmo-tokenized",
                         test_corpus_fname=test_corpus_fname,
                         tokenized_test_corpus_fname=test_corpus_fname + ".elmo-tokenized",
                         model_name="elmo", vocab_fname=vocab_fname,
                         model_save_path=model_save_path, batch_size=batch_size)
        # configurations
        self.options_fname = options_fname
        self.pretrain_model_fname = pretrain_model_fname
        self.max_characters_per_token = max_characters_per_token
        self.num_labels = num_labels  # 2 by default: positive, negative
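        # total optimization steps = ceil(num_train_examples / batch_size) * num_epochs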
        self.num_train_steps = (int((len(self.train_data) - 1) / self.batch_size) + 1) * self.num_epochs
        self.eval_every = int(self.num_train_steps / self.num_epochs)  # evaluate once per epoch
        # Create a Batcher to map text to character ids.
        # lm_vocab_file = ELMo can build input ids on the fly even without a token vocab,
        # but pre-building ids for frequent character sequences (i.e. the vocab) makes
        # training a bit faster (a minimal standalone Batcher sketch follows this class)
        # max_token_length = the maximum number of characters in each token
        self.batcher = Batcher(lm_vocab_file=vocab_fname, max_token_length=self.max_characters_per_token)
        self.training = tf.placeholder(tf.bool)
        # build train graph
        self.ids_placeholder, self.labels_placeholder, self.dropout_keep_prob, self.logits, self.loss = make_elmo_graph(options_fname,
                                                                                                                        pretrain_model_fname,
                                                                                                                        max_characters_per_token,
                                                                                                                        num_labels, tune=True)

    def tune(self):
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        output_feed = [train_op, global_step, self.logits, self.loss]
        saver = tf.train.Saver(max_to_keep=1)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        self.train(sess, saver, global_step, output_feed)

    def make_input(self, sentences, labels, is_training):
        current_input = self.batcher.batch_sentences(sentences)
        current_output = np.array(labels)
        if is_training:
            input_feed = {
                self.ids_placeholder: current_input,
                self.labels_placeholder: current_output,
                self.dropout_keep_prob: self.dropout_keep_prob_rate,
                self.training: True
            }
        else:
            input_feed_ = {
                self.ids_placeholder: current_input,
                self.labels_placeholder: current_output,
                self.dropout_keep_prob: 1.0,
                self.training: False
            }
            input_feed = [input_feed_, current_output]
        return input_feed
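
# A minimal, standalone sketch of what the Batcher configured above produces.
# Assumptions: bilm is installed and "vocab.txt" is an illustrative plain-text
# vocab file (one token per line, including <S>, </S> and <UNK>); neither the
# file name nor the sentence comes from the original code.
from bilm import Batcher

demo_batcher = Batcher(lm_vocab_file="vocab.txt", max_token_length=30)
demo_ids = demo_batcher.batch_sentences([["this", "is", "a", "sentence"]])
# demo_ids is a numpy array of character ids with shape
# (n_sentences, max_sentence_length + 2, max_token_length); the "+ 2" is for
# the <S> and </S> boundary markers the Batcher adds.
print(demo_ids.shape)  # (1, 6, 30)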
            allcnt = 1
            while exists(prefix + str(allcnt)):
                allcnt += 1

            print('num of batches', allcnt - 1)

            for cnt in tqdm(range(1, allcnt)):

                dataset_file = prefix + str(
                    cnt)  #test.pep.token/batch1,2,3,...
                embedding_file = dataset_file + '.elmo_embeddings_{}.hdf5'.format(
                    basename(dirname(model_dir)))

                with open(dataset_file) as f:
                    tokenized_context = [x.strip().split() for x in f]

                # Create batches of data.
                context_ids = batcher.batch_sentences(tokenized_context,
                                                      max_length=args.max_len)

                # Compute ELMo representations (here for the input only, for simplicity).

                elmo_context_input_ = sess.run(
                    elmo_context_input['weighted_op'],
                    feed_dict={context_character_ids: context_ids})

                print(elmo_context_input_[0].shape)  #40 x 64
                with h5py.File(embedding_file, 'w') as f:
                    f.create_dataset('embed', data=elmo_context_input_)
Beispiel #23
0
class ELMoEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    def __init__(
            self,
            tune_model_fname="/notebooks/embedding/data/sentence-embeddings/elmo/tune-ckpt",
            pretrain_model_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/elmo.model",
            options_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/options.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/elmo-vocab.txt",
            max_characters_per_token=30,
            dimension=256,
            num_labels=2,
            use_notebook=False):

        # configurations
        super().__init__("elmo", dimension, use_notebook)
        self.tokenizer = get_tokenizer("mecab")
        self.batcher = Batcher(lm_vocab_file=vocab_fname,
                               max_token_length=max_characters_per_token)
        self.ids_placeholder, self.elmo_embeddings, self.probs = make_elmo_graph(
            options_fname,
            pretrain_model_fname,
            max_characters_per_token,
            num_labels,
            tune=False)
        # restore model
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(tune_model_fname)
        saver.restore(self.sess, checkpoint_path)

    def predict(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        probs = self.sess.run(self.probs, model_input)
        return probs

    """
    sentence를 입력하면 토크나이즈 결과와 token 벡터 시퀀스를 반환한다
        - shape :[[# of tokens], [batch size, max seq length, dimension]]
    """

    def get_token_vector_sequence(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        sentence_vector = self.sess.run(self.elmo_embeddings['weighted_op'],
                                        model_input)
        return [tokens, sentence_vector[0]]

    """
    sentence를 입력하면 토크나이즈 결과와 토큰 시퀀스의 마지막 벡터를 반환한다
    ELMo는 Language Model이기 때문에 토큰 시퀀스 마지막 벡터에 많은 정보가 녹아 있다
         - shape :[[# of tokens], [batch size, dimension]]
    """

    def get_sentence_vector(self, sentence):
        tokens, vecs = self.get_token_vector_sequence(sentence)
        return [tokens, vecs[-1]]

    def tokenize(self, sentence):
        tokens = self.tokenizer.morphs(sentence)
        return post_processing(tokens)

    def make_input(self, tokens):
        model_input = self.batcher.batch_sentences([tokens])
        input_feed = {self.ids_placeholder: model_input}
        return input_feed
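
# A minimal usage sketch for ELMoEmbeddingEvaluator, assuming the default
# checkpoint, options and vocab paths above actually exist; the input sentence
# is illustrative only.
evaluator = ELMoEmbeddingEvaluator()
probs = evaluator.predict("이 영화 정말 재미있네요")          # class probabilities
tokens, vec = evaluator.get_sentence_vector("이 영화 정말 재미있네요")
# tokens: mecab-tokenized (post-processed) tokens of the sentence
# vec: ELMo vector of the last token, shape (dimension,)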
Beispiel #24
0
    elmo_question_output = weight_layers(
        'output', question_embeddings_op, l2_coef=0.0
    )


# Now we can compute embeddings.
raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_context = [sentence.split() for sentence in raw_context]
tokenized_question = [
    ['What', 'are', 'biLMs', 'useful', 'for', '?'],
]

with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    context_ids = batcher.batch_sentences(tokenized_context)
    question_ids = batcher.batch_sentences(tokenized_question)

    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_context_input_, elmo_question_input_ = sess.run(
        [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']],
        feed_dict={context_character_ids: context_ids,
                   question_character_ids: question_ids}
    )

Beispiel #25
0
with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    best_val_acc = 0.0
    best_test_acc = 0.0
    for epoch in range(epoch_nums):
        inds = list(range(len(train_datas[0])))
        np.random.shuffle(inds)
        train_datas = [np.asarray(_)[inds] for _ in train_datas]
        for (i, (batch_context, batch_question, batch_options, batch_answer, \
            batch_context_l, batch_question_l, batch_options_l)) in enumerate(
            zip(*train_datas)):
            batch_options = batch_options.flatten()
            context_ids = batcher.batch_sentences(batch_context)
            question_ids = batcher2.batch_sentences(batch_question)
            options_ids = batcher2.batch_sentences(batch_options)
            max_context_length = context_ids.shape[1]
            batch_options_l = batch_options_l.flatten()

            # Compute ELMo representations (here for the input only, for simplicity).
            _, out_loss, accu = sess.run(
                [train_op, loss, accuracy],
                feed_dict={
                    context_character_ids: context_ids,
                    question_character_ids: question_ids,
                    options_character_ids: options_ids,
                    context_lengths: batch_context_l,
                    question_lengths: batch_question_l,
                    options_lengths: batch_options_l,
Beispiel #26
0
def elmo_eval(data_pd):
    from bilm import Batcher, BidirectionalLanguageModel, weight_layers
    # Location of pretrained LM.  Here we use the test fixtures.
    vocab_file = "vectors/elmo/hdf5/vocab.txt"
    options_file = "vectors/elmo/hdf5/options.json"
    weight_file = "vectors/elmo/hdf5/weights.hdf5"

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file)

    # Get ops to compute the LM embeddings.
    embeddings = bilm(character_ids)

    elmo_input = weight_layers('input', embeddings, l2_coef=0.0)

    tokenized_sentences = tokenize_sentences(data_pd["text"])

    with tf.Session(config=config) as sess:
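        # `config` is assumed to be a tf.ConfigProto defined at module scope, as are
        # tokenize_sentences, train_test_split, model and BATCH_SIZE used below.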
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        ids = batcher.batch_sentences(tokenized_sentences)

        batch_size = 64
        beginning_index = 0
        x_data = []
        while beginning_index < ids.shape[0]:
            # Compute ELMo representations (here for the input only, for simplicity).
            elmo_vectors = sess.run(elmo_input['weighted_op'],
                                    feed_dict={
                                        character_ids:
                                        ids[beginning_index:beginning_index +
                                            batch_size, :, :]
                                    })
            x_data.append(elmo_vectors)
            beginning_index += batch_size

        x_data = np.concatenate(x_data)

    y_data = data_pd["sentiment"].values

    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data)

    with tf.Session(config=config) as sess:
        # create model
        a_model = model.prediction_model(max_seq_len=x_train.shape[1],
                                         embedding_size=x_train.shape[2],
                                         batch_size=BATCH_SIZE,
                                         sess=sess)
        a_model.train(x_train=x_train,
                      y_train=y_train,
                      x_val=x_val,
                      y_val=y_val)
        a_model.eval_model(x_val, y_val)
    def list_to_lazy_embeddings_with_dump(self,
                                          batch: List[List[str]],
                                          outfile_to_dump=None,
                                          partition=20):
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        """
        nothing = []
        if batch == [[]]:
            raise ValueError('Batch should not be empty')
        else:

            if self.word_embedding_file is None:
                batcher = Batcher(self.voc_file_path, self.max_word_length)
            else:
                batcher = TokenBatcher(self.voc_file_path)
            config = tf.ConfigProto(allow_soft_placement=True)
            num_of_total_tokens = len(batch)
            each_partition_size = math.ceil(num_of_total_tokens / partition)
            print('Number of partitions: {}'.format(partition))
            for _pi in range(0, partition):
                document_embeddings = []
                with tf.Session(config=config) as sess:
                    sess.run(tf.global_variables_initializer())
                    _begin_index = _pi * each_partition_size
                    _end_index = _begin_index + each_partition_size
                    print(15 * '-')
                    print('Iteration: {}, Data Range: {} - {}'.format(
                        _pi + 1, _begin_index, _end_index))
                    for i, token in enumerate(
                            tqdm(batch[_begin_index:_end_index],
                                 total=len(batch[_begin_index:_end_index]))):
                        char_ids = batcher.batch_sentences([[token]])
                        _ops = sess.run(
                            self.ops,
                            feed_dict={self.ids_placeholder: char_ids})
                        mask = _ops['mask']
                        lm_embeddings = _ops['lm_embeddings']
                        token_embeddings = _ops['token_embeddings']
                        lengths = _ops['lengths']
                        length = int(mask.sum())

                        # lm_embeddings has shape (1, 3, 1, 1024): (batch, biLM layers,
                        # tokens, dim). Swap the layer and token axes before flattening
                        # (a standalone sketch of this step follows these methods).
                        new_embedding = np.swapaxes(lm_embeddings, 1, 2)
                        # An equivalent way to move the axes is a transpose:
                        # new_embedding_ = lm_embeddings.transpose(0, 2, 1, 3)

                        new_embedding = new_embedding.reshape(
                            (new_embedding.shape[2], new_embedding.shape[3]))

                        # ds = fout.create_dataset(
                        #     '{}'.format(i),
                        #     new_embedding.shape, dtype='float32',
                        #     data=new_embedding
                        # )

                        document_embeddings.append(new_embedding)
                document_embeddings = np.asarray(document_embeddings)
                with h5py.File(outfile_to_dump.replace('@@', str(_pi + 1)),
                               'w') as fout:
                    ds = fout.create_dataset('embeddings',
                                             document_embeddings.shape,
                                             dtype='float32',
                                             data=document_embeddings)

        return nothing
    def list_to_embeddings_with_dump_(self,
                                      batch: List[List[str]],
                                      slice=None,
                                      outfile_to_dump=None,
                                      tfidfs=None):
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        """
        elmo_embeddings = []

        if batch == [[]]:
            if slice is None:
                elmo_embeddings.append(empty_embedding(self.dims))
            else:
                if slice > 2:
                    raise ValueError('Slice cannot be larger than 2')
                elmo_embeddings.append(empty_embedding(self.dims, True))
        else:
            batcher = Batcher(self.voc_file_path, self.max_word_length)
            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                with h5py.File(outfile_to_dump, 'w') as fout:
                    for i, _contents in enumerate(tqdm(batch,
                                                       total=len(batch))):
                        # for _content in _contents:
                        # content_tokens_ = _contents.strip().split()
                        char_ids = batcher.batch_sentences([_contents])
                        _ops = sess.run(
                            self.ops,
                            feed_dict={self.ids_placeholder: char_ids})
                        mask = _ops['mask']
                        lm_embeddings = _ops['lm_embeddings']
                        token_embeddings = _ops['token_embeddings']
                        lengths = _ops['lengths']
                        length = int(mask.sum())
                        if slice is None:
                            lm_embeddings_mean = np.apply_over_axes(
                                np.mean, lm_embeddings[0], (0, 1))
                        else:
                            lm_embeddings_mean = np.apply_over_axes(
                                np.mean, lm_embeddings[0][slice], (0))
                        if tfidfs is not None:
                            lm_embeddings_mean = np.matmul(
                                lm_embeddings_mean,
                                np.reshape(tfidfs[i], (1, tfidfs[i].shape[0])))
                        # if lm_embeddings.shape != (1,3,1,1024):
                        #     print('Index of batch:', i)
                        #     print('Contents:', _contents)
                        #     print('Content Tokens:', content_tokens_)
                        #     print('Shape:', lm_embeddings.shape)
                        #     print(10*'-')
                        elmo_embeddings.append(lm_embeddings_mean)
                        ds = fout.create_dataset(
                            '{}'.format(i),
                            lm_embeddings_mean[0].shape[1:],
                            dtype='float32',
                            data=lm_embeddings_mean[0])

        return elmo_embeddings
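
# A standalone numpy sketch of the axis manipulation used in
# list_to_lazy_embeddings_with_dump above. The shapes are assumptions taken
# from the comments there (batch=1, n_layers=3, n_tokens=1, dim=1024),
# not values read from a real biLM.
import numpy as np

lm_embeddings = np.random.rand(1, 3, 1, 1024)   # (batch, layers, tokens, dim)
swapped = np.swapaxes(lm_embeddings, 1, 2)      # (batch, tokens, layers, dim)
per_token = swapped.reshape(swapped.shape[2], swapped.shape[3])  # (3, 1024)
assert per_token.shape == (3, 1024)
# The transpose mentioned in the comments is equivalent:
assert np.array_equal(per_token,
                      lm_embeddings.transpose(0, 2, 1, 3).reshape(3, 1024))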
class QAModel(object):
    def __init__(self, FLAGS, id2word, word2id, emb_matrix, id2char, char2id):
        self.FLAGS = FLAGS
        self.id2word = id2word
        self.word2id = word2id
        self.emb_matrix = emb_matrix
        self.id2char = id2char
        self.char2id = char2id

        self.batcher = Batcher(
            "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo_vocab.txt",
            50)
        self.filters = [(5, 10)]  #change back to 100 after

        self.options_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo.json"
        self.weight_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/lm_weight.hdf5"

        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.add_placeholders()
            self.add_embedding_layer(emb_matrix)
        self.add_elmo_embedding_layer(self.options_file, self.weight_file)
        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.build_graph()
            self.add_loss()

        # Define trainable parameters, gradient, gradient norm, and clip by gradient norm
        params = tf.trainable_variables(
            "QAModel")  #since only one scope "QAModel"
        gradients = tf.gradients(
            self.loss,
            params)  # d(loss)/d(params) return list of (length len(params))
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients,
            5.0)  #return list_clipped, global_norm(here we don't need this)
        self.param_norm = tf.global_norm(params)

        # Define optimizer and updates
        # (updates is what you need to fetch in session.run to do a gradient update)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        #This will increment the global step if global_step is not None
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001)  # you can try other optimizers
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        self.bestmodel_saver = tf.train.Saver(tf.global_variables(),
                                              max_to_keep=1)

        self.summaries = tf.summary.merge_all()

    def add_placeholders(self):
        self.context_ids = tf.placeholder(tf.int32)
        self.context_mask = tf.placeholder(tf.int32)
        self.qn_ids = tf.placeholder(tf.int32)
        self.qn_mask = tf.placeholder(tf.int32)
        self.ans_span = tf.placeholder(tf.int32, shape=[None, 2])

        #NOTE:CHANGE
        #self.context_char = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len, self.FLAGS.max_word_len])
        #self.qn_char = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len, self.FLAGS.max_word_len])
        #The following two may not be necessary
        #self.context_char_mask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len, self.FLAGS.max_word_len])
        #self.qn_char_mask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len, self.FLAGS.max_word_len])
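        # ELMo character-id inputs produced by Batcher.batch_sentences:
        # shape (batch_size, num_tokens + 2 boundary markers, 50 chars per token)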
        self.context_elmo = tf.placeholder('int32', shape=[None, None, 50])
        self.qn_elmo = tf.placeholder('int32', shape=[None, None, 50])

        # Add a placeholder to feed in the keep probability (for dropout).
        # This is necessary so that we can instruct the model to use dropout when training, but not when testing
        self.keep_prob = tf.placeholder_with_default(1.0, shape=())

    def add_embedding_layer(self, emb_matrix):
        with tf.variable_scope("embeddings"):
            #set to constant so its untrainable
            embedding_matrix = tf.constant(
                emb_matrix, dtype=tf.float32,
                name="emb_matrix")  # shape (400002, embedding_size)

            # Get the word embeddings for the context and question,
            self.context_embs = tf.nn.embedding_lookup(embedding_matrix,
                                                       self.context_ids)
            self.qn_embs = tf.nn.embedding_lookup(embedding_matrix,
                                                  self.qn_ids)

        #self.add_char_embedding_layer()

    def add_elmo_embedding_layer(self,
                                 options_file,
                                 weight_file,
                                 output_use=False):
        """
        Adds ELMo lstm embeddings to the graph.

        Inputs:
            options_file: json_file for the pretrained model
            weight_file: weights hdf5 file for the pretrained model
            output_use: determine if use elmo in output of biRNN (default False)
        """
        #Build biLM graph
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        context_embeddings_op = bilm(self.context_elmo)
        question_embeddings_op = bilm(self.qn_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        #compute the final ELMo representations.
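        # weight_layers returns a dict; 'weighted_op' is the learned scalar-weighted
        # sum of the biLM layers (plus a trainable scaling factor).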
        self.elmo_context_input = weight_layers(
            'input', context_embeddings_op,
            l2_coef=0.001)['weighted_op']  # (batch_size, context_len, 1024)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.001)['weighted_op']

        if output_use:
            self.elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.001)['weighted_op']
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                self.elmo_question_output = weight_layers(
                    'output', question_embeddings_op,
                    l2_coef=0.001)['weighted_op']

    def build_graph(self):
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  # (batch_size, context_len, 1024 + self.FLAGS.embedding_size)

        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        #qn_embs_concat.set_shape((None, None, 1024+self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("start"):
            softmax_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_start.build_graph(
                context_embs_concat, self.context_mask)
        with tf.variable_scope("end"):
            softmax_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_end.build_graph(
                context_embs_concat, self.context_mask)

    def add_loss(self):
        with tf.variable_scope("loss"):
            # Calculate loss for prediction of start position
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_start,
                labels=self.ans_span[:,
                                     0])  # loss_start has shape (batch_size)
            self.loss_start = tf.reduce_mean(loss_start)
            tf.summary.scalar('loss_start', self.loss_start)

            # Calculate loss for prediction of end position
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_end, labels=self.ans_span[:, 1])
            self.loss_end = tf.reduce_mean(loss_end)
            tf.summary.scalar('loss_end', self.loss_end)

            # Add two losses
            self.loss = self.loss_start + self.loss_end
            tf.summary.scalar('loss', self.loss)

    def run_train_iter(self, session, batch, summary_writer):
        input_feed = {}
        input_feed[self.context_ids] = batch.context_ids
        input_feed[self.context_mask] = batch.context_mask

        #NOTE: CHANGE added context_char
        #input_feed[self.context_char] = batch.context_char
        input_feed[self.context_elmo] = self.batcher.batch_sentences(
            batch.context_tokens)

        input_feed[self.qn_ids] = batch.qn_ids
        input_feed[self.qn_mask] = batch.qn_mask

        #NOTE: CHANGE added qn_char
        #input_feed[self.qn_char] = batch.qn_char
        input_feed[self.qn_elmo] = self.batcher.batch_sentences(
            batch.qn_tokens)

        input_feed[self.ans_span] = batch.ans_span
        input_feed[self.keep_prob] = 1.0 - self.FLAGS.dropout  # apply dropout

        output_feed = [
            self.updates, self.summaries, self.loss, self.global_step,
            self.param_norm, self.gradient_norm
        ]

        #output_feed = [self.elmo_context_input]
        [_, summaries, loss, global_step, param_norm,
         gradient_norm] = session.run(output_feed, feed_dict=input_feed)

        print("FINISHED")

    def train(self, session, train_context_path, train_qn_path, train_ans_path,
              dev_qn_path, dev_context_path, dev_ans_path):
        summary_writer = tf.summary.FileWriter(
            "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad", session.graph)
        for batch in get_batch_generator(self.word2id,
                                         self.char2id,
                                         train_context_path,
                                         train_qn_path,
                                         train_ans_path,
                                         self.FLAGS.batch_size,
                                         self.FLAGS.context_len,
                                         self.FLAGS.question_len,
                                         self.FLAGS.max_word_len,
                                         discard_long=True):
            self.sample_batch = batch

            self.run_train_iter(session, batch, summary_writer)
            break
Beispiel #30
0
        sess.run(tf.global_variables_initializer())
        #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        for batch in get_batch_generator(word2id,
                                         char2id,
                                         train_context_path,
                                         train_qn_path,
                                         train_ans_path,
                                         batch_size,
                                         context_len,
                                         question_len,
                                         max_word_len,
                                         discard_long=True):

            # Create batches of data.
            input_feed = {}
            input_feed[context_elmo] = batcher.batch_sentences(
                batch.context_tokens)
            input_feed[question_elmo] = batcher.batch_sentences(
                batch.qn_tokens)
            input_feed[context_ids] = batch.context_ids
            input_feed[context_mask] = batch.context_mask
            input_feed[qn_ids] = batch.qn_ids
            input_feed[qn_mask] = batch.qn_mask
            input_feed[ans_span] = batch.ans_span
            input_feed[keep_prob] = dropout

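            # batch_sentences prepends <S> and appends </S>, hence the "- 2"
            # in the raw token counts printed below.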
            print("first context length: ",
                  len(input_feed[context_elmo][0]) - 2)
            print("second context length: ",
                  len(input_feed[context_elmo][1]) - 2)
            print("first question length: ",
                  len(input_feed[question_elmo][0]) - 2)