def __init__(self, args, next_element):
        super().__init__(args)

        self.chunk_id, self.words, self.words_len,\
        self.begin_span, self.end_span, self.spans_len,\
        self.cand_entities, self.cand_entities_ids, self.cand_entities_scores, self.cand_entities_labels,\
        self.cand_entities_len, self.ground_truth, self.ground_truth_len,\
        self.begin_gm, self.end_gm = next_element

        self.begin_span = tf.cast(self.begin_span, tf.int32)
        self.end_span = tf.cast(self.end_span, tf.int32)
        self.words_len = tf.cast(self.words_len, tf.int32)

        base = '/home/ubuntu/end2end_neural_el/'
        options_file = base + "data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        weight_file = base + "data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        token_embedding_file = base+"data/vocabulary/" + 'embeddings.hdf5'
        #wiki_embedding_file = base+"data/vocabulary/" + 'wiki_embeddings_light.hdf5'

        self.bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file
        )

        self.entity_bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file,
            max_batch_size=20000
        )
        """
        self.words:  tf.int64, shape=[None, None]   # shape = (batch size, max length of sentence in batch)
        self.words_len: tf.int32, shape=[None],     #   shape = (batch size)
        self.chars: tf.int64, shape=[None, None, None], # shape = (batch size, max length of sentence, max length of word)
        self.chars_len: tf.int64, shape=[None, None],   # shape = (batch_size, max_length of sentence)
        self.begin_span: tf.int32, shape=[None, None],  # shape = (batch_size, max number of candidate spans in one of the batch sentences)
        self.end_span: tf.int32, shape=[None, None],
        self.spans_len: tf.int64, shape=[None],     # shape = (batch size)
        self.cand_entities: tf.int64, shape=[None, None, None],  # shape = (batch size, max number of candidate spans, max number of cand entities)
        self.cand_entities_scores: tf.float32, shape=[None, None, None],
        self.cand_entities_labels: tf.int64, shape=[None, None, None],
        # shape = (batch_size, max number of candidate spans)
        self.cand_entities_len: tf.int64, shape=[None, None],
        self.ground_truth: tf.int64, shape=[None, None],  # shape = (batch_size, max number of candidate spans)
        self.ground_truth_len: tf.int64, shape=[None],    # shape = (batch_size)
        self.begin_gm: tf.int64, shape=[None, None],  # shape = (batch_size, max number of gold mentions)
        self.end_gm: tf.int64, shape=[None, None],  # shape = (batch_size, max number of gold mentions)
        """

        with open(config.base_folder +"data/tfrecords/" + self.args.experiment_name +
                          "/word_char_maps.pickle", 'rb') as handle:
            _, id2word, _, id2char, _, _ = pickle.load(handle)
            self.nwords = len(id2word)
            self.nchars = len(id2char)

        self.loss_mask = self._sequence_mask_v13(self.cand_entities_len, tf.shape(self.cand_entities_scores)[2])
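
A minimal sketch of what the `_sequence_mask_v13` helper (not shown in this snippet) might look like, assuming from its name and call site that it emulates `tf.sequence_mask` with a float result:

import tensorflow as tf

def _sequence_mask_v13(lengths, maxlen):
    # Hypothetical stand-in for the helper called above. tf.sequence_mask
    # broadcasts over leading dimensions, so for cand_entities_len of shape
    # [batch, max spans] this returns a [batch, max spans, maxlen] mask with
    # ones over the valid candidate entities and zeros elsewhere.
    return tf.cast(tf.sequence_mask(lengths, maxlen), tf.float32)
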
    def _embed_ids(self):
        print('[launch] embed_ids, use_ELMO')
        with tf.name_scope('text_embedding_layer'):

            # Build the biLM graph.
            if self.params.USE_CHAR_ELMO:
                bilm = BidirectionalLanguageModel(
                    options_file=self.data_path + self.params.ELMO_OPTIONS,
                    weight_file=self.data_path + self.params.ELMO_WEIGHTS,
                    max_batch_size=self.params.batch_size *
                    self.params.MAX_SENTENCES)
            else:
                bilm = BidirectionalLanguageModel(
                    options_file=self.data_path + self.params.ELMO_OPTIONS,
                    weight_file=self.data_path + self.params.ELMO_WEIGHTS,
                    use_character_inputs=False,
                    embedding_weight_file=self.data_path +
                    self.params.ELMO_TOKEN,
                    max_batch_size=self.params.batch_size *
                    self.params.MAX_SENTENCES)

            # question
            self.embed_q_op = bilm(self.batch_q)
            self.elmo_q_output = weight_layers('output',
                                               self.embed_q_op,
                                               l2_coef=0.0)
            self.embed_q_inter = self.elmo_q_output['weighted_op']
            '''
            self.q_len_to_pad = self.params.MAX_LENGTH_Q - tf.reduce_max( self.batch_len_q ) -1
            self.q_len_to_pad = tf.maximum(self.q_len_to_pad, 0)
            self.embed_q = tf.pad( self.embed_q_inter, [[0,0], [0, self.q_len_to_pad], [0,0]] )
            '''
            self.embed_q = self.embed_q_inter

            # sentence
            self.embed_s_op = bilm(self.batch_s)
            with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                self.elmo_s_output = weight_layers('output',
                                                   self.embed_s_op,
                                                   l2_coef=0.0)
            self.embed_s_inter = self.elmo_s_output['weighted_op']

            self.s_len_to_pad = self.params.MAX_SENTENCES - tf.reduce_max(
                self.batch_len_s) - 1
            self.s_len_to_pad = tf.maximum(self.s_len_to_pad, 0)
            #self.embed_s = tf.pad( self.embed_s_inter, [[0,0], [0, self.s_len_to_pad], [0,0]] )

            # [batch_size, max_len (data dependent), elmo_embedding]
            self.embed_q = self.embed_q_inter

            # [batch_size, MAX_SENTENCES, max_len (data dependent), elmo_embedding]
            self.embed_s = tf.reshape(self.embed_s_inter, [
                self.params.batch_size, self.params.MAX_SENTENCES, -1,
                self.params.DIM_WORD_EMBEDDING
            ])
Example #3
    def bilm_build_graph(options_file, weight_file):
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(context_elmo)
        question_embeddings_op = bilm(question_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        elmo_context_input = weight_layers('input',
                                           context_embeddings_op,
                                           l2_coef=0.0)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input',
                                                question_embeddings_op,
                                                l2_coef=0.0)['weighted_op']
        """
        elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0
        )['weighted_op']

        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.0
            )

        """
        return elmo_context_input, elmo_question_input
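
`context_elmo` and `question_elmo` are free variables above; a sketch of how such character-id inputs are typically set up with the bilm Batcher (the vocabulary path and shapes here are assumptions):

import tensorflow as tf
from bilm import Batcher

# Assumed setup for the placeholders consumed by bilm_build_graph: a Batcher
# maps tokenized text to character ids (50 characters per token is the usual
# ELMo setting), and the placeholders receive its output via feed_dict.
batcher = Batcher('vocab.txt', 50)
context_elmo = tf.placeholder('int32', shape=(None, None, 50))
question_elmo = tf.placeholder('int32', shape=(None, None, 50))
# e.g. feed_dict={context_elmo: batcher.batch_sentences(tokenized_contexts)}
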
    def __init__(self):
        self.vocab_file = 'vocab_small.txt'
        # Location of pretrained LM.  Here we use the test fixtures.
        datadir = os.path.join('pretrained')
        options_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
        weight_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

        # Dump the token embeddings to a file. Run this once for your dataset.
        token_embedding_file = 'elmo_token_embeddings.hdf5'
        dump_token_embeddings(self.vocab_file, options_file, weight_file,
                              token_embedding_file)

        self.batcher = TokenBatcher(self.vocab_file)
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file)
        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                context_embeddings_op,
                                                l2_coef=0.0)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    vocab_file = os.path.join(directory, 'vocab.txt.gz')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
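
A minimal usage sketch for the returned triple (the model directory and sentences below are placeholders):

import tensorflow as tf

batcher, character_ids, elmo_sentence_input = load_elmo_embeddings('elmo_model_dir')
sentences = [['This', 'is', 'a', 'sentence', '.']]

with tf.Session() as sess:
    # The biLM variables must be initialized before running inference.
    sess.run(tf.global_variables_initializer())
    char_ids = batcher.batch_sentences(sentences)
    elmo_vectors = sess.run(elmo_sentence_input['weighted_op'],
                            feed_dict={character_ids: char_ids})
    # elmo_vectors has shape (n_sentences, n_tokens, elmo_dim)
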
    def __init__(self, session, bilm_params):
        self.params = bilm_params

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(self.params.vocab_file,
                               self.params.max_char_len)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.placeholder(
            'int32', shape=(None, None, self.params.max_char_len))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            self.params.options_file,
            self.params.weights_file,
        )

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 l2_coef=0.0,
                                                 use_top_only=True)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())
Example #7
    def build(self,
              data,
              options_file,
              weight_file,
              token_embedding_file,
              m1,
              m2,
              a1,
              a2,
              a3,
              length=20,
              dim=128,
              batch_sizeK=1024,
              save_path='this-model.ckpt',
              data_save_path='this-data.bin',
              M1_path=None):
        self.data = data
        self.dim = dim
        self.length = self.data.length = length
        self.batch_sizeK = batch_sizeK
        self.data_save_path = data_save_path
        self.save_path = save_path
        self.M1_path = M1_path
        self.bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file,
            max_batch_size=512)

        self.tf_parts = model.TFParts(m1, m2, a1, a2, a3, self.bilm, length,
                                      dim, token_embedding_file, batch_sizeK)
Example #8
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')
    with open(options_file, 'r') as f:
        m_options = json.load(f)
    max_chars = m_options['char_cnn']['max_characters_per_token']

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, max_chars)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.compat.v1.placeholder('int32', shape=(None, None, max_chars))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=128)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
Example #9
def load_elmo_embeddings(directory, top=True):
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file,
                                      weight_file,
                                      max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    # Our model includes ELMo at both the input and output layers
    # of the task GRU, so we need 2x ELMo representations at each of the input and output.

    elmo_sentence_input = weight_layers('input',
                                        sentence_embeddings_op,
                                        use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
Example #10
 def build(self, options_file, weight_file, vocab_file, token_embedding_file):
     self._bilm = BidirectionalLanguageModel(
         options_file,
         weight_file,
         use_character_inputs=False,
         embedding_weight_file=token_embedding_file,
         max_batch_size = self.max_batch)
     self._token_batcher = TokenBatcher(vocab_file)
Example #11
def _get_elmo_bilm():
    return BidirectionalLanguageModel(
        os.path.join(DIR_PATH,
                     'elmo_2x1024_128_2048cnn_1xhighway_options.json'),
        os.path.join(DIR_PATH,
                     'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'),
        use_character_inputs=False,
        embedding_weight_file=os.path.join(DIR_PATH,
                                           'elmo_token_embeddings.hdf5'))
    def __init__(self,
                 options_file: str = DEFAULT_OPTIONS_FILE,
                 weight_file: str = DEFAULT_WEIGHT_FILE,
                 dims: int = 1024,
                 embedding_file=None) -> None:
        """
        Parameters
        ----------
        options_file : ``str``, optional
            A path or URL to an ELMo options file.
        weight_file : ``str``, optional
            A path or URL to an ELMo weights file.
        """
        if options_file is None:
            options_file = DEFAULT_OPTIONS_FILE
        if weight_file is None:
            weight_file = DEFAULT_WEIGHT_FILE

        self.options_file_path = cached_path(options_file)
        self.weight_file_path = cached_path(weight_file)
        with open(self.options_file_path, 'r') as fin:
            options = json.load(fin)
        self.max_word_length = options['char_cnn']['max_characters_per_token']
        self.dims = dims
        self.word_embedding_file = embedding_file
        # char file begin
        if self.word_embedding_file is None:
            self.ids_placeholder = tf.placeholder('int32',
                                                  shape=(None, None,
                                                         self.max_word_length))
            self.model = BidirectionalLanguageModel(self.options_file_path,
                                                    self.weight_file_path)
        # char file end
        else:
            self.ids_placeholder = tf.placeholder('int32', shape=(None, None))
            self.model = BidirectionalLanguageModel(
                self.options_file_path,
                self.weight_file_path,
                use_character_inputs=False,
                embedding_weight_file=self.word_embedding_file)

        self.ops = self.model(self.ids_placeholder)
Example #13
    def __init__(self, model_path):
        vocab_file = os.path.join(model_path, 'vocabs.txt')
        options_file = os.path.join(model_path, 'options.json')
        weight_file = os.path.join(model_path, 'weights.hdf5')
        with open(options_file, "r") as fj:
            options = json.load(fj)
        self.max_characters_per_token = options['char_cnn']['max_characters_per_token']        

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, self.max_characters_per_token)
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(options_file, weight_file)
Example #14
    def get_bilm(self):
        token_embedding_file = './ELMo/{}dim/DaGuanElmo_{}dim.hdf5'.format(
            self.elmo_dim, self.elmo_dim)
        options_file = './ELMo/{}dim/options.json'.format(self.elmo_dim)
        weight_file = './ELMo/{}dim/weights.hdf5'.format(self.elmo_dim)
        bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file)

        return bilm
Example #15
 def __lambda_layer(x):
     import tensorflow as tf
     from utils.files import ProjectPath
     from bilm import BidirectionalLanguageModel, all_layers
     x_input = tf.cast(x, tf.int32)
     input_dir = ProjectPath.from_dict(path_dict)
     options_file: str = input_dir.join("options.json").get()
     weight_file: str = input_dir.join("weights.hdf5").get()
     with tf.variable_scope('', reuse=tf.AUTO_REUSE):
         bilm = BidirectionalLanguageModel(options_file, weight_file)
         embedding_op = bilm(x_input)
         return all_layers(embedding_op)
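
Judging by its name, this function is meant to feed a Keras Lambda layer; a hedged sketch of that wiring, written as if it sits in the same enclosing scope (the input layer below is an assumption, not part of the original code):

import keras

# Assumed usage: character ids of shape (batch, tokens, 50) are mapped
# through the biLM inside a Lambda layer, which here returns the stacked
# biLM layers from all_layers().
char_ids = keras.layers.Input(shape=(None, 50), dtype='int32')
elmo_layers = keras.layers.Lambda(__lambda_layer)(char_ids)
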
Example #16
    def __init__(self, config):
        self.lr = config["lr"]
        self.input_dropout = config["dropout"]
        self.lstm_dim = config["lstm_dim"]
        self.layer_type = config["layer_type"]
        self.use_attention = config["attention"]
        self.num_attention_heads = config['num_attention_heads']
        self.size_per_head = config['size_per_head']
        self.num_tags = 7
        self.char_dim = 300
        self.global_step = tf.Variable(0, trainable=False)
        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
        self.initializer = initializers.xavier_initializer()

        # elmo
        self.batcher = TokenBatcher(config['vocab_file'])
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(
            config['options_file'],
            config['weight_file'],
            use_character_inputs=False,
            embedding_weight_file=config['token_embedding_file'])
        self.context_embeddings_op = self.bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                self.context_embeddings_op,
                                                l2_coef=0.0)['weighted_op']

        # add placeholders for the model
        self.mask_inputs = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="ChatInputs")
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=[None, None],
                                      name="Targets")

        # dropout keep prob
        self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")
        used = tf.sign(tf.abs(self.mask_inputs))
        length = tf.reduce_sum(used, reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.mask_inputs)[0]
        self.num_steps = tf.shape(self.mask_inputs)[-1]

        self.logits = self.inference(self.elmo_context_input)
        # loss of the model
        self.loss = self.loss_layer(self.logits, self.lengths)
        self.train_op = self.train(self.loss)
        # saver of the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
Example #17
    def _load_embeddings(self,
                         vocab="vocab.txt",
                         options="elmo_options.json",
                         weights="elmo_weights.hdf5"):
        self.elmo_model = BidirectionalLanguageModel(options, weights)
        self.batcher = Batcher(vocab, 50)

        self.character_ids = tf.placeholder('int32', shape=(None, None, 50))
        context_embeddings_op = self.elmo_model(self.character_ids)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)

        tf.global_variables_initializer().run()
Example #18
 def build(self, input_shape):
     self.elmo_model = BidirectionalLanguageModel(self.options_file,
                                                  self.weight_file,
                                                  max_batch_size=32)
     self.W = self.add_weight(name='W',
                              shape=(3, ),
                              initializer=keras.initializers.get('zeros'),
                              trainable=True)
     self.gamma = self.add_weight(
         name='gamma',
         shape=(1, ),
         initializer=keras.initializers.get('ones'),
         trainable=True)
     super(ELMoEmbedding, self).build(input_shape)
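
The W and gamma weights mirror ELMo's scalar mix; a sketch of the call method such a layer might pair with (an assumption, not part of the original snippet, relying on the 'lm_embeddings' op the biLM returns):

    def call(self, inputs):
        # Hypothetical companion to build(): ELMo's scalar mix,
        # gamma * sum_k softmax(W)_k * layer_k over the three biLM layers.
        import tensorflow as tf
        lm_ops = self.elmo_model(tf.cast(inputs, tf.int32))
        lm_embeddings = lm_ops['lm_embeddings']   # (batch, 3, tokens, dim)
        normed_w = tf.nn.softmax(self.W)          # (3,)
        mixed = tf.einsum('k,bktd->btd', normed_w, lm_embeddings)
        return self.gamma * mixed
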
Example #19
    def __init__(self, config, trainable=True, dev=False, graph=None):
        self.config = config
        self.graph = graph if graph is not None else tf.Graph()
        with self.graph.as_default():
            self.N = config.batch_size if (trainable
                                           or dev) else config.test_batch_size
            self.QL = config.ques_limit if (trainable
                                            or dev) else config.test_ques_limit

            self.global_step = tf.get_variable(
                'global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)
            self.qa_id = tf.placeholder(tf.int32, [self.N], "qa_id")
            self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
            self.que1 = tf.placeholder(tf.int32, [self.N, self.QL + 2],
                                       "question1")
            self.que2 = tf.placeholder(tf.int32, [self.N, self.QL + 2],
                                       "question2")
            self.label = tf.placeholder(tf.int32, [self.N, 2], "label")

            # elmo
            self.bilm = BidirectionalLanguageModel(
                config.elmo_options_file,
                config.elmo_weight_file,
                use_character_inputs=False,
                embedding_weight_file=config.embedding_file)

        model = BiLSTMModel(self.que1, self.que2, self.label, self.bilm,
                            self.dropout, self.N, self.QL, config.qqp_hidden,
                            True)
        self.loss, self.pred_label = model.build_model()
        _, pos_prob = tf.split(self.pred_label, [1, 1], axis=1)
        self.pos_prob = tf.reshape(pos_prob, [-1])

        if trainable:
            self.lr = config.ml_learning_rate
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                              beta1=0.8,
                                              beta2=0.999,
                                              epsilon=1e-7)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(gradients,
                                                     config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
    def add_elmo_embedding_layer(self,
                                 options_file,
                                 weight_file,
                                 output_use=False):
        """
        Adds ELMo lstm embeddings to the graph.
        1. self.elmo_context_input (batch size, max_context_len among the batch, 1024)
        2. self.elmo_question_input (batch size, max_qn_len among the batch, 1024)
        If output_use is True:
            self.elmo_context_output and self.elmo_question_output are added to the graph as well.

        Inputs:
            options_file: json_file for the pretrained model
            weight_file: weights hdf5 file for the pretrained model
            output_use: whether to also use ELMo at the output of the biRNN (default False)

        """
        #Build biLM graph
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        context_embeddings_op = bilm(self.context_elmo)
        question_embeddings_op = bilm(self.qn_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        #compute the final ELMo representations.
        self.elmo_context_input = weight_layers(
            'input', context_embeddings_op, l2_coef=0.001
        )['weighted_op']  #(batch size, max_context_len among the batch, 1024)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.001)['weighted_op']

        if output_use:
            self.elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.001)['weighted_op']
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                self.elmo_question_output = weight_layers(
                    'output', question_embeddings_op,
                    l2_coef=0.001)['weighted_op']
Example #21
 def __init__(self, config):
     super(NERModel, self).__init__(config)
     self.idx_to_tag = {
         idx: tag
         for tag, idx in list(self.config.vocab_tags.items())
     }
     if self.config.use_elmo:
         # self.elmo_inputs = []
         self.batcher = Batcher(self.config.filename_words, 50)
         self.bilm = BidirectionalLanguageModel(
             self.config.filename_elmo_options,
             self.config.filename_elmo_weights)
         self.elmo_token_ids = tf.placeholder('int32',
                                              shape=(None, None, 50))
         self.elmo_embeddings_op = self.bilm(self.elmo_token_ids)
         self.elmo_embeddings_input = weight_layers('input',
                                                    self.elmo_embeddings_op,
                                                    l2_coef=0.0)
Example #22
    def word_embedding(self):
        bilm = BidirectionalLanguageModel(
            self.options_file,
            self.weight_file,
            use_character_inputs=False,
            embedding_weight_file=self.token_embedding_file)
        context_embeddings_op = bilm(self.W_P)
        question_embeddings_op = bilm(self.W_Q)

        elmo_context_input = weight_layers('input',
                                           context_embeddings_op,
                                           l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input',
                                                question_embeddings_op,
                                                l2_coef=0.0)
        self.p_embed, self.q_embed = elmo_context_input[
            'weighted_op'], elmo_question_input['weighted_op']
Example #23
 def __lambda_layer(x):
     import tensorflow as tf
     from utils.files import ProjectPath
     from bilm import BidirectionalLanguageModel, all_layers, weight_layers
     x_input = tf.cast(x, tf.int32)
     input_dir = ProjectPath.from_dict(path_dict)
     options_file: str = input_dir.join("options.json").get()
     weight_file: str = input_dir.join("weights.hdf5").get()
     with tf.variable_scope('', reuse=tf.AUTO_REUSE):
         bilm = BidirectionalLanguageModel(options_file, weight_file)
         embedding_op = bilm(x_input)
         if mode == "weighted":
             return all_layers(embedding_op)
         else:
             context_input = weight_layers('input',
                                           embedding_op,
                                           l2_coef=0.0,
                                           use_top_only=(mode == "top"))
             return context_input['weighted_op']
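
`path_dict` and `mode` are free variables here, which suggests the function is defined inside a factory that captures them; a sketch of that enclosing pattern (the factory name is illustrative and the inner body is elided):

import keras

def build_elmo_lambda(path_dict, mode="weighted"):
    # Hypothetical enclosing factory: path_dict and mode are captured by the
    # closure, which is why they appear undefined inside __lambda_layer.
    def __lambda_layer(x):
        ...  # body exactly as in the snippet above
    return keras.layers.Lambda(__lambda_layer)
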
Example #24
def get_elmo_embeddings(config):

    batcher = Batcher(config.filename_words, 50)

    token_ids = tf.placeholder('int32', shape=(None, None, 50))
    bilm = BidirectionalLanguageModel(
        config.filename_elmo_options,
        config.filename_elmo_weights,
    )

    elmo_embeddings_op = bilm(token_ids)
    elmo_context_input = weight_layers('input',
                                       elmo_embeddings_op,
                                       l2_coef=0.0)

    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.

        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        train = CoNLLDataset(config.filename_train)
        sents_train = [entry[0] for entry in train]
        sent_ids_train = batcher.batch_sentences(sents_train)

        # Compute ELMo representations (here for the input only, for simplicity).
        # batch_sentences pads every sentence to a common length, so the
        # per-sentence results can be concatenated along the batch axis.
        elmo_input = sess.run(elmo_context_input['weighted_op'],
                              feed_dict={token_ids: sent_ids_train[0:1]})
        for batch in sent_ids_train[1:]:
            elmo_input_ = sess.run(elmo_context_input['weighted_op'],
                                   feed_dict={token_ids: batch[np.newaxis]})
            elmo_input = np.concatenate((elmo_input, elmo_input_), axis=0)

        test = CoNLLDataset(config.filename_test)
        sents_test = [entry[0] for entry in test]
        sent_ids_test = batcher.batch_sentences(sents_test)

        elmo_context_output_ = sess.run(elmo_context_input['weighted_op'],
                                        feed_dict={token_ids: sent_ids_test})

    return elmo_input, elmo_context_output_
Example #25
def elmo_embedding(options_file, weight_file, token_a_character_ids,
                   token_b_character_ids):
    # Input placeholders to the biLM.
    # token_a_character_ids = tf.placeholder('int32', shape=(None, None, 50))
    # token_b_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file)

    # Get ops to compute the LM embeddings.
    token_a_embeddings_op = bilm(token_a_character_ids)
    token_b_embeddings_op = bilm(token_b_character_ids)

    elmo_token_a = weight_layers('input', token_a_embeddings_op, l2_coef=0.0)
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope reuses weights from the context for the question
        elmo_token_b = weight_layers('input',
                                     token_b_embeddings_op,
                                     l2_coef=0.0)

    return elmo_token_a['weighted_op'], elmo_token_b['weighted_op']
Example #26
    def __init__(self, config, word_mat, char_mat, mix=False, dev=False, trainable=True):
        self.config = config
        self.trainable = trainable
        self.N = (config.batch_size * 2 if mix else config.batch_size) if (trainable or dev) else config.test_batch_size
        self.PL = config.para_limit if (trainable or dev) else config.test_para_limit
        self.QL = config.ques_limit if (trainable or dev) else config.test_ques_limit
        self.AL = config.ans_limit if (trainable or dev) else config.test_ans_limit
        self.CL = config.char_limit

        self.d = config.qa_hidden
        self.dc = config.char_dim

        self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                           initializer=tf.constant_initializer(0), trainable=False)
        self.qa_id = tf.placeholder(tf.int32, [self.N], "qa_id")
        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
        self.para = tf.placeholder(tf.int32, [self.N, self.PL + 2], "paragraph")
        self.para_char = tf.placeholder(tf.int32, [self.N, self.PL, self.CL], "paragraph_char")
        self.que = tf.placeholder(tf.int32, [self.N, self.QL + 2], "question")
        self.que_char = tf.placeholder(tf.int32, [self.N, self.QL, self.CL], "question_char")
        self.y1 = tf.placeholder(tf.int32, [self.N, self.PL], "answer_index1")
        self.y2 = tf.placeholder(tf.int32, [self.N, self.PL], "answer_index2")
        self.labels = tf.placeholder_with_default(tf.ones([self.N], dtype=tf.int32), [self.N], name="labels")

        _, self.para1, _ = tf.split(self.para, [1, self.PL, 1], axis=1)
        _, self.que1, _ = tf.split(self.que, [1, self.QL, 1], axis=1)
        self.para_mask = tf.cast(self.para1, tf.bool)
        self.que_mask = tf.cast(self.que1, tf.bool)
        self.para_len = tf.reduce_sum(tf.cast(self.para_mask, tf.int32), axis=-1)
        self.que_len = tf.reduce_sum(tf.cast(self.que_mask, tf.int32), axis=-1)

        with tf.device("/cpu:0"):
            self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32),
                                            trainable=config.word_trainable)
            self.char_mat = tf.get_variable("char_mat", initializer=tf.constant(char_mat, dtype=tf.float32),
                                            trainable=True)
        # elmo
        self.elmo_bilm = BidirectionalLanguageModel(config.elmo_options_file, config.elmo_weight_file,
                                                    use_character_inputs=False,
                                                    embedding_weight_file=config.embedding_file)
def dump_token_bilm_embeddings(vocab_file, dataset_file, options_file,
                               weight_file, embedding_weight_file, outfile):

    batcher = TokenBatcher(vocab_file)

    ids_placeholder = tf.placeholder('int32', shape=(None, None))

    model = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=embedding_weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, \
                h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                token_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: token_ids})
                embedding = embeddings[0, :, :, :]
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embedding.shape,
                                         dtype='float32',
                                         data=embedding)
                # static_token_emb = embedding[0, :, :]
                # first_layer_emb = embedding[1, :, :]
                # final_layer_emb = embedding[2, :, :]
                # avg_emb = np.mean(embedding, axis=0)  # average embedding of the three layers
                sentence_id += 1
                if sentence_id % 500 == 0:
                    print('%.2f%% finished!' %
                          (sentence_id / float(EXAMPLE_COUNT) * 100))
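
The dump writes one HDF5 dataset per sentence, keyed by its running index, each of shape (n_layers, n_tokens, dim); a short sketch of reading it back (the file name stands in for whatever was passed as outfile):

import h5py
import numpy as np

# Read back the per-sentence biLM embeddings written above.
with h5py.File('token_bilm_embeddings.hdf5', 'r') as fin:
    first_sentence = fin['0'][...]             # (n_layers, n_tokens, dim)
    avg_emb = np.mean(first_sentence, axis=0)  # layer average, as hinted at
                                               # in the commented-out lines
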
Example #28
def make_elmo(chars_batched):
    bilm = BidirectionalLanguageModel(
                    options_file="data/elmo_2x4096_512_2048cnn_2xhighway_options.json",
                    weight_file="data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
                    max_batch_size=128)

    lm = bilm(chars_batched)
    word_representations_padded = weight_layers('scalar_mix', lm, l2_coef=0.0)['weighted_op']

    # Strip off multiplication by gamma. Our parser has gamma=1 because there is a
    # projection matrix right after
    word_representations_padded = word_representations_padded.op.inputs[0]

    with tf.variable_scope('', reuse=True):
        elmo_scalar_mix_matrix = tf.get_variable('scalar_mix_ELMo_W')

    tf.global_variables_initializer().run()
    tf.assign(elmo_scalar_mix_matrix, [
        float(sd['elmo.scalar_mix_0.scalar_parameters.0']),
        float(sd['elmo.scalar_mix_0.scalar_parameters.1']),
        float(sd['elmo.scalar_mix_0.scalar_parameters.2'])]).eval()

    # Switch from padded to packed representation
    valid_mask = lm['mask']
    dim_padded = tf.shape(lm['mask'])[:2]
    mask_flat = tf.reshape(lm['mask'], (-1,))
    dim_flat = tf.shape(mask_flat)[:1]
    nonpad_ids = tf.to_int32(tf.where(mask_flat)[:,0])
    word_reps_shape = tf.shape(word_representations_padded)
    word_representations_flat = tf.reshape(word_representations_padded, [-1, int(word_representations_padded.shape[-1])])
    word_representations = tf.gather(word_representations_flat, nonpad_ids)

    projected_annotations = tf.matmul(
        word_representations,
        tf.constant(sd['project_elmo.weight'].numpy().transpose()))

    return projected_annotations, nonpad_ids, dim_flat, dim_padded, valid_mask, lm['lengths']
Example #29
    def __init__(
        self,
        request_names=['train', 'valid', 'test'],
        new_names=['train', 'valid', 'test'],
        classes_name='classes',
        op_type='vectorizer',
        op_name='elmo',
        dimension=1024,
        file_type='bin',  #TODO: ?
        options_file='./embeddingsruwiki_pp_1.0_elmo/options.json',  #TODO: ?
        weights_file='./embeddingsruwiki_pp_1.0_elmo/weights.hdf5',  #TODO: ?
        vocab_file='./embeddingsruwiki_pp_1.0_elmo/vocab.txt'  #TODO: ?
    ):
        super().__init__(request_names, new_names, op_type, op_name)
        self.file_type = file_type
        self.classes_name = classes_name
        self.dimension = dimension
        # Location of pretrained LM.
        self.options_file = options_file
        self.weights_file = weights_file
        self.vocab_file = vocab_file
        # Create a Batcher to map text to character ids.
        char_per_token = 50
        self.batcher = Batcher(self.vocab_file, char_per_token)
        # Input placeholders to the biLM.
        self.character_ids = tf.placeholder('int32',
                                            shape=(None, None, char_per_token))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(self.options_file, self.weights_file)

        # Get ops to compute the LM embeddings.
        embeddings_op = bilm(self.character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        self.elmo_output = weight_layers('elmo_output',
                                         embeddings_op,
                                         l2_coef=0.0)
Example #30
    def __init__(self, path=embedding_path, embedding_dim=512,
                 sentence_len=max_sentence_len, pair_mode=False):
        embeddings = dict()

        self.embedding_path = path
        self.embedding_dim = embedding_dim
        self.sentence_len = sentence_len
        self.pair_mode = pair_mode
        self.embedding_dict = embeddings

        g_elmo = tf.Graph()
        vocab_file = './bilmelmo/data/vocab.txt'
        options_file = './bilmelmo/try/options.json'
        weight_file = './bilmelmo/try/weights.hdf5'
        token_embedding_file = './bilmelmo/data/vocab_embedding.hdf5'

        with g_elmo.as_default():
            self.batcher = TokenBatcher(vocab_file)
            self.context_token_ids = tf.placeholder('int32', shape=(None, None))
            self.bilm = BidirectionalLanguageModel(
                options_file,
                weight_file,
                use_character_inputs=False,
                embedding_weight_file=token_embedding_file
            )

            self.context_embeddings_op = self.bilm(self.context_token_ids)
            self.elmo_context_input = weight_layers('input', self.context_embeddings_op, l2_coef=0.0)

            self.elmo_context_output = weight_layers(
                'output', self.context_embeddings_op, l2_coef=0.0
            )
            init = tf.global_variables_initializer()
        sess_elmo = tf.Session(graph=g_elmo)
        sess_elmo.run(init)
        self.sess_elmo = sess_elmo
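
A hedged sketch of how such a wrapper might then embed a batch of tokenized sentences (the method name is an assumption, not part of the original class):

    def embed_batch(self, tokenized_sentences):
        # Hypothetical convenience method: map tokens to ids with the
        # TokenBatcher and evaluate the input-side ELMo representation
        # inside the session bound to g_elmo.
        token_ids = self.batcher.batch_sentences(tokenized_sentences)
        return self.sess_elmo.run(
            self.elmo_context_input['weighted_op'],
            feed_dict={self.context_token_ids: token_ids})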