def __init__(self, args, next_element):
    super().__init__(args)
    self.chunk_id, self.words, self.words_len,\
        self.begin_span, self.end_span, self.spans_len,\
        self.cand_entities, self.cand_entities_ids, self.cand_entities_scores, self.cand_entities_labels,\
        self.cand_entities_len, self.ground_truth, self.ground_truth_len,\
        self.begin_gm, self.end_gm = next_element

    self.begin_span = tf.cast(self.begin_span, tf.int32)
    self.end_span = tf.cast(self.end_span, tf.int32)
    self.words_len = tf.cast(self.words_len, tf.int32)

    base = '/home/ubuntu/end2end_neural_el/'
    options_file = base + "data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_options.json"
    weight_file = base + "data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
    token_embedding_file = base + "data/vocabulary/" + 'embeddings.hdf5'
    # wiki_embedding_file = base + "data/vocabulary/" + 'wiki_embeddings_light.hdf5'

    self.bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file
    )
    self.entity_bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file,
        max_batch_size=20000
    )

    """
    Shapes of the tensors unpacked from next_element:
    self.words: tf.int64, shape=[None, None]                  # (batch size, max length of sentence in batch)
    self.words_len: tf.int32, shape=[None]                    # (batch size)
    self.chars: tf.int64, shape=[None, None, None]            # (batch size, max length of sentence, max length of word)
    self.chars_len: tf.int64, shape=[None, None]              # (batch size, max length of sentence)
    self.begin_span: tf.int32, shape=[None, None]             # (batch size, max number of candidate spans in one of the batch sentences)
    self.end_span: tf.int32, shape=[None, None]
    self.spans_len: tf.int64, shape=[None]                    # (batch size)
    self.cand_entities: tf.int64, shape=[None, None, None]    # (batch size, max number of candidate spans, max number of cand entities)
    self.cand_entities_scores: tf.float32, shape=[None, None, None]
    self.cand_entities_labels: tf.int64, shape=[None, None, None]
    self.cand_entities_len: tf.int64, shape=[None, None]      # (batch size, max number of candidate spans)
    self.ground_truth: tf.int64, shape=[None, None]           # (batch size, max number of candidate spans)
    self.ground_truth_len: tf.int64, shape=[None]             # (batch size)
    self.begin_gm: tf.int64, shape=[None, None]               # (batch size, max number of gold mentions)
    self.end_gm: tf.int64, shape=[None, None]
    """

    with open(config.base_folder + "data/tfrecords/" + self.args.experiment_name +
              "/word_char_maps.pickle", 'rb') as handle:
        _, id2word, _, id2char, _, _ = pickle.load(handle)
        self.nwords = len(id2word)
        self.nchars = len(id2char)

    self.loss_mask = self._sequence_mask_v13(self.cand_entities_len,
                                             tf.shape(self.cand_entities_scores)[2])
def _embed_ids(self):
    print('[launch] embed_ids, use_ELMO')
    with tf.name_scope('text_embedding_layer'):
        # Build the biLM graph.
        if self.params.USE_CHAR_ELMO:
            bilm = BidirectionalLanguageModel(
                options_file=self.data_path + self.params.ELMO_OPTIONS,
                weight_file=self.data_path + self.params.ELMO_WEIGHTS,
                max_batch_size=self.params.batch_size * self.params.MAX_SENTENCES)
        else:
            bilm = BidirectionalLanguageModel(
                options_file=self.data_path + self.params.ELMO_OPTIONS,
                weight_file=self.data_path + self.params.ELMO_WEIGHTS,
                use_character_inputs=False,
                embedding_weight_file=self.data_path + self.params.ELMO_TOKEN,
                max_batch_size=self.params.batch_size * self.params.MAX_SENTENCES)

        # question
        self.embed_q_op = bilm(self.batch_q)
        self.elmo_q_output = weight_layers('output', self.embed_q_op, l2_coef=0.0)
        self.embed_q_inter = self.elmo_q_output['weighted_op']
        '''
        self.q_len_to_pad = self.params.MAX_LENGTH_Q - tf.reduce_max(self.batch_len_q) - 1
        self.q_len_to_pad = tf.maximum(self.q_len_to_pad, 0)
        self.embed_q = tf.pad(self.embed_q_inter, [[0, 0], [0, self.q_len_to_pad], [0, 0]])
        '''
        self.embed_q = self.embed_q_inter

        # sentence
        self.embed_s_op = bilm(self.batch_s)
        with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            self.elmo_s_output = weight_layers('output', self.embed_s_op, l2_coef=0.0)
        self.embed_s_inter = self.elmo_s_output['weighted_op']

        self.s_len_to_pad = self.params.MAX_SENTENCES - tf.reduce_max(self.batch_len_s) - 1
        self.s_len_to_pad = tf.maximum(self.s_len_to_pad, 0)
        # self.embed_s = tf.pad(self.embed_s_inter, [[0, 0], [0, self.s_len_to_pad], [0, 0]])

        # [batch_size, max_len (data dependent), elmo_embedding]
        self.embed_q = self.embed_q_inter
        # [batch_size, MAX_SENTENCES, max_len (data dependent), elmo_embedding]
        self.embed_s = tf.reshape(self.embed_s_inter, [
            self.params.batch_size, self.params.MAX_SENTENCES, -1,
            self.params.DIM_WORD_EMBEDDING
        ])
def bilm_build_graph(options_file, weight_file):
    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file)

    # Get ops to compute the LM embeddings.
    context_embeddings_op = bilm(context_elmo)
    question_embeddings_op = bilm(question_elmo)

    # Get an op to compute ELMo (weighted average of the internal biLM layers).
    # Our SQuAD model includes ELMo at both the input and output layers
    # of the task GRU, so we need 4x ELMo representations for the question
    # and context at each of the input and output.
    # We use the same ELMo weights for both the question and context
    # at each of the input and output.
    elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)['weighted_op']
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope reuses weights from the context for the question
        elmo_question_input = weight_layers('input', question_embeddings_op,
                                            l2_coef=0.0)['weighted_op']

    """
    elmo_context_output = weight_layers(
        'output', context_embeddings_op, l2_coef=0.0
    )['weighted_op']
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope reuses weights from the context for the question
        elmo_question_output = weight_layers(
            'output', question_embeddings_op, l2_coef=0.0
        )
    """

    return elmo_context_input, elmo_question_input
def __init__(self):
    self.vocab_file = 'vocab_small.txt'

    # Location of pretrained LM. Here we use the test fixtures.
    datadir = os.path.join('pretrained')
    options_file = os.path.join(datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
    weight_file = os.path.join(datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

    # Dump the token embeddings to a file. Run this once for your dataset.
    token_embedding_file = 'elmo_token_embeddings.hdf5'
    dump_token_embeddings(self.vocab_file, options_file, weight_file, token_embedding_file)

    self.batcher = TokenBatcher(self.vocab_file)

    # Input placeholders to the biLM.
    self.context_token_ids = tf.placeholder('int32', shape=(None, None))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file)

    # Get ops to compute the LM embeddings.
    context_embeddings_op = bilm(self.context_token_ids)
    self.elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
    self.elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model
                      ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    vocab_file = os.path.join(directory, 'vocab.txt.gz')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers).
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)

    return batcher, sentence_character_ids, elmo_sentence_input
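# A minimal usage sketch for load_elmo_embeddings above. The model directory and the
# example sentences are hypothetical; the Batcher / placeholder / session pattern
# follows the standard bilm API returned by the function.
import tensorflow as tf

batcher, char_ids_ph, elmo_op = load_elmo_embeddings('/path/to/elmo_model')  # hypothetical path

sentences = [['Hello', 'world', '.'], ['Another', 'sentence']]
char_ids = batcher.batch_sentences(sentences)  # character ids, one row per sentence

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    vectors = sess.run(elmo_op['weighted_op'], feed_dict={char_ids_ph: char_ids})
    print(vectors.shape)  # (n_sentences, max_sentence_len, elmo_dim)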
def __init__(self, session, bilm_params):
    self.params = bilm_params

    # Create a Batcher to map text to character ids.
    self.batcher = Batcher(self.params.vocab_file, self.params.max_char_len)

    # Input placeholders to the biLM.
    self.sentence_character_ids = tf.placeholder(
        'int32', shape=(None, None, self.params.max_char_len))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(
        self.params.options_file,
        self.params.weights_file,
    )

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(self.sentence_character_ids)
    self.elmo_sentence_input = weight_layers('input', sentence_embeddings_op,
                                             l2_coef=0.0, use_top_only=True)

    self.sess = session
    self.sess.run(tf.global_variables_initializer())
def build(self, data, options_file, weight_file, token_embedding_file,
          m1, m2, a1, a2, a3,
          length=20, dim=128, batch_sizeK=1024,
          save_path='this-model.ckpt', data_save_path='this-data.bin', M1_path=None):
    self.data = data
    self.dim = dim
    self.length = self.data.length = length
    self.batch_sizeK = batch_sizeK
    self.data_save_path = data_save_path
    self.save_path = save_path
    self.M1_path = M1_path

    self.bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file,
        max_batch_size=512)

    self.tf_parts = model.TFParts(m1, m2, a1, a2, a3, self.bilm, length, dim,
                                  token_embedding_file, batch_sizeK)
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model
                      ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    with open(options_file, 'r') as f:
        m_options = json.load(f)
    max_chars = m_options['char_cnn']['max_characters_per_token']

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, max_chars)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.compat.v1.placeholder('int32', shape=(None, None, max_chars))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=128)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers).
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)

    return batcher, sentence_character_ids, elmo_sentence_input
def load_elmo_embeddings(directory, top=True):
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers).
    # Our model includes ELMo at both the input and output layers of the task GRU,
    # so we need 2x ELMo representations at each of the input and output.
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)

    return batcher, sentence_character_ids, elmo_sentence_input
def build(self, options_file, weight_file, vocab_file, token_embedding_file):
    self._bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file,
        max_batch_size=self.max_batch)
    self._token_batcher = TokenBatcher(vocab_file)
def _get_elmo_bilm():
    return BidirectionalLanguageModel(
        os.path.join(DIR_PATH, 'elmo_2x1024_128_2048cnn_1xhighway_options.json'),
        os.path.join(DIR_PATH, 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'),
        use_character_inputs=False,
        embedding_weight_file=os.path.join(DIR_PATH, 'elmo_token_embeddings.hdf5'))
def __init__(self,
             options_file: str = DEFAULT_OPTIONS_FILE,
             weight_file: str = DEFAULT_WEIGHT_FILE,
             dims: int = 1024,
             embedding_file=None) -> None:
    """
    Parameters
    ----------
    options_file : ``str``, optional
        A path or URL to an ELMo options file.
    weight_file : ``str``, optional
        A path or URL to an ELMo weights file.
    """
    if options_file is None:
        options_file = DEFAULT_OPTIONS_FILE
    if weight_file is None:
        weight_file = DEFAULT_WEIGHT_FILE

    self.options_file_path = cached_path(options_file)
    self.weight_file_path = cached_path(weight_file)
    with open(self.options_file_path, 'r') as fin:
        options = json.load(fin)
    self.max_word_length = options['char_cnn']['max_characters_per_token']
    self.dims = dims
    self.word_embedding_file = embedding_file

    if self.word_embedding_file is None:
        # character inputs
        self.ids_placeholder = tf.placeholder('int32',
                                              shape=(None, None, self.max_word_length))
        self.model = BidirectionalLanguageModel(self.options_file_path,
                                                self.weight_file_path)
    else:
        # precomputed token embedding file
        self.ids_placeholder = tf.placeholder('int32', shape=(None, None))
        self.model = BidirectionalLanguageModel(self.options_file_path,
                                                self.weight_file_path,
                                                False,
                                                self.word_embedding_file)
    self.ops = self.model(self.ids_placeholder)
def __init__(self, model_path):
    vocab_file = os.path.join(model_path, 'vocabs.txt')
    options_file = os.path.join(model_path, 'options.json')
    weight_file = os.path.join(model_path, 'weights.hdf5')

    with open(options_file, "r") as fj:
        options = json.load(fj)
    self.max_characters_per_token = options['char_cnn']['max_characters_per_token']

    # Create a Batcher to map text to character ids.
    self.batcher = Batcher(vocab_file, self.max_characters_per_token)

    # Build the biLM graph.
    self.bilm = BidirectionalLanguageModel(options_file, weight_file)
def get_bilm(self):
    token_embedding_file = './ELMo/{}dim/DaGuanElmo_{}dim.hdf5'.format(
        self.elmo_dim, self.elmo_dim)
    options_file = './ELMo/{}dim/options.json'.format(self.elmo_dim)
    weight_file = './ELMo/{}dim/weights.hdf5'.format(self.elmo_dim)
    bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file)
    return bilm
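# A hedged sketch of how the biLM returned by get_bilm could be applied to token ids.
# The method name, the vocab file path, and the './ELMo/{}dim/' layout are assumptions;
# only the TokenBatcher / weight_layers calls follow the bilm API.
import tensorflow as tf
from bilm import TokenBatcher, weight_layers

def elmo_features(self, sentences):
    batcher = TokenBatcher('./ELMo/{}dim/vocab.txt'.format(self.elmo_dim))  # hypothetical vocab path
    token_ids = tf.placeholder('int32', shape=(None, None))
    bilm = self.get_bilm()
    embeddings_op = bilm(token_ids)
    elmo_input = weight_layers('input', embeddings_op, l2_coef=0.0)['weighted_op']
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(elmo_input,
                        feed_dict={token_ids: batcher.batch_sentences(sentences)})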
def __lambda_layer(x):
    import tensorflow as tf
    from utils.files import ProjectPath
    from bilm import BidirectionalLanguageModel, all_layers

    x_input = tf.cast(x, tf.int32)
    input_dir = ProjectPath.from_dict(path_dict)
    options_file: str = input_dir.join("options.json").get()
    weight_file: str = input_dir.join("weights.hdf5").get()
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        embedding_op = bilm(x_input)
        return all_layers(embedding_op)
def __init__(self, config):
    self.lr = config["lr"]
    self.input_dropout = config["dropout"]
    self.lstm_dim = config["lstm_dim"]
    self.layer_type = config["layer_type"]
    self.use_attention = config["attention"]
    self.num_attention_heads = config['num_attention_heads']
    self.size_per_head = config['size_per_head']
    self.num_tags = 7
    self.char_dim = 300

    self.global_step = tf.Variable(0, trainable=False)
    self.best_dev_f1 = tf.Variable(0.0, trainable=False)
    self.initializer = initializers.xavier_initializer()

    # elmo
    self.batcher = TokenBatcher(config['vocab_file'])

    # Input placeholders to the biLM.
    self.context_token_ids = tf.placeholder('int32', shape=(None, None))

    # Build the biLM graph.
    self.bilm = BidirectionalLanguageModel(
        config['options_file'],
        config['weight_file'],
        use_character_inputs=False,
        embedding_weight_file=config['token_embedding_file'])
    self.context_embeddings_op = self.bilm(self.context_token_ids)
    self.elmo_context_input = weight_layers('input', self.context_embeddings_op,
                                            l2_coef=0.0)['weighted_op']

    # add placeholders for the model
    self.mask_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="ChatInputs")
    self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="Targets")
    # dropout keep prob
    self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")

    used = tf.sign(tf.abs(self.mask_inputs))
    length = tf.reduce_sum(used, reduction_indices=1)
    self.lengths = tf.cast(length, tf.int32)
    self.batch_size = tf.shape(self.mask_inputs)[0]
    self.num_steps = tf.shape(self.mask_inputs)[-1]

    self.logits = self.inference(self.elmo_context_input)

    # loss of the model
    self.loss = self.loss_layer(self.logits, self.lengths)
    self.train_op = self.train(self.loss)

    # saver of the model
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
def _load_embeddings(self, vocab="vocab.txt", options="elmo_options.json",
                     weights="elmo_weights.hdf5"):
    self.elmo_model = BidirectionalLanguageModel(options, weights)
    self.batcher = Batcher(vocab, 50)
    self.character_ids = tf.placeholder('int32', shape=(None, None, 50))
    context_embeddings_op = self.elmo_model(self.character_ids)
    self.elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)
    tf.global_variables_initializer().run()
def build(self, input_shape):
    self.elmo_model = BidirectionalLanguageModel(self.options_file, self.weight_file,
                                                 max_batch_size=32)
    self.W = self.add_weight(name='W',
                             shape=(3,),
                             initializer=keras.initializers.get('zeros'),
                             trainable=True)
    self.gamma = self.add_weight(name='gamma',
                                 shape=(1,),
                                 initializer=keras.initializers.get('ones'),
                                 trainable=True)
    super(ELMoEmbedding, self).build(input_shape)
def __init__(self, config, trainable=True, dev=False, graph=None):
    self.config = config
    self.graph = graph if graph is not None else tf.Graph()
    with self.graph.as_default():
        self.N = config.batch_size if (trainable or dev) else config.test_batch_size
        self.QL = config.ques_limit if (trainable or dev) else config.test_ques_limit

        self.global_step = tf.get_variable(
            'global_step', shape=[], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)
        self.qa_id = tf.placeholder(tf.int32, [self.N], "qa_id")
        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
        self.que1 = tf.placeholder(tf.int32, [self.N, self.QL + 2], "question1")
        self.que2 = tf.placeholder(tf.int32, [self.N, self.QL + 2], "question2")
        self.label = tf.placeholder(tf.int32, [self.N, 2], "label")

        # elmo
        self.bilm = BidirectionalLanguageModel(
            config.elmo_options_file,
            config.elmo_weight_file,
            use_character_inputs=False,
            embedding_weight_file=config.embedding_file)

        model = BiLSTMModel(self.que1, self.que2, self.label, self.bilm, self.dropout,
                            self.N, self.QL, config.qqp_hidden, True)
        self.loss, self.pred_label = model.build_model()
        _, pos_prob = tf.split(self.pred_label, [1, 1], axis=1)
        self.pos_prob = tf.reshape(pos_prob, [-1])

        if trainable:
            self.lr = config.ml_learning_rate
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                              beta1=0.8, beta2=0.999, epsilon=1e-7)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
def add_elmo_embedding_layer(self, options_file, weight_file, output_use=False):
    """
    Adds ELMo lstm embeddings to the graph.

    1. self.elmo_context_input  (batch size, max_context_len among the batch, 1024)
    2. self.elmo_question_input (batch size, max_qn_len among the batch, 1024)
    If output_use is True, the output-layer representations are added to the graph as well.

    Inputs:
      options_file: json file for the pretrained model
      weight_file: weights hdf5 file for the pretrained model
      output_use: whether to also use ELMo at the output of the biRNN (default False)
    """
    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file)
    context_embeddings_op = bilm(self.context_elmo)
    question_embeddings_op = bilm(self.qn_elmo)

    # Get an op to compute ELMo (weighted average of the internal biLM layers).
    # Our SQuAD model includes ELMo at both the input and output layers
    # of the task GRU, so we need 4x ELMo representations for the question
    # and context at each of the input and output.
    # We use the same ELMo weights for both the question and context
    # at each of the input and output.

    # Compute the final ELMo representations.
    self.elmo_context_input = weight_layers(
        'input', context_embeddings_op, l2_coef=0.001
    )['weighted_op']  # (batch size, max_context_len among the batch, 1024)
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope reuses weights from the context for the question
        self.elmo_question_input = weight_layers(
            'input', question_embeddings_op, l2_coef=0.001)['weighted_op']

    if output_use:
        self.elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.001)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.001)['weighted_op']
def __init__(self, config):
    super(NERModel, self).__init__(config)
    self.idx_to_tag = {
        idx: tag
        for tag, idx in list(self.config.vocab_tags.items())
    }

    if self.config.use_elmo:
        # self.elmo_inputs = []
        self.batcher = Batcher(self.config.filename_words, 50)
        self.bilm = BidirectionalLanguageModel(
            self.config.filename_elmo_options,
            self.config.filename_elmo_weights)
        self.elmo_token_ids = tf.placeholder('int32', shape=(None, None, 50))
        self.elmo_embeddings_op = self.bilm(self.elmo_token_ids)
        self.elmo_embeddings_input = weight_layers('input', self.elmo_embeddings_op,
                                                   l2_coef=0.0)
def word_embedding(self):
    bilm = BidirectionalLanguageModel(
        self.options_file,
        self.weight_file,
        use_character_inputs=False,
        embedding_weight_file=self.token_embedding_file)
    context_embeddings_op = bilm(self.W_P)
    question_embeddings_op = bilm(self.W_Q)

    elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope reuses weights from the context for the question
        elmo_question_input = weight_layers('input', question_embeddings_op, l2_coef=0.0)

    self.p_embed, self.q_embed = (elmo_context_input['weighted_op'],
                                  elmo_question_input['weighted_op'])
def __lambda_layer(x):
    import tensorflow as tf
    from utils.files import ProjectPath
    from bilm import BidirectionalLanguageModel, all_layers, weight_layers

    x_input = tf.cast(x, tf.int32)
    input_dir = ProjectPath.from_dict(path_dict)
    options_file: str = input_dir.join("options.json").get()
    weight_file: str = input_dir.join("weights.hdf5").get()
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        embedding_op = bilm(x_input)
        if mode == "weighted":
            return all_layers(embedding_op)
        else:
            context_input = weight_layers('input', embedding_op, l2_coef=0.0,
                                          use_top_only=(mode == "top"))
            return context_input['weighted_op']
def get_elmo_embeddings(config):
    batcher = Batcher(config.filename_words, 50)
    token_ids = tf.placeholder('int32', shape=(None, None, 50))
    bilm = BidirectionalLanguageModel(
        config.filename_elmo_options,
        config.filename_elmo_weights,
    )
    elmo_embeddings_op = bilm(token_ids)
    elmo_context_input = weight_layers('input', elmo_embeddings_op, l2_coef=0.0)

    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        train = CoNLLDataset(config.filename_train)
        sents_train = [entry[0] for entry in train]
        sent_ids_train = batcher.batch_sentences(sents_train)

        # Compute ELMo representations (here for the input only, for simplicity).
        elmo_input = sess.run([elmo_context_input['weighted_op']],
                              feed_dict={token_ids: sent_ids_train[0]})
        for batch in sent_ids_train[1:]:
            elmo_input_ = sess.run([elmo_context_input['weighted_op']],
                                   feed_dict={token_ids: batch})
            elmo_input = np.hstack((elmo_input, elmo_input_))

        test = CoNLLDataset(config.filename_test)
        sents_test = [entry[0] for entry in test]
        sent_ids_test = batcher.batch_sentences(sents_test)
        elmo_context_output_ = sess.run([elmo_context_input['weighted_op']],
                                        feed_dict={token_ids: sent_ids_test})

    return elmo_input, elmo_context_output_
def elmo_embedding(options_file, weight_file, token_a_character_ids, token_b_character_ids):
    # Input placeholders to the biLM.
    # token_a_character_ids = tf.placeholder('int32', shape=(None, None, 50))
    # token_b_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file)

    # Get ops to compute the LM embeddings.
    token_a_embeddings_op = bilm(token_a_character_ids)
    token_b_embeddings_op = bilm(token_b_character_ids)

    elmo_token_a = weight_layers('input', token_a_embeddings_op, l2_coef=0.0)
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope reuses weights from the context for the question
        elmo_token_b = weight_layers('input', token_b_embeddings_op, l2_coef=0.0)

    return elmo_token_a['weighted_op'], elmo_token_b['weighted_op']
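# A minimal sketch showing how elmo_embedding above might be wired up with character-id
# placeholders and a Batcher. The file paths and example token lists are hypothetical;
# the Batcher / session pattern follows standard bilm usage.
import tensorflow as tf
from bilm import Batcher

token_a_ids = tf.placeholder('int32', shape=(None, None, 50))
token_b_ids = tf.placeholder('int32', shape=(None, None, 50))
elmo_a, elmo_b = elmo_embedding('options.json', 'weights.hdf5', token_a_ids, token_b_ids)

batcher = Batcher('vocab.txt', 50)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    a_vecs, b_vecs = sess.run(
        [elmo_a, elmo_b],
        feed_dict={token_a_ids: batcher.batch_sentences([['first', 'text']]),
                   token_b_ids: batcher.batch_sentences([['second', 'text']])})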
def __init__(self, config, word_mat, char_mat, mix=False, dev=False, trainable=True):
    self.config = config
    self.trainable = trainable
    self.N = (config.batch_size * 2 if mix else config.batch_size) \
        if (trainable or dev) else config.test_batch_size
    self.PL = config.para_limit if (trainable or dev) else config.test_para_limit
    self.QL = config.ques_limit if (trainable or dev) else config.test_ques_limit
    self.AL = config.ans_limit if (trainable or dev) else config.test_ans_limit
    self.CL = config.char_limit
    self.d = config.qa_hidden
    self.dc = config.char_dim

    self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                       initializer=tf.constant_initializer(0),
                                       trainable=False)
    self.qa_id = tf.placeholder(tf.int32, [self.N], "qa_id")
    self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
    self.para = tf.placeholder(tf.int32, [self.N, self.PL + 2], "paragraph")
    self.para_char = tf.placeholder(tf.int32, [self.N, self.PL, self.CL], "paragraph_char")
    self.que = tf.placeholder(tf.int32, [self.N, self.QL + 2], "question")
    self.que_char = tf.placeholder(tf.int32, [self.N, self.QL, self.CL], "question_char")
    self.y1 = tf.placeholder(tf.int32, [self.N, self.PL], "answer_index1")
    self.y2 = tf.placeholder(tf.int32, [self.N, self.PL], "answer_index2")
    self.labels = tf.placeholder_with_default(tf.ones([self.N], dtype=tf.int32),
                                              (self.N), name="labels")

    _, self.para1, _ = tf.split(self.para, [1, self.PL, 1], axis=1)
    _, self.que1, _ = tf.split(self.que, [1, self.QL, 1], axis=1)
    self.para_mask = tf.cast(self.para1, tf.bool)
    self.que_mask = tf.cast(self.que1, tf.bool)
    self.para_len = tf.reduce_sum(tf.cast(self.para_mask, tf.int32), axis=-1)
    self.que_len = tf.reduce_sum(tf.cast(self.que_mask, tf.int32), axis=-1)

    with tf.device("/cpu:0"):
        self.word_mat = tf.get_variable("word_mat",
                                        initializer=tf.constant(word_mat, dtype=tf.float32),
                                        trainable=config.word_trainable)
        self.char_mat = tf.get_variable("char_mat",
                                        initializer=tf.constant(char_mat, dtype=tf.float32),
                                        trainable=True)

    # elmo
    self.elmo_bilm = BidirectionalLanguageModel(
        config.elmo_options_file,
        config.elmo_weight_file,
        use_character_inputs=False,
        embedding_weight_file=config.embedding_file)
def dump_token_bilm_embeddings(vocab_file, dataset_file, options_file,
                               weight_file, embedding_weight_file, outfile):
    batcher = TokenBatcher(vocab_file)

    ids_placeholder = tf.placeholder('int32', shape=(None, None))
    model = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=embedding_weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, \
                h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                token_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: token_ids})
                embedding = embeddings[0, :, :, :]
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embedding.shape, dtype='float32',
                                         data=embedding)
                # static_token_emb = embedding[0, :, :]
                # first_layer_emb = embedding[1, :, :]
                # final_layer_emb = embedding[2, :, :]
                # avg_emb = np.mean(embedding, axis=0)  # average embedding of the three layers
                sentence_id += 1
                if sentence_id % 500 == 0:
                    print('%.2f%% finished!' % (sentence_id / float(EXAMPLE_COUNT) * 100))
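# A short, hedged sketch of reading back the per-sentence embeddings written by
# dump_token_bilm_embeddings above. The output filename is hypothetical; the dataset
# layout (one dataset per sentence id, shape (3 layers, n_tokens, dim)) follows the
# create_dataset calls in the function.
import h5py
import numpy as np

with h5py.File('token_bilm_embeddings.hdf5', 'r') as fin:
    first_sentence = fin['0'][...]              # (3 layers, n_tokens, dim)
    avg_emb = np.mean(first_sentence, axis=0)   # average of the three biLM layers
    print(first_sentence.shape, avg_emb.shape)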
def make_elmo(chars_batched):
    bilm = BidirectionalLanguageModel(
        options_file="data/elmo_2x4096_512_2048cnn_2xhighway_options.json",
        weight_file="data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
        max_batch_size=128)

    lm = bilm(chars_batched)
    word_representations_padded = weight_layers('scalar_mix', lm, l2_coef=0.0)['weighted_op']

    # Strip off multiplication by gamma. Our parser has gamma=1 because there is a
    # projection matrix right after.
    word_representations_padded = word_representations_padded.op.inputs[0]

    with tf.variable_scope('', reuse=True):
        elmo_scalar_mix_matrix = tf.get_variable('scalar_mix_ELMo_W')

    tf.global_variables_initializer().run()
    tf.assign(elmo_scalar_mix_matrix, [
        float(sd['elmo.scalar_mix_0.scalar_parameters.0']),
        float(sd['elmo.scalar_mix_0.scalar_parameters.1']),
        float(sd['elmo.scalar_mix_0.scalar_parameters.2'])]).eval()

    # Switch from padded to packed representation
    valid_mask = lm['mask']
    dim_padded = tf.shape(lm['mask'])[:2]
    mask_flat = tf.reshape(lm['mask'], (-1,))
    dim_flat = tf.shape(mask_flat)[:1]
    nonpad_ids = tf.to_int32(tf.where(mask_flat)[:, 0])

    word_reps_shape = tf.shape(word_representations_padded)
    word_representations_flat = tf.reshape(
        word_representations_padded,
        [-1, int(word_representations_padded.shape[-1])])
    word_representations = tf.gather(word_representations_flat, nonpad_ids)

    projected_annotations = tf.matmul(
        word_representations,
        tf.constant(sd['project_elmo.weight'].numpy().transpose()))

    return projected_annotations, nonpad_ids, dim_flat, dim_padded, valid_mask, lm['lengths']
def __init__(
        self,
        request_names=['train', 'valid', 'test'],
        new_names=['train', 'valid', 'test'],
        classes_name='classes',
        op_type='vectorizer',
        op_name='elmo',
        dimension=1024,
        file_type='bin',  # TODO: ?
        options_file='./embeddingsruwiki_pp_1.0_elmo/options.json',  # TODO: ?
        weights_file='./embeddingsruwiki_pp_1.0_elmo/weights.hdf5',  # TODO: ?
        vocab_file='./embeddingsruwiki_pp_1.0_elmo/vocab.txt'  # TODO: ?
        ):
    super().__init__(request_names, new_names, op_type, op_name)
    self.file_type = file_type
    self.classes_name = classes_name
    self.dimension = dimension

    # Location of pretrained LM.
    self.options_file = options_file
    self.weights_file = weights_file
    self.vocab_file = vocab_file

    # Create a Batcher to map text to character ids.
    char_per_token = 50
    self.batcher = Batcher(self.vocab_file, char_per_token)

    # Input placeholders to the biLM.
    self.character_ids = tf.placeholder('int32', shape=(None, None, char_per_token))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(self.options_file, self.weights_file)

    # Get ops to compute the LM embeddings.
    embeddings_op = bilm(self.character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers).
    self.elmo_output = weight_layers('elmo_output', embeddings_op, l2_coef=0.0)
def __init__(self, path=embedding_path, embedding_dim=512,
             sentence_len=max_sentence_len, pair_mode=False):
    embeddings = dict()
    self.embedding_path = path
    self.embedding_dim = embedding_dim
    self.sentence_len = sentence_len
    self.pair_mode = pair_mode
    self.embedding_dict = embeddings

    vocab_file = './bilmelmo/data/vocab.txt'
    options_file = './bilmelmo/try/options.json'
    weight_file = './bilmelmo/try/weights.hdf5'
    token_embedding_file = './bilmelmo/data/vocab_embedding.hdf5'

    with tf.Graph().as_default() as g_elmo:
        self.batcher = TokenBatcher(vocab_file)
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        self.bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file
        )
        self.context_embeddings_op = self.bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input', self.context_embeddings_op,
                                                l2_coef=0.0)
        self.elmo_context_output = weight_layers('output', self.context_embeddings_op,
                                                 l2_coef=0.0)
        init = tf.global_variables_initializer()

    sess_elmo = tf.Session(graph=g_elmo)
    sess_elmo.run(init)
    self.sess_elmo = sess_elmo
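# A hedged sketch of a lookup helper for the graph built in __init__ above. The method
# name `embed_sentences` is hypothetical; it only illustrates how self.batcher,
# self.context_token_ids, self.elmo_context_input and self.sess_elmo fit together.
def embed_sentences(self, tokenized_sentences):
    # tokenized_sentences: list of token lists, e.g. [['a', 'sentence'], ...]
    token_ids = self.batcher.batch_sentences(tokenized_sentences)
    return self.sess_elmo.run(
        self.elmo_context_input['weighted_op'],
        feed_dict={self.context_token_ids: token_ids})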