def load_ELMo_data(filename, seq_len, entity_len):
    """Load a data file and pack per-example ELMo token-id features.

    Args:
        filename: Path handed to ``read_data``.
        seq_len: Maximum number of tokens kept per example; also the length
            of the 0/1 padding mask.
        entity_len: Maximum number of entity tokens kept per example.

    Returns:
        An object array of shape ``(num_examples, 4)``. Each row holds
        ``[entity_ids, token_ids, mask, real_len]`` where ``mask`` is a list
        of ``seq_len`` ints (1 for a real token, 0 for padding) and
        ``real_len`` is the token count after truncation to ``seq_len``.
    """
    vocab_file = "./ELMo_file/vocab.txt"
    batcher = TokenBatcher(vocab_file)
    entity_list, token_list, _ = read_data(filename)
    num_examples = len(token_list)

    # Truncate before batching so the batcher pads to at most the cap.
    token_id_list = [tokens[:seq_len] for tokens in token_list]
    entity_id_list = [entities[:entity_len]
                      for entities in entity_list[:num_examples]]

    seq_lens_list = [min(len(tokens), seq_len) for tokens in token_list]
    real_chars_list = [[1] * real_len + [0] * (seq_len - real_len)
                       for real_len in seq_lens_list]

    entity_pad = batcher.batch_sentences(entity_id_list)
    token_pad = batcher.batch_sentences(token_id_list)
    print("The shape of tokens after loading vocab:", token_pad.shape)

    # Pack one row per example. The four fields have different shapes, so
    # the container is filled cell by cell as an explicit object array;
    # np.array() on such ragged rows raises on recent NumPy releases.
    features = np.empty((num_examples, 4), dtype=object)
    for index in range(num_examples):
        features[index, 0] = entity_pad[index]
        features[index, 1] = token_pad[index]
        features[index, 2] = real_chars_list[index]
        features[index, 3] = seq_lens_list[index]
    return features
class elmo():
    """Wrapper that builds a token-input biLM once and serves ELMo vectors.

    The constructor dumps token embeddings for ``vocab_small.txt``, builds
    the biLM graph on a token-id placeholder and creates the ``'input'`` and
    ``'output'`` weighted-layer ops. ``get_emb`` evaluates both ops for a
    batch of tokenized sentences.
    """

    def __init__(self):
        self.vocab_file = 'vocab_small.txt'

        # Location of pretrained LM. Here we use the test fixtures.
        datadir = os.path.join('pretrained')
        options_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
        weight_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

        # Dump the token embeddings to a file. Run this once for your dataset.
        token_embedding_file = 'elmo_token_embeddings.hdf5'
        dump_token_embeddings(self.vocab_file, options_file, weight_file,
                              token_embedding_file)
        # Start the inference graph from a clean slate after the dump. The
        # reset used to live in get_emb(), where it discarded the graph
        # that owns the ops built below.
        tf.reset_default_graph()

        self.batcher = TokenBatcher(self.vocab_file)

        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file)

        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers(
            'input', context_embeddings_op, l2_coef=0.0)
        self.elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0)

    def get_emb(self, tokenized_context):
        """Return the ELMo input and output representations for a batch.

        Args:
            tokenized_context: Iterable of token lists, one per sentence.

        Returns:
            ``(input_vectors, output_vectors)``; each is a one-element list
            wrapping the array produced by the matching ``'weighted_op'``.
        """
        all_tokens = {'<S>', '</S>'}
        for context_sentence in tokenized_context:
            all_tokens.update(context_sentence)
        # NOTE(review): the batcher and the dumped token embeddings were
        # created in __init__, so rewriting the vocabulary file here does
        # not change how this instance maps tokens - confirm this is wanted.
        with open(self.vocab_file, 'w') as fout:
            fout.write('\n'.join(all_tokens))

        # Create batches of data.
        context_ids = self.batcher.batch_sentences(tokenized_context)

        # Run in the graph that owns the ops, whatever the current default
        # graph is when this method is called.
        graph = self.context_token_ids.graph
        with graph.as_default(), tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            # Both ops share the same feed, so one run fetches them together.
            input_vectors, output_vectors = sess.run(
                [self.elmo_context_input['weighted_op'],
                 self.elmo_context_output['weighted_op']],
                feed_dict={self.context_token_ids: context_ids})

        # Each result stays wrapped in a list, as callers receive it today.
        return [input_vectors], [output_vectors]
def contextualize(sequences):
    """Return contextualized ELMo output vectors for tokenized sequences.

    Relies on the module-level ``vocab_file``, ``elmo_context_output`` and
    ``context_token_ids``.

    Args:
        sequences: Tokenized sentences passed to the token batcher.

    Returns:
        A one-element list holding the evaluated ``'weighted_op'`` of
        ``elmo_context_output``.
    """
    token_batcher = TokenBatcher(vocab_file)
    with tf.Session() as session:
        # Variables must be initialized once before running inference.
        session.run(tf.global_variables_initializer())

        # Map the tokens to a padded id batch.
        token_id_batch = token_batcher.batch_sentences(sequences)

        fetches = [elmo_context_output['weighted_op']]
        feeds = {context_token_ids: token_id_batch}
        contextualized = session.run(fetches, feed_dict=feeds)
    return contextualized
class elmo_encoder(object):
    """Encode batches of tokenized sentences with a token-input biLM."""

    def __init__(self):
        self.max_batch = 120000
        print ("WARNING: Currently max_batch_size of elmo encoder is set to", self.max_batch)

    def build(self, options_file, weight_file, vocab_file, token_embedding_file):
        """Create the biLM and the token batcher from the given files."""
        self._bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file,
            max_batch_size=self.max_batch)
        self._token_batcher = TokenBatcher(vocab_file)

    @staticmethod
    def _fit_row(row, target_len):
        """Cut or zero-pad one id row to exactly ``target_len`` entries."""
        if len(row) >= target_len:
            return row[:target_len]
        return np.pad(row, (0, target_len - len(row)), 'constant',
                      constant_values=(0))

    def embed_sent_batch(self, sentences, length):
        """Return the ELMo ``'output'`` vectors for a batch of sentences.

        Args:
            sentences: List of word lists, e.g.
                ``[['You', 'see', '?'], ['That', 'is', 'very', 'interesting', '.']]``.
            length: Number of tokens per sentence; two extra positions are
                added for the sentence boundary ids.

        Returns:
            The evaluated weighted ``'output'`` op for the whole batch.
        """
        id_rows = self._token_batcher.batch_sentences(sentences)
        tf.reset_default_graph()

        # Take into account <s> and </s>.
        target_len = length + 2
        fitted_rows = [self._fit_row(row, target_len) for row in id_rows]
        batch_size = len(fitted_rows)
        id_matrix = np.array(fitted_rows)

        with tf.device("/cpu:0"):
            context_token_ids = tf.placeholder(
                'int32', shape=(batch_size, target_len))
            context_embeddings_op = self._bilm(context_token_ids)
            weighted = weight_layers(
                'output', context_embeddings_op, l2_coef=0.0)
            elmo_context_output = weighted['weighted_op']

        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True
        print ('++++++Check_point_1\n')
        with tf.Session(config=session_config) as sess:
            sess.run([tf.global_variables_initializer()])
            fetched = sess.run([elmo_context_output],
                               feed_dict={context_token_ids: id_matrix})
        return fetched[0]
def elmo(reviews, inputData):
    """Compute ELMo input vectors on the fly for one batch of reviews.

    Relies on the module-level ``config`` and ``elmoInput``.

    Args:
        reviews: Tokenized reviews passed to the token batcher.
        inputData: Token-id placeholder that feeds the biLM.

    Returns:
        A one-element list holding the evaluated ``elmoInput["weighted_op"]``.
    """
    # TokenBatcher turns tokenized sentences into padded id batches.
    token_batcher = TokenBatcher(config.vocabFile)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # Build the id batch for this call.
        token_id_batch = token_batcher.batch_sentences(reviews)

        # Evaluate the ELMo vector representation.
        fetches = [elmoInput["weighted_op"]]
        vectors = session.run(fetches, feed_dict={inputData: token_id_batch})
    return vectors
def elmo(reviews):
    """Compute ELMo input vectors on the fly for one batch of reviews.

    Relies on the module-level ``config``, ``elmoInput`` and ``inputData``
    (the token-id placeholder).

    Args:
        reviews: Tokenized reviews passed to the token batcher.

    Returns:
        A one-element list holding the evaluated ``elmoInput['weighted_op']``.
    """
    # TokenBatcher turns tokenized sentences into padded id batches.
    token_batcher = TokenBatcher(config.vocabFile)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # Build the id batch for this call.
        token_id_batch = token_batcher.batch_sentences(reviews)

        # Evaluate the ELMo vector representation.
        fetches = [elmoInput['weighted_op']]
        vectors = session.run(fetches, feed_dict={inputData: token_id_batch})
    return vectors
def dump_token_bilm_embeddings(vocab_file, dataset_file, options_file,
                               weight_file, embedding_weight_file, outfile):
    """Write the biLM embeddings of every dataset line to an HDF5 file.

    Each line of ``dataset_file`` is split on whitespace, run through a
    token-input biLM, and stored in ``outfile`` as a float32 dataset named
    after the zero-based line index.

    Args:
        vocab_file: Vocabulary file for the ``TokenBatcher``.
        dataset_file: Text file with one whitespace-tokenized sentence per line.
        options_file: biLM options file.
        weight_file: biLM weight file.
        embedding_weight_file: Token embedding file for the biLM.
        outfile: Path of the HDF5 file to create (overwritten).

    Progress is printed every 500 sentences relative to the module-level
    ``EXAMPLE_COUNT``.
    """
    batcher = TokenBatcher(vocab_file)
    ids_placeholder = tf.placeholder('int32', shape=(None, None))
    model = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=embedding_weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        with open(dataset_file, 'r') as fin, \
                h5py.File(outfile, 'w') as fout:
            for sentence_id, line in enumerate(fin):
                sentence = line.strip().split()
                token_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(
                    ops['lm_embeddings'],
                    feed_dict={ids_placeholder: token_ids})
                # Drop the batch axis; the batch holds this one sentence.
                embedding = embeddings[0, :, :, :]
                fout.create_dataset('{}'.format(sentence_id),
                                    embedding.shape,
                                    dtype='float32',
                                    data=embedding)

                processed = sentence_id + 1
                if processed % 500 == 0:
                    print('%.2f%% finished!'
                          % (processed / float(EXAMPLE_COUNT) * 100))
class Tokenizer(object):
    """Tokenize text and map tokens to token ids or flattened char ids.

    Args:
        vocab_file: Vocabulary file handed to the batchers.
        max_seq_length: Maximum sequence length including the two boundary
            tokens; the batchers are built with ``max_seq_length - 2``.
        max_token_length: If truthy, a character-level ``Batcher`` is also
            created and ``convert_tokens_to_char_ids`` becomes usable.
        stroke_vocab_file: Passed through to the character ``Batcher``.
        tran2sim: Convert text with the ``"t2s.json"`` OpenCC config.
        sim2tran: Convert text with the ``"s2t.json"`` OpenCC config.

    Raises:
        AssertionError: If both ``tran2sim`` and ``sim2tran`` are set.
    """

    def __init__(self, vocab_file, max_seq_length, max_token_length=None,
                 stroke_vocab_file=None, tran2sim=False, sim2tran=False):
        # Reject contradictory flags before any batcher is built. This is
        # raised explicitly (not via ``assert``) so it still fires under
        # ``python -O``; the exception type is unchanged for callers.
        if tran2sim and sim2tran:
            raise AssertionError(
                "tran2sim and sim2tran are mutually exclusive")

        self.vocab_file = vocab_file
        self.max_seq_length = max_seq_length
        self.max_token_length = max_token_length

        # Minus 2 because <bos> and <eos> will be added.
        max_seq_length = self.max_seq_length - 2
        self.token_batcher = TokenBatcher(self.vocab_file, max_seq_length)
        if max_token_length:
            self.batcher = Batcher(self.vocab_file, self.max_token_length,
                                   max_seq_length, stroke_vocab_file)

        self.convert_config = None
        if tran2sim:
            self.convert_config = "t2s.json"
        elif sim2tran:
            self.convert_config = "s2t.json"

    def convert(self, text):
        """Return ``text`` unchanged or converted by the configured OpenCC config.

        The original author notes the conversion is very slow and not
        recommended.
        """
        if self.convert_config is None:
            return text
        return opencc.convert(text, config=self.convert_config)

    def tokenize(self, text):
        """Split text into tokens, with punctuation as separate tokens.

        For example ``'Pretrained biLMs compute representations useful for
        NLP tasks.'`` becomes ``['Pretrained', 'biLMs', 'compute',
        'representations', 'useful', 'for', 'NLP', 'tasks', '.']``.
        """
        text = self.convert(text)
        text = tokenize_chinese_chars(text)
        text = text.strip()
        tokens = []
        for word in text.split():
            tokens.extend(self._run_split_on_punc(word))
        return tokens

    def convert_tokens_to_ids(self, tokens):
        """Return the token-id row produced by the token batcher for ``tokens``."""
        return self.token_batcher.batch_sentences([tokens])[0]

    def convert_tokens_to_char_ids(self, tokens):
        """Return the char ids of ``tokens`` flattened into one list.

        Args:
            tokens: Output of ``tokenize(text)``.

        Returns:
            The rows returned by the character batcher, concatenated in order.
            Requires the instance to have been built with ``max_token_length``.
        """
        char_ids = self.batcher.batch_sentences([tokens])[0]
        return [char_id for sublist in char_ids for char_id in sublist]

    def _is_punctuation(self, char):
        """Checks whether `char` is a punctuation character."""
        cp = ord(char)
        # We treat all non-letter/number ASCII as punctuation.
        # Characters such as "^", "$", and "`" are not in the Unicode
        # Punctuation class but we treat them as punctuation anyways, for
        # consistency.
        if (33 <= cp <= 47 or 58 <= cp <= 64
                or 91 <= cp <= 96 or 123 <= cp <= 126):
            return True
        return unicodedata.category(char).startswith("P")

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        pieces = []
        start_new_word = True
        for char in text:
            if self._is_punctuation(char):
                # Every punctuation character is its own piece.
                pieces.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    pieces.append([])
                start_new_word = False
                pieces[-1].append(char)
        return ["".join(piece) for piece in pieces]
weight_file, use_character_inputs=False, embedding_weight_file=token_embedding_file) # Get ops to compute the LM embeddings. context_embeddings_op = bilm(context_token_ids) elmo_context_top = weight_layers('output_top_only', context_embeddings_op, l2_coef=0.0, use_top_only=True) elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0) with tf.Session() as sess: # It is necessary to initialize variables once before running inference. sess.run(tf.global_variables_initializer()) # Create batches of data. context_ids = batcher.batch_sentences(sequences) # Input token representations. elmo_context_top_ = sess.run([elmo_context_top['weighted_op']], feed_dict={context_token_ids: context_ids}) # Output token representations. elmo_context_output_ = sess.run([elmo_context_output['weighted_op']], feed_dict={context_token_ids: context_ids}) print(elmo_context_output_ ) #contextualized embedding vector sequences (all layers)
# Get an op to compute ELMo (weighted average of the internal biLM layers) # The following calculates a weighted average of the two layers (equal weights). - These are trainable parameters (including weights and gamma), but we probably won't train them unless we're using this as part of inferSent (?) #TODO to use top layer only, add arg: use_top_only=True elmo_emb = weight_layers('input', input_embeddings_op, l2_coef=0.0, use_top_only=top_layer) batch_size = 32 elmo_size = 1024 with tf.Session() as sess: # It is necessary to initialize variables once before running inference. sess.run(tf.global_variables_initializer()) # Create batches of data. input_ids = batcher.batch_sentences(tokenized_txt) #context_ids, weights = batcher.batch_sentences(tokenized_context) final_res = np.zeros((len(tokenized_txt), elmo_size), dtype=np.float32) #run batches of size 128 for i in range(0, len(tokenized_txt), batch_size): j = i + batch_size if i + batch_size <= len(tokenized_txt) else len( tokenized_txt) elmo_emb_ = sess.run(elmo_emb['weighted_op'], feed_dict={input_token_ids: input_ids[i:j, :]}) #perform averaging here ... res = np.array(elmo_emb_) idx = i for x, sen in zip(res, tokenized_txt[i:j]): avg = np.sum(x, axis=0) / len(sen) final_res[idx] = avg
class EncoderGenerator(object):
    """Encode Train/Test samples into numbers ready for tfrecords.

    Samples come from a ``SamplesGenerator``. Words and entity summaries are
    mapped to ids by a ``TokenBatcher``, and candidate entities that are not
    in the entity universe are filtered out.
    """

    def __init__(self):
        self._generator = SamplesGenerator()
        vocab_path = config.base_folder+"data/vocabulary/"+"vocab_2.txt"
        self._batcher = TokenBatcher(vocab_path)
        self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
        self._wikii2summary = util.load_entity_summary_map()

    def set_gmonly_mode(self):
        self._generator.set_gmonly_mode()

    def set_allspans_mode(self):
        self._generator.set_allspans_mode()

    def is_gmonly_mode(self):
        return self._generator.is_gmonly_mode()

    def is_allspans_mode(self):
        return self._generator.is_allspans_mode()

    def _pack(self, sample, words, ground_truth_enc, begin_spans, end_spans,
              encoded, begin_gm, end_gm):
        """Assemble a ``SampleEncoded`` from one sample's encoded pieces."""
        cand_entities, cand_entities_ids, cand_scores, cand_labels = encoded[:4]
        return SampleEncoded(
            chunk_id=sample.chunk_id,
            words=words,
            words_len=len(words) - 2,
            begin_spans=begin_spans,
            end_spans=end_spans,
            spans_len=len(begin_spans),
            cand_entities=cand_entities,
            cand_entities_ids=cand_entities_ids,
            cand_entities_scores=cand_scores,
            cand_entities_labels=cand_labels,
            # NOTE(review): divides the per-span entity count by 22, while
            # the 22-wide rows live in cand_entities_ids - confirm intent.
            cand_entities_len=[len(t) // 22 for t in cand_entities],
            ground_truth=ground_truth_enc,
            ground_truth_len=len(sample.ground_truth),
            begin_gm=begin_gm,
            end_gm=end_gm)

    def process(self, filepath):
        """Yield one ``SampleEncoded`` per valid sample read from ``filepath``.

        Samples whose span lists disagree in length are counted and skipped.
        Three counters are printed once the input is exhausted.
        """
        ground_truth_errors_cnt = 0
        cand_entities_not_in_universe_cnt = 0
        samples_with_errors = 0
        nnid_of = self._wikiid2nnid

        for sample in self._generator.process(filepath):
            batch = self._batcher.batch_sentences([sample.chunk_words])
            words = batch.tolist()[0]

            ground_truth_enc = [
                nnid_of[gt] if gt in nnid_of else nnid_of["<u>"]
                for gt in sample.ground_truth
            ]
            # The original author notes this count is always zero.
            ground_truth_errors_cnt += ground_truth_enc.count(nnid_of["<u>"])

            gm_count = len(sample.begin_gm)
            if gm_count != len(sample.end_gm) or \
                    gm_count != len(ground_truth_enc):
                samples_with_errors += 1
                continue

            if isinstance(sample, GmonlySample):
                encoded = self._encode_cand_entities_and_labels(
                    sample.cand_entities, sample.cand_entities_scores,
                    sample.ground_truth)
                not_in_universe_cnt = encoded[4]
                yield self._pack(sample, words, ground_truth_enc,
                                 sample.begin_gm, sample.end_gm,
                                 encoded, [], [])
            elif isinstance(sample, AllspansSample):
                if len(sample.begin_spans) != len(sample.end_spans):
                    samples_with_errors += 1
                    continue
                # Every span gets its ground truth, or -1 when the span is
                # not a gold mention; from there it is handled like above.
                # gm_spans looks like [(3, 5), (10, 11), (15, 18)].
                gm_spans = list(zip(sample.begin_gm, sample.end_gm))
                span_ground_truth = [
                    sample.ground_truth[gm_spans.index(span)]
                    if span in gm_spans else -1
                    for span in zip(sample.begin_spans, sample.end_spans)
                ]
                encoded = self._encode_cand_entities_and_labels(
                    sample.cand_entities, sample.cand_entities_scores,
                    span_ground_truth)
                not_in_universe_cnt = encoded[4]
                yield self._pack(sample, words, ground_truth_enc,
                                 sample.begin_spans, sample.end_spans,
                                 encoded, sample.begin_gm, sample.end_gm)

            cand_entities_not_in_universe_cnt += not_in_universe_cnt

        print("ground_truth_errors_cnt =", ground_truth_errors_cnt)
        print("cand_entities_not_in_universe_cnt =", cand_entities_not_in_universe_cnt)
        print("encoder samples_with_errors =", samples_with_errors)

    def _encode_cand_entities_and_labels(self, cand_entities_p,
                                         cand_entities_scores_p,
                                         ground_truth_p):
        """Filter candidate entities and label each kept candidate.

        Receives ``cand_entities_p`` (list of lists), the parallel scores and
        ``ground_truth_p`` (one entry per span) and:

        1) drops candidates that are not in the entity universe;
        2) labels each kept candidate 1 if it equals the span's ground truth
           and 0 otherwise.

        For every kept candidate the summary's non-zero token ids are
        appended to the span's id row, zero-padded up to 22 entries.

        Returns:
            ``(cand_entities, cand_entities_ids, cand_entities_scores,
            cand_entities_labels, not_in_universe_cnt)``.
        """
        cand_entities, cand_entities_ids = [], []
        cand_entities_scores, cand_entities_labels = [], []
        not_in_universe_cnt = 0

        spans = zip(cand_entities_p, cand_entities_scores_p, ground_truth_p)
        for cand_ent_l, cand_scores_l, gt in spans:
            ent_l, ids_l, score_l, label_l = [], [], [], []
            for cand_ent, score in zip(cand_ent_l, cand_scores_l):
                if cand_ent not in self._wikiid2nnid:
                    # This entity is not in our universe.
                    not_in_universe_cnt += 1
                    continue
                summary = self._wikii2summary[cand_ent]
                id_row = self._batcher.batch_sentences([summary]).tolist()[0]
                tokens = [token for token in id_row if token]
                tokens.extend([0] * (22 - len(tokens)))
                ent_l.append(self._wikiid2nnid[cand_ent])
                ids_l.extend(tokens)
                score_l.append(score)
                label_l.append(1 if cand_ent == gt else 0)
            cand_entities.append(ent_l)
            cand_entities_ids.append(ids_l)
            cand_entities_scores.append(score_l)
            cand_entities_labels.append(label_l)

        return (cand_entities, cand_entities_ids, cand_entities_scores,
                cand_entities_labels, not_in_universe_cnt)
def train_svm():
    """Train a linear SVM on features extracted by the saved TextCNN model.

    Uses module-level settings (``train_label_path``, ``train_data_path``,
    ``train_vocab_file``, ``cnn_path``, ``svm_path``, ``batch_size``,
    ``MAX_LEN``, ``label_index``, ``num_classes``). The training texts are
    pushed through the restored graph in full batches, the
    ``g_conv/Squeeze:0`` activations are collected, and an ``svm.SVC`` fitted
    on them is dumped to ``svm_path``.
    """
    print("Loading texts and labels...")
    # load training set
    train_labels, train_text_ids = \
        load_labels(train_label_path, label_index, num_classes, one_hot=False)
    train_texts = load_texts(train_data_path, train_text_ids)
    train_size = len(train_texts)
    tokenized_train_texts = tokenize(train_texts)

    # Create a TokenBatcher to map text to token ids
    batcher = TokenBatcher(train_vocab_file)

    # restore the TextCNN model
    print("Restoring TextCNN model...")
    tf.reset_default_graph()
    cnn_path_dir = os.path.dirname(cnn_path)
    # NOTE(review): os.listdir order is arbitrary; this assumes the last
    # entry is the .meta file - confirm against the checkpoint layout.
    meta_file_name = os.listdir(cnn_path_dir)[-1]
    meta_path = os.path.join(cnn_path_dir, meta_file_name)
    sess = tf.InteractiveSession()
    try:
        saver = tf.train.import_meta_graph(meta_path)
        saver.restore(sess, tf.train.latest_checkpoint(cnn_path_dir))
        graph = tf.get_default_graph()
        text_input = graph.get_tensor_by_name('real_text_input:0')
        features = graph.get_tensor_by_name('g_conv/Squeeze:0')

        # extract features of texts in training set first
        print(
            "Extracting features of texts in training set with trained text cnn model..."
        )
        all_features = []
        # Only full batches are fed; a trailing partial batch is skipped.
        batch_num = train_size // batch_size
        for batch_no in range(batch_num):
            begin = batch_no * batch_size
            batch_texts = tokenized_train_texts[begin:begin + batch_size]
            batch_texts = batcher.batch_sentences(batch_texts)
            batch_texts = pad_and_cut(batch_texts, MAX_LEN)
            batch_features = sess.run(
                features, feed_dict={text_input: batch_texts})
            all_features.append(batch_features)
        train_features = np.vstack(all_features)
    finally:
        # Release the session even when restoring or extraction fails.
        sess.close()

    # Keep labels aligned with the featured samples: texts in the skipped
    # partial batch have no feature row, so their labels are dropped too.
    train_labels = train_labels[:len(train_features)]

    # train svm with training set features and labels
    print("Training svm with training set features and labels...")
    start = datetime.now()
    clf = svm.SVC(kernel='linear', C=1, class_weight='balanced')
    clf.fit(train_features, train_labels)

    # save svm model
    svm_dir = os.path.dirname(svm_path)
    if svm_dir:
        os.makedirs(svm_dir, exist_ok=True)
    joblib.dump(clf, svm_path)
    end = datetime.now()
    print("SVM training complete!")
    print("Train report: train time: %f" % (end - start).total_seconds())
class ELMoEmbeddings(object):
    """Build ELMo input ops and id batches for context/utterance data.

    Args:
        hparams: Object exposing ``word_vocab_path``, ``elmo_options_file``,
            ``elmo_weight_file`` and ``elmo_token_embedding_file``.

    The token embedding file is dumped on construction when it does not
    exist yet.
    """

    def __init__(self, hparams):
        self.hparams = hparams
        self.vocab_path = self.hparams.word_vocab_path
        self.elmo_options_file = self.hparams.elmo_options_file
        self.elmo_weight_file = self.hparams.elmo_weight_file
        self.token_embedding_file = self.hparams.elmo_token_embedding_file

        self.batcher = TokenBatcher(self.vocab_path)

        if not os.path.exists(self.token_embedding_file):
            print("making dump token embeddings")
            self._make_dump_token_embeddings()
            print("finished making dump_token_embeddings")

    def build_embeddings_op(self, context_ids_ph, utterances_ids_ph,
                            context_sentence_ids_ph):
        """Return the ``'input'`` weighted-layer ops for the three placeholders.

        The utterance and context-sentence ops are created inside a
        ``reuse=True`` variable scope so they share the weights created for
        the context op.
        """
        bilm = BidirectionalLanguageModel(
            self.elmo_options_file,
            self.elmo_weight_file,
            use_character_inputs=False,
            embedding_weight_file=self.token_embedding_file)

        context_emb_op = bilm(context_ids_ph)
        utterances_emb_op = bilm(utterances_ids_ph)
        context_sentence_emb_op = bilm(context_sentence_ids_ph)

        elmo_context_input = weight_layers(
            'input', context_emb_op, l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            elmo_utterances_input = weight_layers(
                'input', utterances_emb_op, l2_coef=0.0)
            elmo_context_sentence_input = weight_layers(
                'input', context_sentence_emb_op, l2_coef=0.0)

        return (elmo_context_input, elmo_utterances_input,
                elmo_context_sentence_input)

    def get_toknized_data(self, context_batch, utterances_batch,
                          context_sentence_batch):
        """Flatten the tokenized batches and map them to token ids.

        Args:
            context_batch: Per example, a sequence whose first item is the
                tokenized context.
            utterances_batch: Per example, a list of tokenized responses.
            context_sentence_batch: Per example, a list of tokenized context
                sentences.

        Returns:
            ``(context_ids, utterances_ids, context_sentence_ids)`` as arrays.
            The nested batches are flattened to one row per response and per
            sentence respectively.
        """
        context_list = [context[0] for context in context_batch]
        utterances_list = [response
                           for utterances in utterances_batch
                           for response in utterances]
        context_sentence_list = [sentence
                                 for context_sentences in context_sentence_batch
                                 for sentence in context_sentences]

        context_ids = self.batcher.batch_sentences(context_list)
        utterances_ids = self.batcher.batch_sentences(utterances_list)
        context_sentence_ids = self.batcher.batch_sentences(
            context_sentence_list)

        return (np.array(context_ids), np.array(utterances_ids),
                np.array(context_sentence_ids))

    def context_sentence_padding(self, elmo_context_sentence_inputs,
                                 tot_context_len):
        """Regroup flat sentence embeddings per context and pad the groups.

        Args:
            elmo_context_sentence_inputs: Sentence embeddings of all contexts
                stacked along axis 0, e.g. ``(39, max_sentence_len, dim)``.
            tot_context_len: Number of sentences in each context, in order,
                e.g. ``[17, 5, 3, 11, 3]``.

        Returns:
            A list with one entry per context. Contexts shorter than the
            longest one are zero-padded (float32) along axis 0 and returned
            as arrays; the others are returned as lists of per-sentence
            embeddings.
        """
        input_shape = np.shape(elmo_context_sentence_inputs)
        max_sentence_len = input_shape[1]
        # Take the embedding width from the data instead of assuming 256.
        trailing_dims = tuple(input_shape[2:])
        max_context_len = max(tot_context_len)

        # Cut the flat sequence into consecutive groups, one per context.
        batch_context_sentence = []
        each_context_sentence = []
        length_index = 0
        for sentence_emb in elmo_context_sentence_inputs:
            each_context_sentence.append(sentence_emb)
            if len(each_context_sentence) == tot_context_len[length_index]:
                length_index += 1
                batch_context_sentence.append(each_context_sentence)
                each_context_sentence = []

        pad_context_sentence = []
        for context_sentences in batch_context_sentence:
            missing = max_context_len - len(context_sentences)
            if missing > 0:
                padding_value = np.zeros(
                    (missing, max_sentence_len) + trailing_dims, np.float32)
                context_sentences = np.concatenate(
                    (context_sentences, padding_value), axis=0)
            pad_context_sentence.append(context_sentences)

        return pad_context_sentence

    def _make_dump_token_embeddings(self):
        """Dump the token embeddings of the vocabulary to the embedding file."""
        dump_token_embeddings(self.vocab_path, self.elmo_options_file,
                              self.elmo_weight_file,
                              self.token_embedding_file)

    def make_placeholders(self):
        """Return three int32 ``[None, None]`` placeholders.

        In order: context ids, utterance ids, context-sentence ids.
        """
        context_ids_ph = tf.placeholder(tf.int32, shape=[None, None])
        utterances_ids_ph = tf.placeholder(tf.int32, shape=[None, None])
        context_sentence_ids_ph = tf.placeholder(tf.int32, shape=[None, None])

        return context_ids_ph, utterances_ids_ph, context_sentence_ids_ph
# the reuse=True scope reuses weights from the context for the question elmo_question_input = weight_layers( 'input', question_embeddings_op, l2_coef=0.0 ) elmo_context_output = weight_layers( 'output', context_embeddings_op, l2_coef=0.0 ) with tf.variable_scope('', reuse=True): # the reuse=True scope reuses weights from the context for the question elmo_question_output = weight_layers( 'output', question_embeddings_op, l2_coef=0.0 ) with tf.Session() as sess: # It is necessary to initialize variables once before running inference. sess.run(tf.global_variables_initializer()) # Create batches of data. context_ids = batcher.batch_sentences(tokenized_context) question_ids = batcher.batch_sentences(tokenized_question) # Compute ELMo representations (here for the input only, for simplicity). elmo_context_input_, elmo_question_input_ = sess.run( [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']], feed_dict={context_token_ids: context_ids, question_token_ids: question_ids} )
# Files used by the token-input biLM.
vocab_file = './data/vocab.txt'
options_file = './try/options.json'
weight_file = './try/weights.hdf5'
token_embedding_file = './data/vocab_embedding.hdf5'

# Token batcher, id placeholder and biLM graph.
batcher = TokenBatcher(vocab_file)
context_token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(
    options_file,
    weight_file,
    use_character_inputs=False,
    embedding_weight_file=token_embedding_file,
)
context_embeddings_op = bilm(context_token_ids)

# Weighted-layer ops for the 'input' and 'output' positions.
elmo_context_input = weight_layers(
    'input', context_embeddings_op, l2_coef=0.0)
elmo_context_output = weight_layers(
    'output', context_embeddings_op, l2_coef=0.0)

with tf.Session() as sess:
    # Variables must be initialized once before running inference.
    sess.run(tf.global_variables_initializer())

    # Map the tokenized context to a padded id batch.
    context_ids = batcher.batch_sentences(tokenized_context)

    # Evaluate the 'input' representation only; keep the first fetched
    # result and, within it, the first sentence.
    _fetched = sess.run(
        [elmo_context_input['weighted_op']],
        feed_dict={context_token_ids: context_ids},
    )
    elmo_context_input_ = _fetched[0][0]
    print(elmo_context_input_.shape, elmo_context_input_)
question_embeddings_op, l2_coef=0.0) elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0) with tf.variable_scope('', reuse=True): # the reuse=True scope reuses weights from the context for the question elmo_question_output = weight_layers('output', question_embeddings_op, l2_coef=0.0) with tf.Session() as sess: # It is necessary to initialize variables once before running inference. sess.run(tf.global_variables_initializer()) # Create batches of data. context_ids = batcher.batch_sentences(tokenized_context) question_ids = batcher.batch_sentences(tokenized_question) # Compute ELMo representations (here for the input only, for simplicity). elmo_context_input_, elmo_question_input_ = sess.run( [ elmo_context_input['weighted_op'], elmo_question_input['weighted_op'] ], feed_dict={ context_token_ids: context_ids, question_token_ids: question_ids })
def deep_data_prepare(config):
    """Build every model input for the deep-learning models.

    Reads the train/test CSVs named by ``config`` plus the stop-word pickles
    and the sentiment word list under ``../data``, and produces id sequences
    (padded), RCNN left/right shifted sequences, HAN tensors, BERT inputs and
    ELMo token ids. The vocabularies come from the module-level ``char_stoi``
    and ``word_stoi`` mappings.

    Args:
        config: Settings object. Attributes read here: ``TRAIN_X``,
            ``TRAIN_JP``, ``TRAIN_EN``, ``TEST_X``, ``data_type``,
            ``n_classes``, ``HANN_SENT``, ``HANN_WORD_LEN``,
            ``HANN_CHAR_LEN``, ``CHAR_MAXLEN``, ``WORD_MAXLEN``,
            ``BERT_VOCAB_FILES``, ``elmo_word_vocab_file``,
            ``elmo_char_vocab_file``, ``elmo_qiuqiu_vocab_file``.

    Returns:
        ``(train, train_y, test)`` where ``train`` and ``test`` are dicts of
        model inputs keyed by name and ``train_y`` is the label array chosen
        by ``config.data_type``.
    """
    import re

    print('深度学习模型数据准备')
    train_df = pd.read_csv(config.TRAIN_X)
    train_jp = pd.read_csv(config.TRAIN_JP)
    train_en = pd.read_csv(config.TRAIN_EN)
    test_df = pd.read_csv(config.TEST_X)

    # NOTE: pickle must only be used on trusted project files.
    with open('../data/char_stopword.pkl', 'rb') as fin:
        char_sw_list = pickle.load(fin)
    with open('../data/word_stopword.pkl', 'rb') as fin:
        word_sw_list = pickle.load(fin)
    # The word-level and char-level sentiment inputs are both read from
    # this same file.
    with open('../data/sentiment_word.txt') as fin:
        sentiment_lines = list(fin)

    # Word-level and char-level text columns.
    train_char = pd.concat(
        (train_df['char'], train_jp['char'], train_en['char']))
    train_word = pd.concat(
        (train_df['word'], train_jp['word'], train_en['word']))
    test_char = test_df['char']
    test_word = test_df['word']

    # NOTE(review): labels come from train_df only, while train_word and
    # train_char also include the JP/EN frames - confirm callers expect it.
    if config.data_type == 0:
        train_y = train_df['sub_numerical'].values
        train_y = np_utils.to_categorical(train_y,
                                          num_classes=config.n_classes)
    elif config.data_type == 1:
        train_y = train_df['sentiment_value'].values
        train_y = np_utils.to_categorical(train_y,
                                          num_classes=config.n_classes)
    elif config.data_type == 2:
        train_y = np.array(train_df.iloc[:, 6:].values)
    elif config.data_type == 3:
        train_y = train_df.iloc[:, 6:].values
        targets = train_y.reshape(-1)
        one_hot_targets = np.eye(config.n_classes)[targets]
        train_y = one_hot_targets.reshape(-1, 10, config.n_classes)
    elif config.data_type == 4:
        train_y = (train_df['sentiment_value'] + 1).values
        train_y = np_utils.to_categorical(train_y,
                                          num_classes=config.n_classes)
    elif config.data_type == 5:
        train_y = train_df.iloc[:, 4:].values
    else:
        exit('错误数据类别')

    # Ids reserved just past each vocabulary: unknown token, then padding.
    unk_char = len(char_stoi)
    pad_char = len(char_stoi) + 1
    unk_word = len(word_stoi)
    pad_word = len(word_stoi) + 1

    char_stopwords = set(char_sw_list)
    word_stopwords = set(word_sw_list)

    # ---- HAN inputs: (rows, sentences, tokens) grids -------------------
    sentence_sep = re.compile(r" 。 | , ")

    def fill_hann(column, stoi, stopwords, max_tokens, pad_value):
        """Encode a text column as a padded sentence-by-token id grid."""
        grid = np.full(shape=(len(column), config.HANN_SENT, max_tokens),
                       fill_value=pad_value)
        for i, text in enumerate(column):
            sentences = sentence_sep.split(text)[:config.HANN_SENT]
            for j, sent in enumerate(sentences):
                k = 0
                for token in sent.split():
                    if k >= max_tokens:
                        break
                    # Stop words and out-of-vocabulary tokens are skipped
                    # without using up a slot.
                    if token not in stopwords and token in stoi:
                        grid[i, j, k] = stoi[token]
                        k += 1
        return grid

    def generate_hann_data(df):
        """Return the (word grid, char grid) pair for one data frame."""
        word_grid = fill_hann(df['word'], word_stoi, word_stopwords,
                              config.HANN_WORD_LEN, pad_word)
        char_grid = fill_hann(df['char'], char_stoi, char_stopwords,
                              config.HANN_CHAR_LEN, pad_char)
        return word_grid, char_grid

    hann_train_word, hann_train_char = generate_hann_data(train_df)
    hann_test_word, hann_test_char = generate_hann_data(test_df)

    # ---- plain id sequences ---------------------------------------------
    vocab_specs = {
        'char': (char_stoi, config.CHAR_MAXLEN, unk_char, char_stopwords),
        'word': (word_stoi, config.WORD_MAXLEN, unk_word, word_stopwords),
    }

    def word2id(train_dialogs, type='char'):
        """Map each text to at most max-len ids, skipping stop words."""
        if type not in vocab_specs:
            exit('类型错误')
        stoi, max_len, unk, stopwords = vocab_specs[type]
        train_x = []
        for dialog in tqdm(train_dialogs):
            # str.split() never yields empty or blank tokens, so only the
            # stop-word filter is needed.
            line = [stoi[token] if token in stoi else unk
                    for token in str(dialog).split()
                    if token not in stopwords]
            train_x.append(line[:max_len])
        return train_x

    # Data for the ordinary models.
    train_x_word = word2id(train_word, type='word')
    train_x_char = word2id(train_char, type='char')
    test_x_char = word2id(test_char, type='char')
    test_x_word = word2id(test_word, type='word')
    train_x_sent_word = word2id(sentiment_lines, type='word')
    train_x_sent_char = word2id(sentiment_lines, type='char')

    # ---- RCNN inputs: sequences shifted by one, filled with padding ----
    def left_context(seqs, fill):
        return [[fill] + seq[:-1] for seq in seqs]

    def right_context(seqs, fill):
        return [seq[1:] + [fill] for seq in seqs]

    def pad(seqs, maxlen, fill):
        return sequence.pad_sequences(seqs, maxlen=maxlen, dtype='int32',
                                      padding='post', truncating='post',
                                      value=fill)

    # The shifts use the unpadded lists, so compute them before padding.
    train_word_left = left_context(train_x_word, pad_word)
    train_word_right = right_context(train_x_word, pad_word)
    train_char_left = left_context(train_x_char, pad_char)
    train_char_right = right_context(train_x_char, pad_char)
    test_word_left = left_context(test_x_word, pad_word)
    test_word_right = right_context(test_x_word, pad_word)
    test_char_left = left_context(test_x_char, pad_char)
    test_char_right = right_context(test_x_char, pad_char)

    train_x_char = pad(train_x_char, config.CHAR_MAXLEN, pad_char)
    train_x_word = pad(train_x_word, config.WORD_MAXLEN, pad_word)
    train_x_char_left = pad(train_char_left, config.CHAR_MAXLEN, pad_char)
    train_x_word_left = pad(train_word_left, config.WORD_MAXLEN, pad_word)
    train_x_char_right = pad(train_char_right, config.CHAR_MAXLEN, pad_char)
    train_x_word_right = pad(train_word_right, config.WORD_MAXLEN, pad_word)

    test_x_char = pad(test_x_char, config.CHAR_MAXLEN, pad_char)
    test_x_word = pad(test_x_word, config.WORD_MAXLEN, pad_word)
    test_x_char_left = pad(test_char_left, config.CHAR_MAXLEN, pad_char)
    test_x_word_left = pad(test_word_left, config.WORD_MAXLEN, pad_word)
    test_x_char_right = pad(test_char_right, config.CHAR_MAXLEN, pad_char)
    test_x_word_right = pad(test_word_right, config.WORD_MAXLEN, pad_word)

    print('train_x char shape is: ', train_x_char.shape)
    print('train_x word shape is: ', train_x_word.shape)
    print('test_x char shape is: ', test_x_char.shape)
    print('test_x word shape is: ', test_x_word.shape)

    train = {}
    test = {}

    # ---- BERT inputs ------------------------------------------------------
    tokenizer = tokenization.FullTokenizer(vocab_file=config.BERT_VOCAB_FILES,
                                           do_lower_case=False)
    bert_len = 190  # fixed length including [CLS] and [SEP]

    def get_bert_data(corpus):
        """Return (token ids, mask, segment ids) int32 arrays for ``corpus``."""
        input_ids = []
        input_mask = []
        input_segment_ids = []
        # Iterate the corpus that was passed in, so train and test each get
        # their own encodings.
        for sent in corpus:
            sent = ''.join(sent.strip().split())
            pieces = (['[CLS]'] + tokenizer.tokenize(sent)[:bert_len - 2]
                      + ['[SEP]'])
            token_ids = tokenizer.convert_tokens_to_ids(pieces)
            real_len = len(token_ids)
            n_pad = max(0, bert_len - real_len)
            input_ids.append(token_ids + [0] * n_pad)
            input_mask.append([1] * real_len + [0] * n_pad)
            input_segment_ids.append([0] * (real_len + n_pad))
        return (np.array(input_ids, dtype='int32'),
                np.array(input_mask, dtype='int32'),
                np.array(input_segment_ids, dtype='int32'))

    train['token_id'], train['mask_id'], train['type_id'] = get_bert_data(
        train_df['word'].values)
    test['token_id'], test['mask_id'], test['type_id'] = get_bert_data(
        test_df['word'].values)

    train['word'] = train_x_word
    train['char'] = train_x_char
    train['word_sent'] = train_x_sent_word
    train['char_sent'] = train_x_sent_char
    # rcnn
    train['word_left'] = train_x_word_left
    train['word_right'] = train_x_word_right
    train['char_left'] = train_x_char_left
    train['char_right'] = train_x_char_right
    # han
    train['hann_word'] = hann_train_word
    train['hann_char'] = hann_train_char

    test['word'] = test_x_word
    test['char'] = test_x_char
    test['word_left'] = test_x_word_left
    test['word_right'] = test_x_word_right
    test['char_left'] = test_x_char_left
    test['char_right'] = test_x_char_right
    test['hann_word'] = hann_test_word
    test['hann_char'] = hann_test_char

    # Sanity checks: shifted and plain sequences must line up.
    assert train['word_left'].shape == train['word_right'].shape == train[
        'word'].shape
    assert train['char_left'].shape == train['char_right'].shape == train[
        'char'].shape
    assert test['word_left'].shape == test['word_right'].shape == test[
        'word'].shape
    assert test['char_left'].shape == test['char_right'].shape == test[
        'char'].shape

    # ---- ELMo token ids ---------------------------------------------------
    def add_elmo_ids(key, vocab_path, column, max_len):
        """Store ELMo token ids of ``column`` for train and test under ``key``."""
        elmo_batcher = TokenBatcher(vocab_path)
        for target, frame in ((train, train_df), (test, test_df)):
            target[key] = elmo_batcher.batch_sentences(
                [str(text).split()[:max_len] for text in frame[column]])

    add_elmo_ids('elmo_word', config.elmo_word_vocab_file, 'word',
                 config.WORD_MAXLEN)
    add_elmo_ids('elmo_char', config.elmo_char_vocab_file, 'char',
                 config.CHAR_MAXLEN)
    add_elmo_ids('elmo_qiuqiu', config.elmo_qiuqiu_vocab_file, 'word',
                 config.WORD_MAXLEN)

    return train, train_y, test
def train_cnn(first_use=True):
    """Train the ELMo-backed TextCNN classifier, checkpointing after every epoch.

    Paths and hyper-parameters are read from module-level names defined
    elsewhere in this file (``train_label_path``, ``train_data_path``,
    ``train_vocab_file``, ``cnn_path``, ``batch_size``, ``epoch_num``,
    ``MAX_LEN``, ``learning_rate``, ``beta1`` and the bilm file names).

    Args:
        first_use: When true, build a fresh graph and start at epoch 1.
            When false, restore the latest checkpoint stored in the directory
            of ``cnn_path`` and continue the epoch numbering from it.

    Raises:
        IOError: ``first_use`` is false and no checkpoint can be found.
    """
    print("Loading texts and labels...")
    # load training set
    train_labels, train_text_ids = \
        load_labels(train_label_path, label_index, num_classes, one_hot=True)
    train_texts = load_texts(train_data_path, train_text_ids)
    train_size = len(train_texts)
    tokenized_train_texts = tokenize(train_texts)

    # Create a TokenBatcher to map text to token ids
    batcher = TokenBatcher(train_vocab_file)

    ckpt_dir = os.path.dirname(cnn_path)
    tf.reset_default_graph()
    sess = None
    try:
        if first_use:
            # build TextCNN model
            model_options = {
                'text_length': MAX_LEN,
                'emb_dim': emb_dim,
                'batch_size': batch_size,
                'num_classes': num_classes,
                'bilm_options_file': bilm_options_file,
                'bilm_weight_file': bilm_weight_file,
                'token_embedding_file': token_embedding_file,
                'l2_bilm_lambda': l2_bilm_lambda,
                'l2_cnn_lambda': l2_cnn_lambda
            }
            print("Building TextCNN model")
            cnn = tc.TextCNN_with_elmo(model_options)
            # The third value returned by build_model() is not needed here.
            input_tensors, feedback, _ = cnn.build_model()
            optim = tf.train.AdamOptimizer(learning_rate, beta1)\
                .minimize(feedback['loss'], name='optim')
            saver = tf.train.Saver(max_to_keep=1)
            sess = tf.InteractiveSession()
            sess.run(tf.global_variables_initializer())
            epoch_done = 0
        else:
            # Ask TensorFlow which checkpoint is the latest instead of relying
            # on the (arbitrary) order of os.listdir().
            latest_ckpt = tf.train.latest_checkpoint(ckpt_dir)
            if latest_ckpt is None:
                raise IOError("No checkpoint found in %r" % ckpt_dir)
            # saver.save(..., global_step=epoch) names checkpoints
            # "<prefix>-<epoch>", so the epoch is the text after the last '-'.
            epoch_done = int(latest_ckpt.rsplit('-', 1)[-1])
            sess = tf.InteractiveSession()
            restorer = tf.train.import_meta_graph(latest_ckpt + '.meta')
            restorer.restore(sess, latest_ckpt)
            graph = tf.get_default_graph()
            input_tensors = {
                't_real_text': graph.get_tensor_by_name('real_text_input:0'),
                'input_y': graph.get_tensor_by_name('input_y:0')
            }
            feedback = {
                'loss': graph.get_tensor_by_name('loss/add:0'),
                'accuracy': graph.get_tensor_by_name('accuracy/accuracy:0')
            }
            optim = graph.get_operation_by_name('optim')
            # Reset max_to_keep
            saver = tf.train.Saver(max_to_keep=1)

        # Make sure the checkpoint directory exists (nested paths included).
        if ckpt_dir and not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)

        batch_num = train_size // batch_size
        # Report progress roughly every 10%; never let the step reach zero,
        # which used to raise ZeroDivisionError with fewer than 10 batches.
        progress_step = max(1, batch_num // 10)

        # train
        for epoch in range(epoch_done + 1, epoch_done + epoch_num + 1):
            print("<Epoch no. %d>" % epoch)

            # train text cnn model
            print("Training text cnn model...")
            start = datetime.now()
            sum_accuracy = 0.0
            total_loss = 0.0
            for batch_no in range(batch_num):
                if batch_no % progress_step == 0:
                    print("%d%%" % (100 * batch_no // batch_num))
                lo = batch_no * batch_size
                hi = lo + batch_size
                batch_texts = batcher.batch_sentences(
                    tokenized_train_texts[lo:hi])
                batch_texts = pad_and_cut(batch_texts, MAX_LEN)
                batch_labels = train_labels[lo:hi]
                _, loss, accuracy = \
                    sess.run([optim, feedback['loss'], feedback['accuracy']],
                             feed_dict={input_tensors['t_real_text']: batch_texts,
                                        input_tensors['input_y']: batch_labels})
                sum_accuracy += accuracy
                total_loss += loss
            # With fewer samples than one batch there is nothing to average.
            avg_accuracy = sum_accuracy / batch_num if batch_num else 0.0

            # save the cnn model
            saver.save(sess, cnn_path, global_step=epoch)
            end = datetime.now()
            print("TextCNN training complete!")
            # %f keeps the fractional part; %d printed every accuracy as 0.
            print("Train report: loss: %f, accuracy: %f, train time: %f" %
                  (total_loss, avg_accuracy, (end - start).total_seconds()))
    finally:
        # Release the session even when building, restoring or training fails.
        if sess is not None:
            sess.close()
# Create a TokenBatcher to map text to token ids. batcher = TokenBatcher(vocab_file) # REQUIRED # Build the Elmo with biLM and weight layers. elmo = Elmo( options_file, weight_file, token_embedding_file=token_embedding_file, # REQUIRED token_batcher=batcher, # REQUIRED num_output_representations=1, requires_grad=False, do_layer_norm=False, dropout=0.) # Create batches of data. context_token_ids = batcher.batch_sentences(tokenized_context, add_bos_eos=False) question_token_ids = batcher.batch_sentences(tokenized_question, add_bos_eos=False) # numpy.ndarray or cupy.ndarray # with shape (batchsize, max_length) if gpu >= 0: # transfer the model to the gpu chainer.cuda.get_device_from_id(gpu).use() elmo.to_gpu() # transfer input data to the gpu context_token_ids = elmo.xp.asarray(context_token_ids) question_token_ids = elmo.xp.asarray(question_token_ids) # Compute elmo outputs, # i.e. weighted sum of multi-layer biLM's outputs.
import bilm
from bilm import TokenBatcher
import model.config as config
import preprocessing.util as util

# Script: write one line per entity to ent2toks.txt in the form
# "<entity>\t<space-separated token ids of its summary>".
if __name__ == '__main__':
    vocab_path = config.base_folder + "data/vocabulary/" + "wiki_vocab.txt"
    entity_batcher = TokenBatcher(vocab_path)

    # NOTE: `src` is opened (so the file must exist) but is never read here;
    # the summaries come from util.load_entity_summary_map().
    with open(
            '/Users/asntr/Projects/university/course_work/end2end_neural_el/data/entities/ent2toks.txt',
            'w+'
    ) as dst, open(
            '/Users/asntr/Projects/university/course_work/end2end_neural_el/data/entities/summary.txt_prep',
            'r') as src:
        entity2summary = util.load_entity_summary_map()
        for entity, summary in entity2summary.items():
            # Batch a single "sentence" and take its only row of token ids.
            id_row = entity_batcher.batch_sentences([summary]).tolist()[0]
            id_text = ' '.join(map(str, id_row))
            dst.write(entity + '\t' + id_text + '\n')
# Let this process use at most 80% of the GPU memory.
tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.8
tfconfig.gpu_options.allow_growth = True  # allocate GPU memory on demand
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # hide TensorFlow warning messages
config = Config()

# Script: compute ELMo input embeddings for the first ten records of
# ../data/test2.json and print them.
if __name__ == '__main__':
    # Close the JSON file as soon as it has been parsed.
    with open('../data/test2.json', encoding='utf-8') as fin:
        datas = json.load(fin)
    # Tokenize on whitespace and drop the first token of each record.
    ndatas = [line.split()[1:] for line in datas[:10]]

    # Batcher that maps tokenized sentences to batches of token ids.
    batcher = TokenBatcher(config.vocab_file)
    inputData = tf.placeholder('int32', shape=(None, None))
    abilm = BidirectionalLanguageModel(
        config.option_file,
        config.weight_file,
        use_character_inputs=False,
        embedding_weight_file=config.tokenEmbeddingFile)
    inputEmbeddingsOp = abilm(inputData)
    elmoInput = weight_layers('input', inputEmbeddingsOp, l2_coef=0.0)

    # Pass tfconfig so the GPU options set above actually take effect, and use
    # the session as a context manager so it is closed even if a run fails.
    # NOTE(review): assumes tfconfig is the tf.ConfigProto created earlier in
    # this file - confirm.
    with tf.Session(config=tfconfig) as sess:
        sess.run(tf.global_variables_initializer())
        # Convert the tokenized sentences to a batch of token ids.
        inputids = batcher.batch_sentences(ndatas)
        inputvec = sess.run(elmoInput['weighted_op'],
                            feed_dict={inputData: inputids})
        print(inputvec)
class ELMo_Utils(object): """ Impements Elmo functions used by downstream task Each tokenized sentence is a list of str, with a batch of sentences a list of tokenized sentences (List[List[str]]). The Batcher packs these into a shape (n_sentences, max_sentence_length + 2, 50) numpy array of character ids, padding on the right with 0 ids for sentences less then the maximum length. The first and last tokens for each sentence are special begin and end of sentence ids added by the Batcher. The input character id placeholder can be dimensioned (None, None, 50), with both the batch dimension (axis=0) and time dimension (axis=1) determined for each batch, up the the maximum batch size specified in the BidirectionalLanguageModel constructor. After running inference with the batch, the return biLM embeddings are a numpy array with shape (n_sentences, 3, max_sentence_length, 1024), after removing the special begin/end tokens. """ START_TOKEN = '<S>' END_TOKEN = '</S>' UNK_TOKEN = '<UNK>' PAD_SNT = '<S></S>' PAD_SNT_ID = 0 def __init__(self, elmo_vocab_file, elmo_weight_file, elmo_option_file, use_character_elmo, use_concat_p, question_window, utterance_cache_file='', passage_cache_file='', question_cache_file=''): self.logger = logging.getLogger("dial") self.utterance_cache = None self.passage_cache = None self.question_cache = None self.need_q_cache = (question_window > 1) self.need_p_cache = use_concat_p if os.path.exists(elmo_weight_file) and os.path.exists( elmo_option_file) and os.path.exists(elmo_vocab_file): # the vocab file exported from the corpus self.elmo_vocab_file = elmo_vocab_file # elmo weight file self.elmo_weight_file = elmo_weight_file # elmo option file self.elmo_option_file = elmo_option_file self.utterance_cache_file = utterance_cache_file self.passage_cache_file = passage_cache_file self.question_cache_file = question_cache_file self.use_character_elmo = use_character_elmo with open(self.elmo_option_file, 'r') as fin: options = json.load(fin) 
self.output_layers = options['lstm']['n_layers'] + 1 self.output_dim = 2 * options['lstm']['projection_dim'] self.logger.info("output_layers :{}, output_dim :{}".format( self.output_layers, self.output_dim)) # by default, the bilm use the character_elmo if self.use_character_elmo: # max_num_char for characters for a token. self.elmo_max_num_char = options['char_cnn'][ 'max_characters_per_token'] # line 207 https://github.com/allenai/bilm-tf/blob/ebf52c6ec1012a3672247c2d14ff7bcad7fb812b/bilm/data.py # the mask for char id is 0 self.PAD_TOKEN_CHAR_IDS = np.zeros((self.elmo_max_num_char), dtype=np.int32).tolist() # use subword character first, which shows extra improvements beside the contextual information. self.elmo_char_batcher = Batcher(self.elmo_vocab_file, self.elmo_max_num_char) # language mode with use_character_inputs = True self.elmo_bilm = BidirectionalLanguageModel( self.elmo_option_file, self.elmo_weight_file) else: # use token batcher self.elmo_token_batcher = TokenBatcher(self.elmo_vocab_file) # use elmo_bilm with use_character_inputs = False self.elmo_bilm = BidirectionalLanguageModel( self.elmo_option_file, self.elmo_weight_file) self.chk_load_utterance_cache() self.chk_load_passage_cache() self.chk_load_question_cache() else: self.logger.warn( "elmo_weight_file = {}, elmo_option_file={}, elmo_vocab_file={}" .format(elmo_weight_file, elmo_option_file, elmo_vocab_file)) def chk_load_utterance_cache(self): if self.utterance_cache_file and os.path.exists( self.utterance_cache_file): self.utterance_cache = h5py.File(self.utterance_cache_file, 'r') #self.utterance_cache_in_mem = {} #self.utterance_cache_in_mem['lm_embeddings'] = self.load_h5(self.utterance_cache['lm_embeddings']) #self.utterance_cache_in_mem['lengths'] = self.load_h5_lengths(self.utterance_cache['lengths']) #self.utterance_cache_in_mem['mask'] = self.load_h5(self.utterance_cache['mask']) self.logger.info( "Utterance cache loaded from {}, size = {}".format( self.utterance_cache_file, 
len(self.utterance_cache['lm_embeddings'].keys()))) else: self.utterance_cache = None def load_h5(self, h5group): x = [] for index in range(len(h5group.keys())): # https://stackoverflow.com/questions/10274476/how-to-export-hdf5-file-to-numpy-using-h5py x.append(h5group['{}'.format(index)][...].tolist()) return x def load_h5_lengths(self, h5group): x = [] for index in range(len(h5group.keys())): x.extend(h5group['{}'.format(index)][...].tolist()) return x def chk_load_passage_cache(self): if self.need_p_cache: if self.passage_cache_file and os.path.exists( self.passage_cache_file): self.passage_cache = h5py.File(self.passage_cache_file, 'r') self.logger.info("Passage cache loaded from {}".format( self.passage_cache_file)) else: self.passage_cache = None self.logger.info( "Passage cache needed from {}, it will build soon.".format( self.passage_cache_file)) else: self.passage_cache = None self.logger.info("Passage cache not needed") def chk_load_question_cache(self): if self.need_q_cache: if self.question_cache_file and os.path.exists( self.question_cache_file): self.question_cache = h5py.File(self.question_cache_file, 'r') self.logger.info("Question cache loaded from {}".format( self.question_cache_file)) else: self.question_cache = None self.logger.info( "Question cache needed from {}, it will build soon.". 
format(self.question_cache_file)) else: self.question_cache = None self.logger.info("Question cache not needed") def need_build_passage_cache(self): return self.need_p_cache and self.passage_cache_file != '' and self.passage_cache == None def need_build_question_cache(self): return self.need_q_cache and self.question_cache_file != '' and self.question_cache == None def cleanup(self): if self.utterance_cache: self.utterance_cache.close() if self.passage_cache: self.passage_cache.close() if self.question_cache: self.question_cache.close() self.logger.info("Clean up elmo cahce") def get_elmo_char_ids(self, sentences): ''' Given a nested list of tokens(with start and end token), return the character ids Arguments: sentences: List[List[str]] Return: [sentence_num, token_num, max_char_num] ''' return self.elmo_char_batcher.batch_sentences(sentences).tolist() def get_elmo_token_ids(self, sentences): ''' Given a nested list of tokens(without start and end token), return the token ids Arguments: sentemces : List[List[str]] Return : [sentence_num, token_num, max_char_num] ''' return self.elmo_token_batcher.batch_sentences(sentences).tolist() def get_elmo_emb_op(self, input_ids_place_holder): ''' Given the input ids place holder, reutrn a ops for computing the language model { 'lm_embeddings': embedding_op, (None, 3, None, 1024) 'lengths': sequence_lengths_op, (None, ) 'mask': op to compute mask (None, None) } ''' return self.elmo_bilm(input_ids_place_holder) def weight_layers(self, name, bilm_ops, l2_coef=None, use_top_only=False, do_layer_norm=False): ''' Weight the layers of a biLM with trainable scalar weights to compute ELMo representations. 
See more details on https://github.com/allenai/bilm-tf/blob/81a4b54937f4dfb93308f709c1cf34dbb37c553e/bilm/elmo.py { 'weighted_op': op to compute weighted average for output, 'regularization_op': op to compute regularization term } ''' return weight_layers(name, bilm_ops, l2_coef, use_top_only, do_layer_norm) @staticmethod def prepare_elmo_vocab_file(vocab, elmo_vocab_file): sorted_word = sorted(vocab.token_cnt, key=vocab.token_cnt.get, reverse=True) with open(elmo_vocab_file, 'w') as f: f.write('{}\n'.format(ELMo_Utils.START_TOKEN)) f.write('{}\n'.format(ELMo_Utils.END_TOKEN)) f.write('{}\n'.format(ELMo_Utils.UNK_TOKEN)) for item in sorted_word: f.write('%s\n' % item) def build_elmo_char_cache(self, snt_dict_file, max_snt_length, output_cache_file): """ Go through all the snts in the dataset, save into the cache """ self.logger.info( 'Prepare ELMo character embeddings for {} with ELMo_Utils ...'. format(snt_dict_file)) ids_placeholder = tf.placeholder('int32', shape=(None, max_snt_length, self.elmo_max_num_char)) ops = self.elmo_bilm(ids_placeholder) config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) with open(snt_dict_file, 'r') as fin, h5py.File(output_cache_file, 'w') as fout: lm_embeddings_h5 = fout.create_group('lm_embeddings') lengths_h5 = fout.create_group('lengths') mask_h5 = fout.create_group('mask') batch_snts = [] start_snt_id_in_batch = 0 SNT_BATCH_SIZE = 10 for line in tqdm(fin, total=get_num_lines(snt_dict_file)): sentence = line.strip().split() batch_snts.append(sentence) length = len(batch_snts) if length >= SNT_BATCH_SIZE: start_snt_id_in_batch += self.consume_batch_snts( sess, ids_placeholder, ops, batch_snts, max_snt_length, start_snt_id_in_batch, lm_embeddings_h5, lengths_h5, mask_h5) batch_snts = [] if len(batch_snts) > 0: start_snt_id_in_batch += self.consume_batch_snts( sess, ids_placeholder, ops, batch_snts, max_snt_length, start_snt_id_in_batch, 
lm_embeddings_h5, lengths_h5, mask_h5) batch_snts = [] self.logger.info( "Finished ELMo embeddings for {} senencesm in {}".format( start_snt_id_in_batch, output_cache_file)) def consume_batch_snts(self, sess, ids_placeholder, ops, batch_snts, max_snt_length, start_snt_id_in_batch, lm_embeddings_h5, lengths_h5, mask_h5): char_ids = self.get_elmo_char_ids(batch_snts) char_ids = [(ids + [self.PAD_TOKEN_CHAR_IDS] * (max_snt_length - len(ids)))[:max_snt_length] for ids in char_ids] elmo_ops = sess.run(ops, feed_dict={ids_placeholder: char_ids}) batch_size = len(batch_snts) for i in range(batch_size): sentence_id = start_snt_id_in_batch + i # self.logger.info("create lm for snt {}".format(sentence_id)) lm_embeddings_h5.create_dataset( '{}'.format(sentence_id), elmo_ops['lm_embeddings'].shape[1:], dtype='float32', data=elmo_ops['lm_embeddings'][i, :, :, :], compression="gzip") lengths_h5.create_dataset('{}'.format(sentence_id), (1, ), dtype='int32', data=elmo_ops['lengths'][i]) mask_h5.create_dataset('{}'.format(sentence_id), elmo_ops['mask'].shape[1:], dtype='int32', data=elmo_ops['mask'][i], compression="gzip") return batch_size # TODO for token level embedding. def build_elmo_token_cache(self, snt_dict_file, max_snt_length, output_cache_file): pass def build_elmo_cache(self, snt_dict_file, max_snt_length, output_cache_file): if self.use_character_elmo: self.build_elmo_char_cache(snt_dict_file, max_snt_length, output_cache_file) else: self.build_elmo_token_cache(snt_dict_file, max_snt_length, output_cache_file) self.logger.info( 'Finished ELMo embeddings for utterance cache with ELMo_Utils') def build_elmo_cache_for_samples(self, dataset, max_p_len, max_q_len): if (not self.need_p_cache) and (not self.need_q_cache): self.logger.info( 'No need for ELMo embeddings for concated passage and question with ELMo_Utils' ) else: # build graph for getting forward elmo embedding. 
self.logger.info('Build ELMo embeddings for p = {}, q = {}'.format( self.need_p_cache, self.need_q_cache)) self.build_pq_elmo_graph() if self.need_p_cache: p_out = h5py.File(self.passage_cache_file, 'w') p_lm_embeddings_h5 = p_out.create_group('lm_embeddings') p_lengths_h5 = p_out.create_group('lengths') p_mask_h5 = p_out.create_group('mask') if self.need_q_cache: q_out = h5py.File(self.question_cache_file, 'w') q_lm_embeddings_h5 = q_out.create_group('lm_embeddings') q_lengths_h5 = q_out.create_group('lengths') q_mask_h5 = q_out.create_group('mask') config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) for set_name in ['train', 'dev', 'test']: for batch_data in tqdm( dataset.gen_mini_batches(set_name, 20, shuffle=False)): samples = batch_data['raw_data'] # batch_data is filled with elmo feed_dict self.run_pq_ops(sess, batch_data, max_p_len, max_q_len) for i in range(len(samples)): e_id = '{}'.format(samples[i]['example-id']) try: if self.need_p_cache: p_lm_embeddings_h5.create_dataset( e_id, p_ops['lm_embeddings'].shape[1:], dtype='float32', data=p_ops['lm_embeddings'][ i, :, :, :], compression="gzip") p_lengths_h5.create_dataset( e_id, (1, ), dtype='int32', data=p_ops['lengths'][i]) p_mask_h5.create_dataset( e_id, p_ops['mask'].shape[1:], dtype='int32', data=p_ops['mask'][i, :], compression="gzip") if self.need_q_cache: q_lm_embeddings_h5.create_dataset( e_id, q_ops['lm_embeddings'].shape[1:], dtype='float32', data=q_ops['lm_embeddings'][ i, :, :, :], compression="gzip") q_lengths_h5.create_dataset( e_id, (1, ), dtype='int32', data=q_ops['lengths'][i], ) q_mask_h5.create_dataset( e_id, q_ops['mask'].shape[1:], dtype='int32', data=q_ops['mask'][i, :], compression="gzip") except: continue self.logger.info( 'Finished ELMo embeddings for concated passage and question with ELMo_Utils' ) def run_pq_ops(self, sess, batch_data, max_p_len, max_q_len): self._static_pq_padding(batch_data, 
max_p_len, max_q_len) if self.need_p_cache and self.need_q_cache: self.p_ops, self.q_ops = sess.run( [self.p_emb_elmo_op, self.q_emb_elmo_op], feed_dict={ self.elmo_p: batch_data['elmo_passage_char_ids'], self.elmo_q: batch_data['elmo_question_char_ids'] }) elif self.need_p_cache: self.p_ops = sess.run( [self.p_emb_elmo_op], feed_dict={self.elmo_p: batch_data['elmo_passage_char_ids']}) else: self.q_ops = sess.run([self.q_emb_elmo_op], feed_dict={ self.elmo_q: batch_data['elmo_question_char_ids'], }) def build_pq_elmo_graph(self): """ Given the batch_data, this will seperately run tensorflow get the elmo embedding for each batch, which will be cached into file Especially , for sample level cache, please make sure that the first dimension for any tensor is batch_size """ start_t = time.time() self.logger.info( "Start building elmo graph for concatenated p and q ...") self.add_elmo_placeholders() with tf.device('/device:GPU:0'): with tf.variable_scope("", reuse=tf.AUTO_REUSE): # get all elmo op with language mode # lm_embeddings : [batch_size, layers, max_length, hidden_dims * 2] # lengths : [batch_size] # mask : [batch_size, length] if self.need_p_cache: self.p_emb_elmo_op = self.elmo_bilm(self.elmo_p) if self.need_q_cache: # [batch_size, context_window, layers, max_u_length, hidden_dims * 2] self.q_emb_elmo_op = self.elmo_bilm(self.elmo_q) def add_elmo_placeholders(self): """ elmo for business, logic corresponding the specific application """ # for ELMo with character embedding # elmo passage character ids for each token in each concatenated passage # [batch_size, passage_length, char_length] if self.need_p_cache: self.elmo_p = tf.placeholder(tf.int32, [None, None, self.elmo_max_num_char], 'elmo_p') # elmo character ids for whole concatenated qustion # [batch_size, question_length, char_length] self.elmo_q = tf.placeholder(tf.int32, [None, None, self.elmo_max_num_char], 'elmo_q') def _static_pq_padding(self, batch_data, max_p_len, max_q_len): """ This is used for 
static padding, which is useful when the deep contextual embedding is saved with a mask of the whole static length. """ # also padding elmo matrix # in elmo, the character ids after batch_sentences contains the start and end token, length for charids +2 while the final embedding not contains those special token. # For further compatibility, we still leave elmo length as different length. pad_q_len_elmo = 2 + max_q_len padding(batch_data, 'elmo_question_char_ids', pad_q_len_elmo, self.PAD_TOKEN_CHAR_IDS) if self.need_p_cache: pad_p_len_elmo = 2 + max_p_len padding(batch_data, 'elmo_passage_char_ids', pad_p_len_elmo, self.PAD_TOKEN_CHAR_IDS) def _prepare_passage_elmo_feed_dict(self, sample, batch_data, context_window, token_key_to_use): """ add elmo feed_dict for passage """ e_id_str = '{}'.format(sample['example-id']) passage_utterance_tokens_elmo = [] passage_utterance_length_elmo = [] passage_tokens_elmo = [ELMo_Utils.START_TOKEN] passage_snt_ids = [] pruned_context_utterances_elmo = sample['messages-so-far'][ -context_window:] for i in range(context_window): if i >= len(pruned_context_utterances_elmo): current_utterance_tokens_elmo = [ ELMo_Utils.START_TOKEN, ELMo_Utils.END_TOKEN ] passage_snt_ids.append(ELMo_Utils.PAD_SNT_ID) passage_utterance_tokens_elmo.append( current_utterance_tokens_elmo) passage_utterance_length_elmo.append( len(current_utterance_tokens_elmo)) else: utterance = pruned_context_utterances_elmo[i] if 'snt_id' in utterance: passage_snt_ids.append(utterance['snt_id']) # split version of passages current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN] current_utterance_tokens_elmo.extend( utterance[token_key_to_use]) current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN]) passage_utterance_tokens_elmo.append( current_utterance_tokens_elmo) passage_utterance_length_elmo.append( len(current_utterance_tokens_elmo)) # concatenated version of passages # append passages utterance tokens passage_tokens_elmo.extend(utterance[token_key_to_use]) 
passage_tokens_elmo.extend([ELMo_Utils.END_TOKEN]) if self.need_build_passage_cache(): # add into batch_data, no other batch data will data # [batch_size, passage_length, max_char_num] batch_data['elmo_passage_char_ids'].append( self.get_elmo_char_ids([passage_tokens_elmo])[0]) else: #TODO add passage and question elmo retrieve here. if self.need_p_cache: self.assemble_elmo_batch_data('p', batch_data, e_id_str, self.passage_cache) for snt_id in passage_snt_ids: # self.assemble_elmo_with_snt_ids('pu', batch_data, snt_id) # self.assemble_elmo_batch_data_with_mem('pu', batch_data, snt_id, self.utterance_cache_in_mem) self.assemble_elmo_batch_data('pu', batch_data, snt_id, self.utterance_cache) def _prepare_question_elmo_feed_dict(self, sample, batch_data, question_window, token_key_to_use): """ add question elmo feed_dict according the same style for adding regular question feed_dict """ e_id_str = '{}'.format(sample['example-id']) # for each utterance in question question_utterance_tokens_elmo = [] # for the concatenated question # for question utterance length question_utterance_length_elmo = [] question_snt_ids = [] # add start token, which is also in the vocabulary # in non-elmo, embedding, we wil add self.vocab.sos and self.vocab.eos in to the sentence,whic will be encoded by the downstream lstm. However, sos and eos are in capital case in the elmo. In fact, we must use Upper case here to get a emebdding from elmo abou it. 
question_tokens_elmo = [ELMo_Utils.START_TOKEN] pruned_question_utterance_elmo = sample['messages-so-far'][ -question_window:] for i in range(question_window): if i >= len(pruned_question_utterance_elmo): current_utterance_tokens_elmo = [ ELMo_Utils.START_TOKEN, ELMo_Utils.END_TOKEN ] question_snt_ids.append(ELMo_Utils.PAD_SNT_ID) question_utterance_tokens_elmo.append( current_utterance_tokens_elmo) question_utterance_length_elmo.append( len(current_utterance_tokens_elmo)) else: utterance = pruned_question_utterance_elmo[i] # split version of question if 'snt_id' in utterance: question_snt_ids.append(utterance['snt_id']) current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN] current_utterance_tokens_elmo.extend( utterance[token_key_to_use]) current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN]) # add each utterance token_ids into a parental list question_utterance_tokens_elmo.append( current_utterance_tokens_elmo) question_utterance_length_elmo.append( len(current_utterance_tokens_elmo)) # concatenated version of question # append question utterance tokens question_tokens_elmo.extend(utterance[token_key_to_use]) question_tokens_elmo.extend([ELMo_Utils.END_TOKEN]) if question_window == 0: # if note use question, here it will make mistake, # bug here. 
make question at least = 1 pass else: # add elmo question tokenids into batch_data if self.need_build_question_cache(): # add into batch_data # [batch_size, question_length, max_char_num] batch_data['elmo_question_char_ids'].append( self.get_elmo_char_ids([question_tokens_elmo])[0]) else: # if question_window = 1, then juse use utterance cache if question_window == 1: # self.assemble_elmo_with_snt_ids('q', batch_data, question_snt_ids[0]) # self.assemble_elmo_batch_data_with_mem('q', batch_data, question_snt_ids[0], self.utterance_cache_in_mem) self.assemble_elmo_batch_data('q', batch_data, question_snt_ids[0], self.utterance_cache) else: self.assemble_elmo_batch_data('q', batch_data, e_id_str, self.question_cache) def _prepare_response_elmo_feed_dict(self, sample, batch_data, token_key_to_use): """ add question elmo feed_dict according the same style for adding regular question feed_dict """ if 'options-for-correct-answers': e_id_str = '{}'.format(sample['example-id']) utterance = sample['options-for-correct-answers'][0] # split version of question current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN] current_utterance_tokens_elmo.extend(utterance[token_key_to_use]) current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN]) if 'snt_id' in utterance: snt_id = utterance['snt_id'] self.assemble_elmo_batch_data('r', batch_data, snt_id, self.utterance_cache) def init_elmo_batch_data_sntids(self, batch_data): if self.need_p_cache: # use elmo cache to retrieve batch_data batch_data['elmo_p_lm_embeddings'] = [] batch_data['elmo_p_lengths'] = [] batch_data['elmo_p_mask'] = [] batch_data['elmo_pu_snt_ids'] = [] batch_data['elmo_q_snt_ids'] = [] batch_data['elmo_r_snt_ids'] = [] def init_elmo_batch_data_emb(self, batch_data): if self.need_p_cache: # use elmo cache to retrieve batch_data batch_data['elmo_p_lm_embeddings'] = [] batch_data['elmo_p_lengths'] = [] batch_data['elmo_p_mask'] = [] # for passage_utterance batch_data['elmo_pu_lm_embeddings'] = [] 
batch_data['elmo_pu_lengths'] = [] batch_data['elmo_pu_mask'] = [] # for question batch_data['elmo_q_lm_embeddings'] = [] batch_data['elmo_q_lengths'] = [] batch_data['elmo_q_mask'] = [] # for res batch_data['elmo_r_lm_embeddings'] = [] batch_data['elmo_r_lengths'] = [] batch_data['elmo_r_mask'] = [] def add_elmo_placeholder_with_cache_sntids(self): """ add placeholders for elmo ops, which will be used in the weight_layers """ if self.need_p_cache: self.elmo_p_lm_embeddings = tf.placeholder( tf.float32, [None, self.output_layers, None, self.output_dim], name='elmp_p_lm_embeddings') self.elmo_p_lengths = tf.placeholder(tf.int32, [None], name='elmo_p_lengths') self.elmo_p_mask = tf.placeholder(tf.int32, [None, None], name='elmo_p_mask') self.elmo_pu_snt_ids = tf.placeholder(tf.int32, [None], name='elmo_pu_snt_ids') self.elmo_q_snt_ids = tf.placeholder(tf.int32, [None], name='elmo_q_snt_ids') self.elmo_r_snt_ids = tf.placeholder(tf.int32, [None], name='elmo_r_snt_ids') def add_elmo_placeholder_with_cache_emb(self): """ add placeholders for elmo ops, which will be used in the weight_layers """ if self.need_p_cache: self.elmo_p_lm_embeddings = tf.placeholder( tf.float32, [None, self.output_layers, None, self.output_dim], name='elmp_p_lm_embeddings') self.elmo_p_lengths = tf.placeholder(tf.int32, [None], name='elmo_p_lengths') self.elmo_p_mask = tf.placeholder(tf.int32, [None, None], name='elmo_p_mask') self.elmo_pu_lm_embeddings = tf.placeholder( tf.float32, [None, self.output_layers, None, self.output_dim], name='elmo_pu_lm_embeddings') self.elmo_pu_lengths = tf.placeholder(tf.int32, [None], name='elmo_pu_lengths') self.elmo_pu_mask = tf.placeholder(tf.int32, [None, None], name='elmo_pu_mask') self.elmo_q_lm_embeddings = tf.placeholder( tf.float32, [None, self.output_layers, None, self.output_dim], name='elmo_q_lm_embeddings') self.elmo_q_lengths = tf.placeholder(tf.int32, [None], name='elmo_q_lengths') self.elmo_q_mask = tf.placeholder(tf.int32, [None, None], 
name='elmo_q_mask') self.elmo_r_lm_embeddings = tf.placeholder( tf.float32, [None, self.output_layers, None, self.output_dim], name='elmo_r_lm_embeddings') self.elmo_r_lengths = tf.placeholder(tf.int32, [None], name='elmo_r_lengths') self.elmo_r_mask = tf.placeholder(tf.int32, [None, None], name='elmo_r_mask') def prepare_elmo_cache_feed_dict_sntids(self, feed_dict, batch): """ consitently feed the batch_data, we prepared in the prepare_passage_elmo, question_elmo, answer_elmo """ if self.need_p_cache: # for elmo_p feed_dict[ self.elmo_p_lm_embeddings] = batch['elmo_p_lm_embeddings'] feed_dict[self.elmo_p_lengths] = batch['elmo_p_lengths'] feed_dict[self.elmo_p_mask] = batch['elmo_p_mask'] # for elmo_q feed_dict[self.elmo_q_snt_ids] = batch['elmo_q_snt_ids'] # for elmo_pu feed_dict[self.elmo_pu_snt_ids] = batch['elmo_pu_snt_ids'] # for elmo_r feed_dict[self.elmo_r_snt_ids] = batch['elmo_r_snt_ids'] def prepare_elmo_cache_feed_dict_emb(self, feed_dict, batch): """ consitently feed the batch_data, we prepared in the prepare_passage_elmo, question_elmo, answer_elmo """ if self.need_p_cache: # for elmo_p feed_dict[ self.elmo_p_lm_embeddings] = batch['elmo_p_lm_embeddings'] feed_dict[self.elmo_p_lengths] = batch['elmo_p_lengths'] feed_dict[self.elmo_p_mask] = batch['elmo_p_mask'] # for elmo_q feed_dict[self.elmo_q_lm_embeddings] = batch['elmo_q_lm_embeddings'] feed_dict[self.elmo_q_lengths] = batch['elmo_q_lengths'] feed_dict[self.elmo_q_mask] = batch['elmo_q_mask'] # for elmo_pu feed_dict[self.elmo_pu_lm_embeddings] = batch['elmo_pu_lm_embeddings'] feed_dict[self.elmo_pu_lengths] = batch['elmo_pu_lengths'] feed_dict[self.elmo_pu_mask] = batch['elmo_pu_mask'] # for elmo_r feed_dict[self.elmo_r_lm_embeddings] = batch['elmo_r_lm_embeddings'] feed_dict[self.elmo_r_lengths] = batch['elmo_r_lengths'] feed_dict[self.elmo_r_mask] = batch['elmo_r_mask'] def elmo_embedding_layer_emb(self, elmo_emb_output): """ elmo embedding layers, which will return embedding for p,q,a,pu,qu 
after projections, dim is elmo_emb_output if elmo_emb_output == self.output_dim, then no projection will be done """ self.logger.info('build elmo embedding layer') if self.need_p_cache: p_emb_elmo_op = { 'lm_embeddings': self.elmo_p_lm_embeddings, 'lengths': self.elmo_p_lengths, 'mask': self.elmo_p_mask } q_emb_elmo_op = { 'lm_embeddings': self.elmo_q_lm_embeddings, 'lengths': self.elmo_q_lengths, 'mask': self.elmo_q_mask } pu_emb_elmo_op = { 'lm_embeddings': self.elmo_pu_lm_embeddings, 'lengths': self.elmo_pu_lengths, 'mask': self.elmo_pu_mask } r_emb_elmo_op = { 'lm_embeddings': self.elmo_r_lm_embeddings, 'lengths': self.elmo_r_lengths, 'mask': self.elmo_r_mask } with tf.device('/device:GPU:1'): with tf.variable_scope("", reuse=tf.AUTO_REUSE): if self.need_p_cache: self.p_elmo_emb = self.weight_layers( 'input', p_emb_elmo_op, l2_coef=0.0)['weighted_op'] self.q_elmo_emb = self.weight_layers( 'input', q_emb_elmo_op, l2_coef=0.0)['weighted_op'] self.pu_elmo_emb = self.weight_layers( 'input', pu_emb_elmo_op, l2_coef=0.0)['weighted_op'] self.r_elmo_emb = self.weight_layers( 'input', r_emb_elmo_op, l2_coef=0.0)['weighted_op'] # do project from elmo embedding into 128 embedding to contact with word embedding. 
if elmo_emb_output == self.output_dim: self.logger.info( "Elmo_emb_output={} is just equal to the output_dim={}, no need to project with fully connected layers for passage and questions" .format(elmo_emb_output, self.output_dim)) else: self.logger.info( "Elmo_emb_output={}, output_dim={}, project with fully connected layers for question and passage" .format(elmo_emb_output, self.output_dim)) if self.need_p_cache: self.p_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.p_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) self.q_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.q_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) self.pu_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.pu_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) self.r_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.r_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) def elmo_embedding_layer_sntids(self, elmo_emb_output): """ elmo embedding layers, which will return embedding for p,q,a,pu,qu after projections, dim is elmo_emb_output if elmo_emb_output == self.output_dim, then no projection will be done """ with tf.device('/cpu:0'), tf.variable_scope('elmo_embedding'): self.elmo_lm_embeddings_lookup = tf.get_variable( 'lm_embeddings_lookup', shape=np.shape(self.utterance_cache_in_mem['lm_embeddings']), initializer=tf.constant_initializer( self.utterance_cache_in_mem['lm_embeddings']), trainable=False) self.elmo_lengths_lookup = tf.get_variable( 'lengths_lookup', shape=(np.shape(self.utterance_cache_in_mem['lengths'])), initializer=tf.constant_initializer( self.utterance_cache_in_mem['lengths']), trainable=False) self.elmo_mask_lookup = tf.get_variable( 'mask_lookup', shape=np.shape(self.utterance_cache_in_mem['mask']), initializer=tf.constant_initializer( self.utterance_cache_in_mem['mask']), trainable=False) if self.need_p_cache: p_emb_elmo_op = { 'lm_embeddings': self.elmo_p_embeddings, 
'lengths': self.elmo_p_lengths, 'mask': self.elmo_p_mask } q_emb_elmo_op = { 'lm_embeddings': tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup, self.elmo_q_snt_ids), 'lengths': tf.nn.embedding_lookup(self.elmo_lengths_lookup, self.elmo_q_snt_ids), 'mask': tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_q_snt_ids) } pu_emb_elmo_op = { 'lm_embeddings': tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup, self.elmo_pu_snt_ids), 'lengths': tf.nn.embedding_lookup(self.elmo_lengths_lookup, self.elmo_pu_snt_ids), 'mask': tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_pu_snt_ids) } r_emb_elmo_op = { 'lm_embeddings': tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup, self.elmo_r_snt_ids), 'lengths': tf.nn.embedding_lookup(self.elmo_lengths_lookup, self.elmo_r_snt_ids), 'mask': tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_r_snt_ids) } with tf.device('/device:GPU:1'): with tf.variable_scope("", reuse=tf.AUTO_REUSE): if self.need_p_cache: self.p_elmo_emb = self.weight_layers( 'input', p_emb_elmo_op, l2_coef=0.0)['weighted_op'] self.q_elmo_emb = self.weight_layers( 'input', q_emb_elmo_op, l2_coef=0.0)['weighted_op'] self.pu_elmo_emb = self.weight_layers( 'input', pu_emb_elmo_op, l2_coef=0.0)['weighted_op'] self.r_elmo_emb = self.weight_layers( 'input', r_emb_elmo_op, l2_coef=0.0)['weighted_op'] # do project from elmo embedding into 128 embedding to contact with word embedding. 
if elmo_emb_output == self.output_dim: self.logger.info( "Elmo_emb_output={} is just equal to the output_dim={}, no need to project with fully connected layers for question and passage" .format(elmo_emb_output, self.output_dim)) else: self.logger.info( "Elmo_emb_output={}, output_dim={}, project with fully connected layers for question and passage" .format(elmo_emb_output, self.output_dim)) if self.need_p_cache: self.p_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.p_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) self.q_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.q_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) self.pu_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.pu_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) self.r_elmo_emb = tf.contrib.layers.fully_connected( inputs=self.r_elmo_emb, num_outputs=elmo_emb_output, activation_fn=tf.nn.softmax) def assemble_elmo_batch_data(self, name, batch_data, id_key, cache): lm_embeddings = cache['lm_embeddings']['{}'.format(id_key)][...] length = cache['lengths']['{}'.format(id_key)][0] mask = cache['mask']['{}'.format(id_key)][...] batch_data['elmo_{}_lm_embeddings'.format(name)].append(lm_embeddings) batch_data['elmo_{}_lengths'.format(name)].append(length) batch_data['elmo_{}_mask'.format(name)].append(mask) def assemble_elmo_batch_data_with_mem(self, name, batch_data, id_key, cache_in_mem): """ id_key is int here, for the snt_id """ lm_embeddings = cache_in_mem['lm_embeddings'][id_key] length = cache_in_mem['lengths'][id_key] mask = cache_in_mem['mask'][id_key] batch_data['elmo_{}_lm_embeddings'.format(name)].append(lm_embeddings) batch_data['elmo_{}_lengths'.format(name)].append(length) batch_data['elmo_{}_mask'.format(name)].append(mask) def assemble_elmo_with_snt_ids(self, name, batch_data, id_key): """ id_key is int here, for the snt_id """ batch_data['elmo_{}_snt_ids'.format(name)].append(id_key)
def validate():
    """Evaluate the TextCNN(ELMo) feature extractor + SVM on the validation set.

    Steps:
      1. Load validation labels/texts and tokenize them.
      2. Restore the saved TextCNN graph and extract the ``g_conv/Squeeze:0``
         features batch by batch.
      3. Load the pickled SVM for ``label_index`` and print a classification
         report, per-class F1, macro F1 and accuracy.

    Reads the module-level names ``valid_label_path``, ``valid_data_path``,
    ``label_index``, ``num_classes``, ``train_vocab_file``, ``rootdir``,
    ``batch_size`` and ``MAX_LEN``.

    Raises:
        FileNotFoundError: if the model directory holds no ``.meta`` file.
    """
    print("Validating with validation set...")
    # load validation set
    print("Loading validation set...")
    valid_labels, valid_text_ids = load_labels(valid_label_path,
                                               label_index,
                                               num_classes,
                                               one_hot=False)
    valid_texts = load_texts(valid_data_path, valid_text_ids)
    valid_size = len(valid_texts)
    tokenized_valid_texts = tokenize(valid_texts)

    # Create a TokenBatcher to map text to token ids
    batcher = TokenBatcher(train_vocab_file)

    # restore the TextCNN model
    print("Restoring TextCNN model...")
    tf.reset_default_graph()
    cnn_path = rootdir + r"models\TextCNN_with_elmo\4-classifier\\" \
        + r"label" + str(label_index) \
        + r"\TextCNN_with_elmo"
    cnn_path_dir = os.path.dirname(cnn_path)
    # os.listdir() returns entries in arbitrary order, so pick the meta-graph
    # file explicitly instead of trusting whatever happens to come last.
    meta_files = sorted(entry for entry in os.listdir(cnn_path_dir)
                        if entry.endswith('.meta'))
    if not meta_files:
        raise FileNotFoundError(
            "no .meta graph file found in " + cnn_path_dir)
    meta_path = os.path.join(cnn_path_dir, meta_files[-1])

    sess = tf.InteractiveSession()
    try:
        saver = tf.train.import_meta_graph(meta_path)
        saver.restore(sess, tf.train.latest_checkpoint(cnn_path_dir))
        graph = tf.get_default_graph()
        input_tensors = {
            't_real_text': graph.get_tensor_by_name('real_text_input:0'),
            'input_y': graph.get_tensor_by_name('input_y:0')
        }
        # Not used below; kept so a graph missing these tensors still fails
        # early, as it did before.
        feedback = {
            'loss': graph.get_tensor_by_name('loss/add:0'),
            'accuracy': graph.get_tensor_by_name('accuracy/accuracy:0')
        }
        features = graph.get_tensor_by_name('g_conv/Squeeze:0')

        # Extracting features of texts in validation set
        print("Extracting features of texts in validation set(4 classes)...")
        all_features = []
        # Step through the data so the final, possibly shorter, batch is
        # included; floor division used to drop it, leaving fewer feature
        # rows than labels.
        for start in range(0, valid_size, batch_size):
            batch_texts = tokenized_valid_texts[start:start + batch_size]
            batch_texts = batcher.batch_sentences(batch_texts)
            batch_texts = pad_and_cut(batch_texts, MAX_LEN)
            batch_features = sess.run(
                features,
                feed_dict={input_tensors['t_real_text']: batch_texts})
            all_features.append(batch_features)
        valid_features = np.vstack(all_features)
    finally:
        # Release the session even when restoring or inference fails.
        sess.close()

    # restore the 4-class-svm model
    print("Restoring 4-class svm and predicting labels...")
    svm_path = rootdir + r"models\SVM\4-classifier\\" \
        + r"\svm_label" + str(label_index) + r".m"
    clf = joblib.load(svm_path)

    # predict labels of validation set with svm
    print("Predicting labels of validation set with svm...")
    predicted = clf.predict(valid_features)
    print(classification_report(valid_labels, predicted))
    acc = accuracy_score(valid_labels, predicted)
    f11 = f1_score(valid_labels, predicted, average=None)
    f12 = f1_score(valid_labels, predicted, average='macro')
    print("f11=", f11)
    print("f12=", f12)
    print("acc=", acc)
# Create a TokenBatcher to map text to token ids. batcher = TokenBatcher(vocab_file) # REQUIRED # Build the Elmo with biLM and weight layers. elmo = Elmo( options_file, weight_file, token_embedding_file=token_embedding_file, # REQUIRED token_batcher=batcher, # REQUIRED num_output_representations=1, requires_grad=False, do_layer_norm=False, dropout=0.) # Create batches of data. context_token_ids = batcher.batch_sentences( tokenized_context, add_bos_eos=False) question_token_ids = batcher.batch_sentences( tokenized_question, add_bos_eos=False) # numpy.ndarray or cupy.ndarray # with shape (batchsize, max_length) if gpu >= 0: # transfer the model to the gpu chainer.cuda.get_device_from_id(gpu).use() elmo.to_gpu() # transfer input data to the gpu context_token_ids = elmo.xp.asarray(context_token_ids) question_token_ids = elmo.xp.asarray(question_token_ids) # Compute elmo outputs, # i.e. weighted sum of multi-layer biLM's outputs.
class ProcessData:
    """Load pickled train/dev/test splits and build ELMo-ready mini-batches.

    Each sample is unpacked as ``(q, s, i, l)``: question tokens, a list of
    sentence token lists, indices of the first sentence of each passage, and
    per-sentence labels.
    """

    def __init__(self, params):
        """Load the datasets, create the ELMo batcher and read the dictionary.

        Args:
            params: configuration object; the attributes read here are
                ``data_path``, ``IS_DEBUG``, ``DATA_DEBUG`` / ``DATA_TRAIN`` /
                ``DATA_DEV`` / ``DATA_TEST``, ``USE_CHAR_ELMO`` and ``DIC``.
        """
        self.data_path = params.data_path
        self.params = params

        if params.IS_DEBUG:
            print('debug mode')
            # load data for debugging (the same file serves all three splits)
            self.train = self.load_data(self.data_path +
                                        self.params.DATA_DEBUG)
            self.dev = self.load_data(self.data_path + self.params.DATA_DEBUG)
            self.test = self.load_data(self.data_path +
                                       self.params.DATA_DEBUG)
        else:
            # load data
            self.train = self.load_data(self.data_path +
                                        self.params.DATA_TRAIN)
            self.dev = self.load_data(self.data_path + self.params.DATA_DEV)
            self.test = self.load_data(self.data_path + self.params.DATA_TEST)

        # batcher for ELMo
        if self.params.USE_CHAR_ELMO:
            print('[INFO] character-level ELMo')
            self.batcher = Batcher(self.data_path + self.params.DIC, 50)
        else:
            print('[INFO] cached-token-level ELMo')
            self.batcher = TokenBatcher(self.data_path + self.params.DIC)

        self.dic_size = 0
        with open(self.data_path + self.params.DIC, 'r') as f:
            self.dic = [x.strip() for x in f.readlines()]
            self.dic_size = len(self.dic)

        print('[completed] load data, dic_size: ', self.dic_size)

    def load_data(self, file_path):
        """Unpickle and return the dataset stored at ``file_path``."""
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # it on trusted data files.
        with open(file_path, 'rb') as f:
            dataset = pickle.load(f)
        print('load data : ', file_path, len(dataset))
        return dataset

    def get_glove(self):
        """Return the GloVe matrix loaded with ``np.load`` from ``params.GLOVE``."""
        print('[load glove] ' + self.params.GLOVE)
        return np.load(self.data_path + self.params.GLOVE)

    @staticmethod
    def _truncate_with_eos(tokens, max_length):
        """Return a copy of ``tokens`` cut to ``max_length - 3`` plus '<\\S>'.

        Three slots are reserved: ELMo adds <S> and a trailing padding 0, and
        the end marker is appended here.
        """
        clipped = tokens.copy()[:(max_length - 3)]
        clipped.append('<\\S>')
        return clipped

    def get_batch(self, data, batch_size, is_test=False, start_index=0):
        """Build one mini-batch.

        Args:
            data: data to be processed (train/dev/test).
            batch_size: mini-batch size.
            is_test: True for the inference stage (ordered input); when
                False, samples are drawn at random.
            start_index: start index of the mini-batch (used when
                ``is_test`` is True).

        Returns:
            A 7-tuple:
            elmo_list_q: batcher output for the batch of questions
                (each cut to fit MAX_LENGTH_Q).
            elmo_list_s: batcher output for the flattened
                ``batch * MAX_SENTENCES`` sentences (each cut to fit
                MAX_LENGTH_S).
            list_graph: [batch, MAX_SENTENCES+1, MAX_SENTENCES+1], adjacency
                matrix of the graph [question ; sentences].
            list_l: [batch, MAX_SENTENCES], labels.
            list_len_q: [batch], valid sequence length.
            list_len_s: [batch, MAX_SENTENCES], valid sequence length
                (0 for padding sentences).
            list_num_s: [batch], valid number of sentences.
        """
        list_q, list_s, list_graph, list_l = [], [], [], []
        list_len_q, list_len_s, list_num_s = [], [], []
        index = start_index
        max_sentences = self.params.MAX_SENTENCES

        # Get a random batch of encoder and encoderR inputs from data,
        # pad them if needed
        for _ in range(batch_size):
            tmp_list_s, tmp_list_len_s, tmp_list_l = [], [], []
            tmp_list_graph = np.zeros([max_sentences + 1, max_sentences + 1],
                                      dtype=np.int32)

            if is_test is False:
                # train case - random sampling
                q, s, i, l = random.choice(data)
            else:
                if index >= len(data):
                    # dummy data (use index 0 data) to fill the batch;
                    # it will not be evaluated
                    q, s, i, l = data[0]
                else:
                    # real data
                    q, s, i, l = data[index]
                index += 1

            s = s[:max_sentences]
            # Rebuilt as a new list, so the append below never touches `data`.
            i = [x for x in i if x < max_sentences]

            tmp_q = self._truncate_with_eos(q, self.params.MAX_LENGTH_Q)
            list_q.append(tmp_q)
            # ignore the appended end-of-sentence token
            list_len_q.append(min(len(tmp_q) - 1, self.params.MAX_LENGTH_Q))

            # add data as many as MAX_SENTENCES
            for tmp_i in range(max_sentences):
                if tmp_i < len(s):
                    # real data
                    tmp_s = self._truncate_with_eos(s[tmp_i],
                                                    self.params.MAX_LENGTH_S)
                    tmp_list_s.append(tmp_s)
                    # ignore the appended end-of-sentence token
                    tmp_list_len_s.append(
                        min(len(tmp_s) - 1, self.params.MAX_LENGTH_S))
                    tmp_list_l.append(int(l[tmp_i]))
                else:
                    # Add dummy data (data from index 0); length 0 marks it
                    # as padding.
                    tmp_s = self._truncate_with_eos(s[0],
                                                    self.params.MAX_LENGTH_S)
                    tmp_list_s.append(tmp_s)
                    tmp_list_len_s.append(0)
                    tmp_list_l.append(int(l[0]))

            # build graph adj matrix [question;sentences]
            # edge btw question and each sentence ( +1 for question )
            # [ max_sentence +1, max_sentence +1 ]
            tmp_list_graph[0][:len(s) + 1] = 1
            q_offset = 1

            # i = index of starting sentence in each passage; append the total
            # number of sentences to close the last passage
            i.append(len(s))

            start_s, end_s = -1, -1
            for sen_index in i:
                start_s = end_s
                end_s = sen_index

                # skipping initial condition
                if start_s != -1:
                    # indices (already offset) of sentences in this passage
                    tmp_same_passage = []

                    # edge btw sentences in the same passage
                    for tmp_i in range(start_s, end_s):
                        if self.params.EDGE_SENTENCE_QUESTION:
                            # edge with question
                            tmp_list_graph[tmp_i + q_offset][0] = 1

                        if self.params.EDGE_SELF:
                            # self edge
                            tmp_list_graph[tmp_i + q_offset][tmp_i +
                                                             q_offset] = 1

                        # edge with neighbor within passage
                        if self.params.EDGE_WITHIN_PASSAGE == 0:
                            if tmp_i + 1 != end_s:
                                tmp_list_graph[tmp_i + q_offset][
                                    tmp_i + 1 + q_offset] = 1
                                tmp_list_graph[tmp_i + 1 + q_offset][
                                    tmp_i + q_offset] = 1

                        tmp_same_passage.append(tmp_i + q_offset)

                    # edge fully-connected within passage
                    if self.params.EDGE_WITHIN_PASSAGE == 1:
                        for sent_idx in tmp_same_passage:
                            # self-connection is defined from params.EDGE_SELF
                            others = list(tmp_same_passage)
                            others.remove(sent_idx)
                            # q_offset is already applied
                            tmp_list_graph[sent_idx][others] = 1

            # edge fully-connected among first sentence of the passage
            if self.params.EDGE_PASSAGE_PASSAGE:
                # drop the appended total length, then apply the q offset
                tmp_passage_index = [(x + q_offset) for x in list(i)[:-1]]

                for passage_idx in tmp_passage_index:
                    # self-connection is defined from params.EDGE_SELF
                    others = list(tmp_passage_index)
                    others.remove(passage_idx)
                    # q_offset is already applied
                    tmp_list_graph[passage_idx][others] = 1

            list_graph.append(tmp_list_graph)
            list_s.append(tmp_list_s)
            list_len_s.append(tmp_list_len_s)
            list_l.append(tmp_list_l)
            list_num_s.append(len(s))

        # Flatten [batch][MAX_SENTENCES] -> [batch * MAX_SENTENCES] in plain
        # Python. np.reshape on this ragged nested list fails on current
        # NumPy (and whenever all sentences share one length), and it relied
        # on params.batch_size instead of the batch actually built.
        list_s_flat = [sent for sents in list_s for sent in sents]

        elmo_list_q = self.batcher.batch_sentences(list_q)
        elmo_list_s = self.batcher.batch_sentences(list_s_flat)

        return elmo_list_q, elmo_list_s, list_graph, list_l, list_len_q, list_len_s, list_num_s
class Data(object):
    """Container for paraphrase / non-paraphrase sentence pairs.

    Sentences are stored as fixed-length arrays of token ids. On top of them
    the class keeps paraphrase and negative tuples plus reverse indexes that
    are used to sample negative examples.
    """

    def __init__(self, length=0, use_synonym=False):
        """Create an empty data object.

        Args:
            length: fixed sentence length (in token ids) used for padding.
            use_synonym: whether ``overlap`` also reports synonym pairs.
        """
        # [(sent_id, sent_id, index_of_an_overlapping/synonym_token,
        #   index_of_an_overlapping/synonym_token), ...]
        self.para_tuples = []
        # same layout as para_tuples, for non-paraphrase pairs
        self.neg_tuples = []
        # {(token_id, token_id): set([neg_tuple_id, ...])}
        self.token_pair2neg_tuples = {}
        # a list of arrays, where each array is a list of token ids (which
        # represent a sentence); converted to a numpy array after loading
        self.id2sent = []
        self.sent2id = {}
        # a set of {(sent_id, sent_id), ...} to quickly check whether two
        # sentences are paraphrases or not
        self.paraphrases = set()
        # reverse index of sentences given tokens:
        # {token_id: set([(sent_id, index_of_the_token_in_the_sentence), ...])}
        self.token2sents = {}
        # {token_id: set([token_id, ...])}
        self.synonyms = {}
        self.use_synonym = use_synonym
        self.stop_word_ids = set()
        self.length = length
        # self.batch_sizeK = None  # to be read by tester
        # filled in by build() from the token batcher's vocabulary
        self.word2id = {}
        self.id2word = []

    def build(self, vocab_file, stop_word_file, synonym_file=None):
        """Create the token batcher and load synonyms and stop words.

        Args:
            vocab_file: vocabulary file handed to ``TokenBatcher``.
            stop_word_file: one stop word per line, or None to skip.
            synonym_file: tab-separated file whose columns 0 and 2 hold a
                synonym pair, or None to skip.
        """
        # 1. build TokenBatcher
        self.token_batcher = TokenBatcher(vocab_file)
        self.word2id = self.token_batcher._lm_vocab._word_to_id
        self.id2word = self.token_batcher._lm_vocab._id_to_word

        # 2. if synonym_file is not None, populate synonyms (two directions).
        # The guard is needed: open(None) raises, and None is the default.
        if synonym_file is not None:
            with open(synonym_file, "r") as f:
                for line in f:
                    line = line.strip().split("\t")
                    if (line[0] in self.word2id and line[2] in self.word2id):
                        id0 = self.word2id[line[0]]
                        id1 = self.word2id[line[2]]
                        if id1 == id0:
                            continue
                        self.synonyms.setdefault(id0, set()).add(id1)
                        self.synonyms.setdefault(id1, set()).add(id0)

        # 3. if stop_word_file is not None, populate stop_word_ids
        if stop_word_file is not None:
            with open(stop_word_file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line in self.word2id:
                        self.stop_word_ids.add(self.word2id[line])

    def _sentence_id(self, token_ids):
        """Return the id of ``token_ids``, registering the sentence if new."""
        key = tuple(token_ids)
        if key not in self.sent2id:
            self.id2sent.append(token_ids)
            self.sent2id[key] = len(self.id2sent) - 1
        return self.sent2id[key]

    def _index_tokens(self, sent_id, token_ids):
        """Add the sentence's tokens to ``token2sents``, skipping ids 1 and 2."""
        for index, token_id in enumerate(token_ids):
            if token_id == 2 or token_id == 1:
                continue
            self.token2sents.setdefault(token_id, set()).add((sent_id, index))

    # The dataset is formatted as sentence\tsentence\tlabel
    def load_sentece_pairs(self, data_file_list, bad_words, data_type_list):
        """Read sentence-pair files and populate all tuples and indexes.

        For each line the label is column 0; the sentences are columns 3/4
        for data type "mrpc" and columns 1/2 otherwise. Pairs containing a
        bad word, or whose token-id sequence is longer than ``self.length``
        or shorter than 3, are skipped. Reading a file stops once its line
        counter reaches 20000.

        Afterwards ``neg_tuples``, ``para_tuples`` and ``id2sent`` are numpy
        arrays.

        Args:
            data_file_list: paths of the data files.
            bad_words: tokens that disqualify a pair.
            data_type_list: data type per file, parallel to data_file_list.
        """
        s_len = []
        for data_file, data_type in zip(data_file_list, data_type_list):
            with open(data_file, "rt", encoding="utf-8") as f:
                count = 0
                for line in f:
                    count += 1
                    if count >= 20000:
                        break
                    line = line.strip().split('\t')
                    label = line[0]
                    if data_type == "mrpc":
                        s1 = line[3].split()
                        s2 = line[4].split()
                    else:
                        s1 = line[1].split()
                        s2 = line[2].split()

                    if any(w in s1 or w in s2 for w in bad_words):
                        continue

                    s1_tokenid = self.token_batcher.batch_sentences([s1])[0]
                    s2_tokenid = self.token_batcher.batch_sentences([s2])[0]

                    s_len.append(len(s1_tokenid))
                    s_len.append(len(s2_tokenid))
                    if (len(s1_tokenid) > self.length
                            or len(s1_tokenid) < 3):
                        print(s1_tokenid, s1)
                        continue
                    if (len(s2_tokenid) > self.length
                            or len(s2_tokenid) < 3):
                        print(s2_tokenid, s2)
                        continue

                    # Over-long sentences were skipped above, so only
                    # zero-padding up to self.length is ever needed.
                    s1_tokenid = np.pad(s1_tokenid,
                                        (0, self.length - len(s1_tokenid)),
                                        'constant',
                                        constant_values=(0))
                    s2_tokenid = np.pad(s2_tokenid,
                                        (0, self.length - len(s2_tokenid)),
                                        'constant',
                                        constant_values=(0))

                    s1_id = self._sentence_id(s1_tokenid)
                    s2_id = self._sentence_id(s2_tokenid)

                    # update paraphrases, para_tuples, neg_tuples
                    overlap_index_pairs, synonym_index_pairs = self.overlap(
                        s1_tokenid, s2_tokenid)
                    total_index_pairs = overlap_index_pairs + synonym_index_pairs
                    if label == "1":
                        self.paraphrases.add((s1_id, s2_id))
                        self.paraphrases.add((s2_id, s1_id))
                        for p in total_index_pairs:
                            self.para_tuples.append(
                                (s1_id, s2_id, p[0], p[1]))
                    else:
                        for p in total_index_pairs:
                            self.neg_tuples.append((s1_id, s2_id, p[0], p[1]))
                            w1 = s1_tokenid[p[0]]
                            w2 = s2_tokenid[p[1]]
                            if w1 in self.stop_word_ids or w2 in self.stop_word_ids:
                                continue
                            self.token_pair2neg_tuples.setdefault(
                                (w1, w2),
                                set()).add(len(self.neg_tuples) - 1)

                    # update token2sents
                    self._index_tokens(s1_id, s1_tokenid)
                    self._index_tokens(s2_id, s2_tokenid)

        self.neg_tuples = np.array(self.neg_tuples)
        self.para_tuples = np.array(self.para_tuples)
        self.id2sent = np.array(self.id2sent)
        s_len = np.array(s_len)
        # np.min / np.max raise on an empty array, so skip the stats then.
        if s_len.size:
            print("s length", np.min(s_len), np.max(s_len), np.mean(s_len),
                  np.median(s_len))

    def overlap(self, s1, s2):
        """Find positions of shared (and optionally synonymous) tokens.

        Token ids 0, 1 and 2, stop words, and tokens whose word is all
        digits or starts with '-' are not counted as overlaps. When a token
        repeats within a sentence, its last position is used.

        Args:
            s1: token ids of the first sentence.
            s2: token ids of the second sentence.

        Returns:
            ``(word_pairs, synonym_pairs)``: a list of ``[index_in_s1,
            index_in_s2]`` lists for shared tokens, and a de-duplicated list
            of ``(index_in_s1, index_in_s2)`` tuples for synonyms (empty
            unless ``self.use_synonym`` is set).
        """
        # check intersection
        s1_dict = dict((k, i) for i, k in enumerate(s1))
        s2_dict = dict((k, i) for i, k in enumerate(s2))
        inter = set(s1_dict).intersection(set(s2_dict))
        inter.difference_update((0, 1, 2))
        inter.difference_update(self.stop_word_ids)
        # drop numbers and tokens starting with '-'
        inter.difference_update([
            t for t in inter if self.id2word[t].isdigit()
            or self.id2word[t].startswith('-')
        ])
        word_pairs = [[s1_dict[w], s2_dict[w]] for w in inter]

        synonym_pairs = []
        if self.use_synonym:
            for token in s1_dict:
                for syn in self.synonyms.get(token, ()):
                    if syn in s2_dict:
                        synonym_pairs.append((s1_dict[token], s2_dict[syn]))
            for token in s2_dict:
                for syn in self.synonyms.get(token, ()):
                    if syn in s1_dict:
                        synonym_pairs.append((s1_dict[syn], s2_dict[token]))
            synonym_pairs = list(set(synonym_pairs))
        return word_pairs, synonym_pairs

    def _corrupt_side(self, anchor, anchor_index, excluded, min_candidates):
        """Pick a replacement for the sentence paired with ``anchor``.

        Candidates are the other sentences containing the anchor's token. A
        random negative tuple is returned when fewer than ``min_candidates``
        exist or when no non-paraphrase is found after a few draws.
        """
        token = self.id2sent[anchor][anchor_index]
        # Filter into a new list: removing entries from the set stored in
        # token2sents would permanently damage the reverse index.
        candidates = [c for c in self.token2sents[token] if c not in excluded]
        if len(candidates) < min_candidates:
            return random.choice(self.neg_tuples)
        corrupt_s = random.choice(candidates)
        ind = 0
        while self.is_paraphrase(corrupt_s[0], anchor):
            corrupt_s = random.choice(candidates)
            ind += 1
            if ind > 10:
                # Give up and fall back to a known negative; this value used
                # to be computed and then thrown away.
                return random.choice(self.neg_tuples)
        return (corrupt_s[0], anchor, corrupt_s[1], anchor_index)

    def corrupt(self, para_tuple, tar=None):
        """Corrupt a paraphrase tuple into a negative sample.

        Args:
            para_tuple: ``(sent_id, sent_id, token_index, token_index)``.
            tar: 0 to keep the first sentence as anchor, 1 to keep the
                second; chosen at random when None.

        Returns:
            ``(sent_id, sent_id, index_of_an_overlapping/synonym_token,
            index_of_an_overlapping/synonym_token)`` for a negative sample,
            or None when ``tar`` is neither 0 nor 1.
        """
        if tar is None:
            tar = random.randint(0, 1)
        s1 = para_tuple[0]
        s1_index = para_tuple[2]
        s2 = para_tuple[1]
        s2_index = para_tuple[3]
        excluded = ((s1, s1_index), (s2, s2_index))
        # NOTE(review): the two sides use different minimum candidate counts
        # (1 vs 2); kept as found -- confirm whether that is intentional.
        if tar == 0:
            return self._corrupt_side(s1, s1_index, excluded, 1)
        if tar == 1:
            return self._corrupt_side(s2, s2_index, excluded, 2)
        return None

    def neg(self, para_tuple):
        """Return a stored negative tuple sharing the pair's tokens, or None."""
        s1 = para_tuple[0]
        s1_index = para_tuple[2]
        s2 = para_tuple[1]
        s2_index = para_tuple[3]
        s1_token = self.id2sent[s1][s1_index]
        s2_token = self.id2sent[s2][s2_index]
        if (s1_token, s2_token) in self.token_pair2neg_tuples:
            neg_tuple_id = random.choice(
                list(self.token_pair2neg_tuples[(s1_token, s2_token)]))
            return self.neg_tuples[neg_tuple_id]
        return None

    def corrupt_n(self, para_tuple, n=2):
        """Return ``n`` negative samples for ``para_tuple``.

        In case logistic loss is used, ``corrupt`` is called ``n`` times; the
        random generator is re-seeded before each call.

        Returns:
            A list of ``n`` samples, or None if any call yields None.
        """
        corrupt_tuples = []
        for _ in range(n):
            # Re-seed from the default entropy source. Seeding with a
            # datetime object raises TypeError on Python 3.11+.
            random.seed()
            corrupt_tuple = self.corrupt(para_tuple)
            # `not corrupt_tuple` is ambiguous for the numpy rows that
            # corrupt() can return, so test for None explicitly.
            if corrupt_tuple is None:
                return None
            corrupt_tuples.append(corrupt_tuple)
        return corrupt_tuples

    def is_synonym(self, token_id1, token_id2):
        """Return True if ``token_id1`` is a recorded synonym of ``token_id2``."""
        # self.synonyms is a dict, so it must be indexed, not called.
        return token_id1 in self.synonyms.get(token_id2, ())

    def is_paraphrase(self, sent_id1, sent_id2):
        """Return True if the two sentence ids form a known paraphrase pair."""
        return (sent_id1, sent_id2) in self.paraphrases

    def save(self, filename):
        """Pickle this object's attribute dict to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
        print("Save data object as", filename)

    def load(self, filename):
        """Update this object's attributes from a file written by ``save``."""
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # files from trusted sources.
        with open(filename, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)
        print("Loaded data object from", filename)
        print(
            "===============\nCaution: need to reload desc embeddings.\n====================="
        )
class PreTrainElmoProcess(object):
    """Encode sentences with a pre-trained token-level ELMo (bilm-tf) model.

    The biLM is built once, in its own graph and session, when the object is
    created; ``encode`` then runs that session.
    """

    def __init__(self,
                 path=embedding_path,
                 embedding_dim=512,
                 sentence_len=max_sentence_len,
                 pair_mode=False):
        """Build the ELMo graph and start its session.

        Args:
            path: stored as ``self.embedding_path``.
            embedding_dim: width of the zero vectors used for padding.
            sentence_len: number of rows every encoded sentence is padded to.
            pair_mode: stored as ``self.pair_mode``.
        """
        embeddings = dict()
        self.embedding_path = path
        self.embedding_dim = embedding_dim
        self.sentence_len = sentence_len
        self.pair_mode = pair_mode
        self.embedding_dict = embeddings

        vocab_file = './bilmelmo/data/vocab.txt'
        options_file = './bilmelmo/try/options.json'
        weight_file = './bilmelmo/try/weights.hdf5'
        token_embedding_file = './bilmelmo/data/vocab_embedding.hdf5'

        # A dedicated graph keeps the ELMo ops apart from the default graph.
        with tf.Graph().as_default() as g_elmo:
            self.batcher = TokenBatcher(vocab_file)
            self.context_token_ids = tf.placeholder('int32',
                                                    shape=(None, None))
            self.bilm = BidirectionalLanguageModel(
                options_file,
                weight_file,
                use_character_inputs=False,
                embedding_weight_file=token_embedding_file
            )
            self.context_embeddings_op = self.bilm(self.context_token_ids)
            self.elmo_context_input = weight_layers(
                'input', self.context_embeddings_op, l2_coef=0.0)
            self.elmo_context_output = weight_layers(
                'output', self.context_embeddings_op, l2_coef=0.0
            )
            # Must be created inside the graph context so it covers the
            # variables defined above.
            init = tf.global_variables_initializer()

        sess_elmo = tf.Session(graph=g_elmo)
        sess_elmo.run(init)
        self.sess_elmo = sess_elmo

    def encode(self, sentence, **kwargs):
        """Return padded ELMo embeddings for each item of ``sentence``.

        Args:
            sentence: list of inputs; each item is converted with ``list()``
                and handed to the token batcher.
            **kwargs: optional ``pair_mode`` (bool). Pair mode is not
                implemented and returns None.

        Returns:
            ``np.stack`` of one array per item, each padded with zero rows up
            to ``self.sentence_len``; or None when ``sentence`` is not a list
            (a message is printed) or when pair mode is requested.

        Raises:
            TypeError: if ``pair_mode`` is given but is not a bool.
        """
        pair_mode = kwargs.get('pair_mode', False)
        if 'pair_mode' in kwargs and not isinstance(pair_mode, bool):
            raise TypeError("mode type must bool!")

        # Explicit check instead of assert/except AssertionError: asserts are
        # stripped under ``python -O`` and the handler could also swallow
        # unrelated assertion failures raised while encoding.
        if not isinstance(sentence, list):
            print("sentence must be list!")
            return None

        if pair_mode:
            # Nothing is computed for pair mode.
            return None

        embedding_unk = [0.0 for _ in range(self.embedding_dim)]
        out_put = []
        for _sentence in sentence:
            context_ids = self.batcher.batch_sentences(list(_sentence))
            # Only the first entry of the batch result is kept.
            out_put_tmp = self.sess_elmo.run(
                [self.elmo_context_input['weighted_op']],
                feed_dict={self.context_token_ids: context_ids}
            )[0][0].tolist()
            # Pad with zero vectors up to sentence_len.
            # NOTE(review): longer outputs are not truncated, so np.stack
            # below needs every padded output to have the same length.
            for _ in range(self.sentence_len - len(out_put_tmp)):
                out_put_tmp.append(embedding_unk)
            out_put.append(np.stack(out_put_tmp, axis=0))
        return np.stack(out_put, axis=0)
apply_ops = optimizer.apply_gradients(gvs) tvars = tf.trainable_variables() acc = tf.metrics.accuracy(labels=tf.argmax(y_label, axis=2), predictions=idx_output) #output model data_path = './NER_models' model_save_name = 'NERModel' final_model = os.path.join(data_path, model_save_name) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) #for tf.metrics ids = batcher.batch_sentences(tokenized_sentences) ids_val = batcher.batch_sentences(tokenized_sentences_val) batch_size = ids.shape[0] // 128 #128 is the max batch_size_val = ids_val.shape[0] // 128 saver = tf.train.Saver() for step in range(1000): for i in range(batch_size + 1): if (i < batch_size): s_index = i * 128 e_index = (i + 1) * 128 ids_i = ids[s_index:e_index] y_i = y[s_index:e_index] elmo_input_ = sess.run(elmo_input['weighted_op'], feed_dict={token_ids: ids_i}) loss_, _ = sess.run([loss, apply_ops], feed_dict={
class Model(object):
    """Sequence tagger: ELMo embeddings -> biLSTM -> projection -> CRF loss.

    The whole graph (placeholders, ELMo ops, encoder, loss, train op and
    saver) is built in ``__init__`` from a ``config`` mapping.
    """

    def __init__(self, config):
        """Build the graph.

        :param config: mapping providing ``lr``, ``dropout``, ``lstm_dim``,
            ``layer_type``, ``attention``, ``num_attention_heads``,
            ``size_per_head``, ``vocab_file``, ``options_file``,
            ``weight_file`` and ``token_embedding_file``.
        """
        self.lr = config["lr"]
        self.input_dropout = config["dropout"]
        self.lstm_dim = config["lstm_dim"]
        self.layer_type = config["layer_type"]
        self.use_attention = config["attention"]
        self.num_attention_heads = config['num_attention_heads']
        self.size_per_head = config['size_per_head']
        self.num_tags = 7
        self.char_dim = 300
        self.global_step = tf.Variable(0, trainable=False)
        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
        self.initializer = initializers.xavier_initializer()

        # elmo
        self.batcher = TokenBatcher(config['vocab_file'])
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(
            config['options_file'],
            config['weight_file'],
            use_character_inputs=False,
            embedding_weight_file=config['token_embedding_file'])
        self.context_embeddings_op = self.bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers(
            'input', self.context_embeddings_op, l2_coef=0.0)['weighted_op']

        # add placeholders for the model
        self.mask_inputs = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="ChatInputs")
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=[None, None],
                                      name="Targets")
        # dropout keep prob
        self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")

        # Non-zero mask entries count as real positions; their per-row sum
        # is the sequence length.
        used = tf.sign(tf.abs(self.mask_inputs))
        length = tf.reduce_sum(used, reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.mask_inputs)[0]
        self.num_steps = tf.shape(self.mask_inputs)[-1]

        self.logits = self.inference(self.elmo_context_input)
        # loss of the model
        # NOTE: this instance attribute shadows the ``loss`` method below on
        # every constructed instance.
        self.loss = self.loss_layer(self.logits, self.lengths)
        self.train_op = self.train(self.loss)
        # saver of the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    def loss(self, embedding):
        """Build logits and the CRF loss for ``embedding``.

        NOTE: on instances this method is shadowed by the ``self.loss``
        tensor assigned in ``__init__``; it is only reachable through the
        class (``Model.loss(instance, embedding)``).

        :param embedding: input tensor passed to :meth:`inference`.
        :return: scalar loss tensor.
        """
        logits = self.inference(embedding)
        # The second argument of loss_layer is the sequence lengths.
        loss = self.loss_layer(logits, self.lengths)
        return loss

    def train(self, loss):
        """Create the Adam training op with element-wise gradient clipping.

        :param loss: scalar loss tensor to minimise.
        :return: op applying the clipped gradients and incrementing
            ``self.global_step``.
        """
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(self.lr)
            # apply grad clip to avoid gradient explosion
            grads_vars = opt.compute_gradients(loss)
            # Variables that do not influence the loss come back with a
            # ``None`` gradient, which tf.clip_by_value cannot handle.
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v]
                                 for g, v in grads_vars if g is not None]
            train_op = opt.apply_gradients(capped_grads_vars,
                                           self.global_step)
            return train_op

    def _bilstm_block(self, model_inputs, lstm_dim, lengths):
        """Run one dropout-wrapped bidirectional LSTM in the current scope.

        :return: forward and backward outputs concatenated on the last axis.
        """
        fw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim, state_is_tuple=True)
        # Dropout on the cell outputs to reduce overfitting.
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(
            fw_cell, output_keep_prob=self.dropout)
        bw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim, state_is_tuple=True)
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(
            bw_cell, output_keep_prob=self.dropout)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            model_inputs,
            sequence_length=lengths,
            dtype=tf.float32)
        return tf.concat(outputs, -1)

    def single_biLSTM_layer(self, model_inputs, lstm_dim, lengths):
        """
        One bidirectional LSTM layer.
        :param model_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        with tf.variable_scope("first_layer"):
            output = self._bilstm_block(model_inputs, lstm_dim, lengths)
            return output

    def concat_biLSTM_layer(self, model_inputs, lstm_dim, lengths):
        """
        Two bidirectional LSTM layers; the second consumes the first one's
        output and both outputs are concatenated.
        :param model_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 4*lstm_dim]
        """
        with tf.variable_scope("first_layer"):
            first_layer_output = self._bilstm_block(
                model_inputs, lstm_dim, lengths)
        with tf.variable_scope("second_layer"):
            second_layer_output = self._bilstm_block(
                first_layer_output, lstm_dim, lengths)
        return tf.concat([first_layer_output, second_layer_output], axis=-1)

    def stack_biLSTM_layer(self, model_inputs, lstm_dim, lengths):
        """
        Two stacked bidirectional LSTM layers built with
        ``tf.contrib.rnn.stack_bidirectional_dynamic_rnn``.
        :param model_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        fw_lstms, bw_lstms = [], []
        for _ in range(2):
            fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                   state_is_tuple=True)
            # Dropout on the cell outputs to reduce overfitting.
            fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                fw_lstm_cell, output_keep_prob=self.dropout)
            fw_lstms.append(fw_lstm_cell)
            bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                   state_is_tuple=True)
            bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                bw_lstm_cell, output_keep_prob=self.dropout)
            bw_lstms.append(bw_lstm_cell)
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
            fw_lstms,
            bw_lstms,
            model_inputs,
            sequence_length=lengths,
            dtype=tf.float32)
        return outputs

    def project_layer_bilstm(self, lstm_outputs, num):
        """
        hidden layer between lstm layer and logits
        :param lstm_outputs: [batch_size, num_steps, emb_size]
        :param num: multiplier such that the last dimension of
            ``lstm_outputs`` equals ``num * lstm_dim`` (used only when
            attention is disabled)
        :return: [batch_size, num_steps, num_tags]
        """
        with tf.variable_scope("project"):
            with tf.variable_scope("attention"):
                if self.use_attention:
                    attention_outputs = attention_layer(
                        lstm_outputs, lstm_outputs, self.mask_inputs,
                        self.num_attention_heads, self.size_per_head)
                else:
                    attention_outputs = lstm_outputs
            with tf.variable_scope("hidden"):
                # The flattened feature width depends on whether the
                # attention layer replaced the raw LSTM outputs.
                if self.use_attention:
                    feature_dim = (self.num_attention_heads *
                                   self.size_per_head)
                else:
                    feature_dim = self.lstm_dim * num
                w_shape = [feature_dim, self.lstm_dim]
                output_shape = [-1, feature_dim]
                W = tf.get_variable("W",
                                    shape=w_shape,
                                    dtype=tf.float32,
                                    initializer=self.initializer)
                b = tf.get_variable("b",
                                    shape=[self.lstm_dim],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                output = tf.reshape(attention_outputs, shape=output_shape)
                hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
            # project to score of tags
            with tf.variable_scope("logits"):
                W = tf.get_variable("W",
                                    shape=[self.lstm_dim, self.num_tags],
                                    dtype=tf.float32,
                                    initializer=self.initializer)
                b = tf.get_variable("b",
                                    shape=[self.num_tags],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, W, b)
            return tf.reshape(pred, [-1, self.num_steps, self.num_tags])

    def inference(self, embedding):
        """Apply input dropout, the configured biLSTM encoder and projection.

        ``self.layer_type`` selects ``'single'``, ``'stack'`` or (any other
        value) the two-layer concatenating encoder.

        :param embedding: input embedding tensor.
        :return: tag logits from :meth:`project_layer_bilstm`.
        """
        model_inputs = tf.nn.dropout(embedding, self.dropout)
        if self.layer_type == 'single':
            model_outputs = self.single_biLSTM_layer(model_inputs,
                                                     self.lstm_dim,
                                                     self.lengths)
            logits = self.project_layer_bilstm(model_outputs, 2)
        elif self.layer_type == 'stack':
            model_outputs = self.stack_biLSTM_layer(model_inputs,
                                                    self.lstm_dim,
                                                    self.lengths)
            logits = self.project_layer_bilstm(model_outputs, 2)
        else:
            # The concatenating encoder is twice as wide (4 * lstm_dim).
            model_outputs = self.concat_biLSTM_layer(model_inputs,
                                                     self.lstm_dim,
                                                     self.lengths)
            logits = self.project_layer_bilstm(model_outputs, 4)
        return logits

    def loss_layer(self, project_logits, lengths):
        """
        calculate crf loss
        :param project_logits: [batch_size, num_steps, num_tags]
        :param lengths: [batch_size] real length of each sequence
        :return: scalar loss
        """
        with tf.variable_scope("crf_loss"):
            small = -1000.0
            # pad logits for crf loss: an extra "start" tag (index num_tags)
            # and an extra leading time step in which only that tag is likely
            start_logits = tf.concat([
                small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
                tf.zeros(shape=[self.batch_size, 1, 1])
            ], axis=-1)
            pad_logits = tf.cast(
                small * tf.ones([self.batch_size, self.num_steps, 1]),
                tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            targets = tf.concat([
                tf.cast(self.num_tags * tf.ones([self.batch_size, 1]),
                        tf.int32), self.targets
            ], axis=-1)

            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer)
            # +1 accounts for the prepended start step.
            log_likelihood, self.trans = crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)
            loss = tf.reduce_mean(-log_likelihood)
            return loss

    def create_feed_dict(self, is_train, batch):
        """Build the feed dict for one batch.

        :param is_train: when true, targets are fed and the configured
            dropout value is used; otherwise dropout is fed as 1.0.
        :param batch: ``(str_input, masks, tags)``.
        :return: dict mapping placeholders to numpy arrays / floats.
        """
        str_input, masks, tags = batch
        token_ids = self.batcher.batch_sentences(str_input)
        feed_dict = {
            self.context_token_ids: np.asarray(token_ids),
            self.mask_inputs: np.asarray(masks),
            self.dropout: 1.0
        }
        if is_train:
            feed_dict[self.targets] = np.asarray(tags)
            feed_dict[self.dropout] = self.input_dropout
        return feed_dict

    def run_step(self, sess, is_train, batch):
        """
        :param sess: session to run the batch
        :param is_train: a flag indicate if it is a train batch
        :param batch: ``(str_input, masks, tags)`` as consumed by
            ``create_feed_dict``
        :return: ``(global_step, loss)`` when training, otherwise
            ``(lengths, logits)``
        """
        feed_dict = self.create_feed_dict(is_train, batch)
        if is_train:
            global_step, loss, _ = sess.run(
                [self.global_step, self.loss, self.train_op], feed_dict)
            return global_step, loss
        lengths, logits = sess.run([self.lengths, self.logits], feed_dict)
        return lengths, logits

    def decode(self, logits, lengths, matrix):
        """
        :param logits: [batch_size, num_steps, num_tags]float32, logits
        :param lengths: [batch_size]int32, real length of each sequence
        :param matrix: transition matrix for inference
        :return: list with one tag path per sequence
        """
        # infer final labels using the Viterbi algorithm
        paths = []
        small = -1000.0
        start = np.asarray([[small] * self.num_tags + [0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            # Mirror the padding done in loss_layer: extra start tag column
            # and a leading start step.
            padded = np.concatenate([score, pad], axis=1)
            padded = np.concatenate([start, padded], axis=0)
            path, _ = viterbi_decode(padded, matrix)
            # Drop the artificial start step.
            paths.append(path[1:])
        return paths

    def evaluate(self, sess, data_manager):
        """Decode every batch of ``data_manager`` and pair it with the gold tags.

        :param sess: session holding the trained variables.
        :param data_manager: object whose ``iter_batch()`` yields batches
            with the strings first and the tags last.
        :return: list (one entry per sequence) of ``[char, [gold], [pred]]``
            triples.
        """
        results = []
        # Evaluate in the session we were given instead of relying on an
        # ambient default session.
        trans = self.trans.eval(session=sess)
        for batch in data_manager.iter_batch():
            strings = batch[0]
            tags = batch[-1]
            lengths, scores = self.run_step(sess, False, batch)
            batch_paths = self.decode(scores, lengths, trans)
            for i in range(len(strings)):
                seq_len = lengths[i]
                string = strings[i][:seq_len]
                gold_tags = [[int(x)] for x in tags[i][:seq_len]]
                pred_tags = [[int(x)] for x in batch_paths[i][:seq_len]]
                result = [[char, gold, pred]
                          for char, gold, pred in zip(string, gold_tags,
                                                      pred_tags)]
                results.append(result)
        return results