def main():
    args = _parse_args()
    # vocab = set(['<S>', '</S>'])
    # braces = re.compile(r'\(.*\)')
    # preproc = []
    # max_len = 0
    # with open(args.input, 'r') as src:
    #     for line in src:
    #         try:
    #             docid, text = line.split('\t', 1)
    #             text = re.sub(braces, '', text) or text
    #             sentence = sent_tokenize(text)[0]
    #             tokens = word_tokenize(sentence)
    #             max_len = max(len(tokens), max_len)
    #             preproc.append(docid + '\t' + ' '.join(tokens[:20]) + '\n')
    #             for token in tokens:
    #                 vocab.add(token)
    #         except IndexError:
    #             pass
    # print(max_len)
    # with open(args.input + '_prep', 'w+') as dst:
    #     dst.writelines(preproc)
    # with open(args.output, 'w+') as bilm_handle:
    #     bilm_handle.write('\n'.join(vocab))
    options_file = "/Users/asntr/Projects/university/course_work/end2end_neural_el/data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_options.json"
    weight_file = "/Users/asntr/Projects/university/course_work/end2end_neural_el/data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
    token_embedding_file = "/Users/asntr/Projects/university/course_work/end2end_neural_el/data/vocabulary/" + 'embeddings.hdf5'
    dump_token_embeddings(args.output, options_file, weight_file, token_embedding_file)
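# A quick sanity check of the dumped file; a minimal sketch assuming the
# 'embedding' dataset name that bilm-tf's dump_token_embeddings writes:
import h5py

with h5py.File('embeddings.hdf5', 'r') as f:  # the token_embedding_file above
    emb = f['embedding'][...]
    print(emb.shape)  # (vocab_size, embedding_dim)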
def _genElmoEmbedding(self):
    """
    Call the dump_token_embeddings method from the ELMo source code to generate
    word vector representations from the character-based representations and
    save them to an hdf5 file. The value under the file's "embedding" key holds
    the vector representation of each word in the vocabulary file; these word
    vectors are later used as the initialization input to the BiLM.
    """
    dump_token_embeddings(self._vocabFile, self._optionFile,
                          self._weightFile, self._tokenEmbeddingFile)
def __init__(self):
    self.vocab_file = 'vocab_small.txt'

    # Location of the pretrained LM.
    datadir = os.path.join('pretrained')
    options_file = os.path.join(
        datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
    weight_file = os.path.join(
        datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

    # Dump the token embeddings to a file. Run this once for your dataset.
    token_embedding_file = 'elmo_token_embeddings.hdf5'
    dump_token_embeddings(
        self.vocab_file, options_file, weight_file, token_embedding_file)

    self.batcher = TokenBatcher(self.vocab_file)

    # Input placeholders to the biLM.
    self.context_token_ids = tf.placeholder('int32', shape=(None, None))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=token_embedding_file)

    # Get ops to compute the LM embeddings.
    context_embeddings_op = bilm(self.context_token_ids)
    self.elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
    self.elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)
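# A minimal inference sketch for the graph built above, assuming the __init__
# belongs to a class here called ElmoModel (the name is hypothetical);
# weight_layers returns a dict whose 'weighted_op' entry is the weighted sum
# of the biLM layers:
model = ElmoModel()
tokenized_context = [['Pretrained', 'biLMs', 'compute', 'representations', '.']]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    context_ids = model.batcher.batch_sentences(tokenized_context)
    elmo_vecs = sess.run(
        model.elmo_context_input['weighted_op'],
        feed_dict={model.context_token_ids: context_ids})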
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab', help='vocabulary file, one token per line', required=True)
    parser.add_argument('--weight', help='ELMo weight file (.hdf5)', required=True)
    args = parser.parse_args()

    # Dump the token embeddings to a file. Run this once for your dataset.
    options_file = filename_variation(args.weight, 'options').replace('.hdf5', '.json')
    token_embedding_file = filename_variation(args.weight, 'token_embedding')
    print(f'output file: {token_embedding_file}')
    dump_token_embeddings(args.vocab, options_file, args.weight, token_embedding_file)
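# filename_variation is not defined in this snippet; a minimal sketch of one
# plausible implementation (an assumption, not the author's actual helper),
# inserting a suffix before the file extension:
import os

def filename_variation(path, suffix):
    # e.g. 'weights.hdf5' + 'options' -> 'weights_options.hdf5'
    root, ext = os.path.splitext(path)
    return '{}_{}{}'.format(root, suffix, ext)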
import os
import h5py
from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers, dump_token_embeddings

# Location of the pretrained LM.
datadir = 'kaggle_data'
vocab_file = os.path.join(datadir, 'vocab.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

# Dump the embeddings to a file. Run this once for your dataset.
token_embedding_file = 'kaggle_elmo_token_SMALL.hdf5'
dump_token_embeddings(vocab_file, options_file, weight_file, token_embedding_file)

import tensorflow as tf
tf.reset_default_graph()

import data as dt

# Split the data into a training set and a validation set.
alen = len(dt.X)
val_ratio = 0.1
val_len = int(alen * val_ratio)
tokenized_sentences = dt.X[:-val_len]
y = dt.y[:-val_len]
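# The matching validation slices are not shown in the snippet; presumably they
# are the complement of the training slices above:
val_sentences = dt.X[-val_len:]
val_y = dt.y[-val_len:]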
import argparse

from bilm import dump_token_embeddings

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input-file')
parser.add_argument('--weights-file')
parser.add_argument('--options-file')
parser.add_argument('-o', '--output-file', default='elmo_token_embeddings.hdf5')
args = parser.parse_args()

# Rewrite the vocabulary, mapping bracketed special tokens to the
# <...> names that bilm expects.
vocab_file = 'elmo_vocab.txt'
with open(args.input_file, 'r') as fin:
    with open(vocab_file, 'w') as fout:
        for line in fin:
            token = line.strip()
            if token == '[UNK]':
                token = '<UNK>'
            elif token == '[START]':
                token = '<S>'
            elif token == '[STOP]':
                token = '</S>'
            elif token == '[PAD]':
                token = '<PAD>'
            fout.write(token)
            fout.write('\n')

dump_token_embeddings(vocab_file, args.options_file, args.weights_file, args.output_file)
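# Example invocation (the script name is hypothetical):
#   python convert_vocab_and_dump.py -i bert_vocab.txt \
#       --options-file options.json --weights-file weights.hdf5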
def main():
    parser = ArgumentParser()
    parser.add_argument('--options-file', '-o', type=str, default="", help="elmo option file")
    parser.add_argument('--weight-file', '-w', type=str, default="", help="elmo weight file")
    parser.add_argument('--train-file', '-t', type=str, default="", help="training data")
    parser.add_argument('--dev-file', '-d', type=str, default="", help="dev data")
    parser.add_argument('--gpu', '-g', type=int, default=-1, help="gpu")
    parser.add_argument('--vocab-file', '-v', type=str, default="", help="vocab file")
    parser.add_argument('--token-embedding-file', '-e', type=str, default="", help="embedding file")
    args = parser.parse_args()
    # Example arguments:
    #   -o ../../../test_elmo/src/elmo-chainer/elmo_2x4096_512_2048cnn_2xhighway_options.json
    #   -w ../../../test_elmo/src/elmo-chainer/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
    #   -t ../../data/datasets/SQuAD-train-v1.1-processed-spacy.txt
    #   -d ../../data/datasets/SQuAD-dev-v1.1-processed-spacy.txt
    #   -g=5
    #   -v ../../data/embeddings/elmo/vocab_squad_1_1.txt
    #   -e ../../data/embeddings/elmo/token_embedding_squad_1_1.hdf5

    all_tokens = ['<S>', '</S>']
    with open(args.train_file) as f:
        json_list = [json.loads(line) for line in f]
    pbar = ProgressBar()
    for json_item in pbar(json_list):
        for token in json_item["document"]:
            if token not in all_tokens:
                all_tokens.append(token)
        for token in json_item["question"]:
            if token not in all_tokens:
                all_tokens.append(token)

    with open(args.dev_file) as f:
        json_list = [json.loads(line) for line in f]
    pbar = ProgressBar()
    for json_item in pbar(json_list):
        for token in json_item["document"]:
            if token not in all_tokens:
                all_tokens.append(token)
        for token in json_item["question"]:
            if token not in all_tokens:
                all_tokens.append(token)

    # vocab_file = 'vocab_squad1_1.txt'
    with open(args.vocab_file, 'w') as fout:
        fout.write('\n'.join(all_tokens))

    # Location of the pretrained LM.
    # options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
    # weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

    # Dump the token embeddings to a file. Run this once for your dataset.
    # token_embedding_file = 'elmo_token_embeddings_squad1_1.hdf5'

    # gpu id; if you want to use the cpu, set gpu=-1
    # gpu = -1

    # batchsize: encoding one token at a time is inefficient, and encoding too
    # many tokens at once is difficult due to memory
    batchsize = 64

    dump_token_embeddings(
        args.vocab_file, args.options_file, args.weight_file,
        args.token_embedding_file, gpu=args.gpu, batchsize=batchsize)
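# The list-membership checks above make the vocabulary build quadratic in the
# number of tokens; a minimal equivalent sketch that keeps insertion order but
# tests against a set:
all_tokens = ['<S>', '</S>']
seen = set(all_tokens)
for json_item in json_list:
    for token in json_item["document"] + json_item["question"]:
        if token not in seen:
            seen.add(token)
            all_tokens.append(token)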
def getElmoEmbedding(self):
    dump_token_embeddings(self.vocab_file, self.option_file,
                          self.weight_file, self.tokenEmbeddingFile)
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

# Dump the token embeddings to a file. Run this once for your dataset.
token_embedding_file = 'elmo_token_embeddings.hdf5'

# gpu id; if you want to use the cpu, set gpu=-1
gpu = -1

# batchsize: encoding one token at a time is inefficient, and encoding too
# many tokens at once is difficult due to memory
batchsize = 64

dump_token_embeddings(vocab_file, options_file, weight_file,
                      token_embedding_file, gpu=gpu, batchsize=batchsize)

###########################################
"""
Only two simple points differ from the character-based ELMo usage:
1. use TokenBatcher(vocab_file) instead of Batcher(vocab_file)
2. pass token_embedding_file and use the token batcher when instantiating Elmo
"""

# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)  # REQUIRED

# Build the Elmo model with the biLM and weight layers.
elmo = Elmo(
def get_vocab(config):
    print("Get the vocabulary...")
    word_counter, char_counter = Counter(), Counter()
    pos_counter, ner_counter, label_counter = Counter(), Counter(), Counter()
    files = [(config.train_para_file, config.train_question_file),
             (config.dev_para_file, config.dev_question_file)]
    for para_file, que_file in files:
        with open("{}.tok".format(para_file), 'r') as fp, open("{}.tok".format(que_file), 'r') as fq, \
                open("{}.pos".format(para_file), 'r') as fpp, open("{}.pos".format(que_file), 'r') as fqp, \
                open("{}.ner".format(para_file), 'r') as fpn, open("{}.ner".format(que_file), 'r') as fqn, \
                open("{}.label".format(para_file), 'r') as fpl:
            while True:
                para, question = fp.readline(), fq.readline()
                pos, que_pos = fpp.readline(), fqp.readline()
                ner, que_ner = fpn.readline(), fqn.readline()
                label = fpl.readline()
                if not question or not para:
                    break
                if config.lower_word:
                    para = para.lower()
                    question = question.lower()
                para_tokens = para.strip().split(' ')
                que_tokens = question.strip().split(' ')
                pos_tags = pos.strip().split(' ')
                ner_tags = ner.strip().split(' ')
                que_pos_tags = que_pos.strip().split(' ')
                que_ner_tags = que_ner.strip().split(' ')
                labels = label.strip().split(' ')
                for token in para_tokens + que_tokens:
                    word_counter[token] += 1
                    for char in list(token):
                        char_counter[char] += 1
                for pos_tag in pos_tags + que_pos_tags:
                    pos_counter[pos_tag] += 1
                for ner_tag in ner_tags + que_ner_tags:
                    ner_counter[ner_tag] += 1
                for label in labels:
                    label_counter[label] += 1

    word_emb_mat, word2idx_dict, unk_num = get_word_embedding(
        word_counter, emb_file=config.glove_word_file,
        emb_size=config.glove_word_size, vocab_size=config.vocab_size_limit,
        vec_size=config.glove_dim, vocab_file=config.vocab_file)
    char_emb_mat, char2idx_dict = get_tag_embedding(char_counter, "char", vec_size=config.char_dim)
    pos_emb_mat, pos2idx_dict = get_tag_embedding(pos_counter, "pos", vec_size=config.pos_dim)
    ner_emb_mat, ner2idx_dict = get_tag_embedding(ner_counter, "ner", vec_size=config.ner_dim)
    label_emb_mat, label2idx_dict = get_tag_embedding(
        label_counter, "label", vec_size=config.label_dim)
    print("{} out of {} are not in glove".format(unk_num, len(word2idx_dict)))
    print("{} chars".format(char_emb_mat.shape[0]))
    print("{} pos tags, {} ner tags, {} answer labels, {} chars".format(
        pos_emb_mat.shape[0], ner_emb_mat.shape[0], label_emb_mat.shape[0], char_emb_mat.shape[0]))

    save(config.word_emb_file, word_emb_mat, message="word embedding")
    save(config.char_emb_file, char_emb_mat, message="char embedding")
    save(config.pos_emb_file, pos_emb_mat, message="pos embedding")
    save(config.ner_emb_file, ner_emb_mat, message="ner embedding")
    save(config.label_emb_file, label_emb_mat, message="label embedding")
    save(config.word_dictionary, word2idx_dict, message="word dictionary")
    save(config.char_dictionary, char2idx_dict, message="char dictionary")
    save(config.pos_dictionary, pos2idx_dict, message="pos dictionary")
    save(config.ner_dictionary, ner2idx_dict, message="ner dictionary")
    save(config.label_dictionary, label2idx_dict, message="label dictionary")

    print("Dump elmo word embedding...")
    token_embedding_file = config.embedding_file
    dump_token_embeddings(config.vocab_file, config.elmo_options_file,
                          config.elmo_weight_file, token_embedding_file)
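# get_tag_embedding is not shown; a minimal sketch of one plausible
# implementation (an assumption, not the project's actual helper): index the
# observed tags and randomly initialize an embedding matrix, reserving row 0
# for padding.
import numpy as np

def get_tag_embedding(counter, tag_type, vec_size):
    print("Building {} embedding...".format(tag_type))
    tag2idx = {tag: idx for idx, tag in enumerate(counter, start=1)}
    emb_mat = np.random.normal(0.0, 0.1, size=(len(tag2idx) + 1, vec_size))
    emb_mat[0] = 0.0  # padding row
    return emb_mat, tag2idx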
def _make_dump_token_embeddings(self):
    dump_token_embeddings(self.vocab_path, self.elmo_options_file,
                          self.elmo_weight_file, self.token_embedding_file)
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

# Dump the token embeddings to a file. Run this once for your dataset.
token_embedding_file = 'elmo_token_embeddings.hdf5'

# gpu id; if you want to use the cpu, set gpu=-1
gpu = -1

# batchsize: encoding one token at a time is inefficient, and encoding too
# many tokens at once is difficult due to memory
batchsize = 64

dump_token_embeddings(
    vocab_file,
    options_file,
    weight_file,
    token_embedding_file,
    gpu=gpu,
    batchsize=batchsize
)

###########################################
"""
Only two simple points differ from the character-based ELMo usage:
1. use TokenBatcher(vocab_file) instead of Batcher(vocab_file)
2. pass token_embedding_file and use the token batcher when instantiating Elmo
"""

# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)  # REQUIRED

# Build the Elmo model with the biLM and weight layers.
elmo = Elmo(
    options_file,
for context_sentence in tokenized_context:
    for token in context_sentence:
        all_tokens.add(token)

vocab_file = 'vocab_small.txt'
with open(vocab_file, 'w') as fout:
    fout.write('\n'.join(all_tokens))

# Location of pretrained LM. Here we use the test fixtures.
datadir = os.path.join('tests', 'fixtures', 'model')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'lm_weights.hdf5')

# Dump the token embeddings to a file. Run this once for your dataset.
token_embedding_file = 'elmo_token_embeddings.hdf5'
dump_token_embeddings(
    vocab_file, options_file, weight_file, token_embedding_file
)
tf.reset_default_graph()

## Now we can do inference.
# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)

# Input placeholders to the biLM.
context_token_ids = tf.placeholder('int32', shape=(None, None))
question_token_ids = tf.placeholder('int32', shape=(None, None))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(
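# The snippet cuts off mid-call; for token inputs the constructor presumably
# continues as in the earlier __init__ snippet:
bilm = BidirectionalLanguageModel(
    options_file,
    weight_file,
    use_character_inputs=False,
    embedding_weight_file=token_embedding_file
)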