def eval(args):
    paddle.set_device(args.device)

    if not args.init_from_ckpt:
        raise ValueError('init_from_ckpt should be set when eval.')
    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    elmo.eval()

    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
    elmo.set_state_dict(weight_state_dict)
    print("Loaded checkpoint from %s" % args.init_from_ckpt)

    dev_dataset = OneBillionWordDataset(args.dev_data_path,
                                        vocab,
                                        args.batch_size,
                                        args.unroll_steps,
                                        mode='test',
                                        shuffle=False,
                                        seed=args.seed)

    dev_dataloader = DataLoader(dev_dataset, return_list=True, batch_size=None)

    total_step = total_loss = 0
    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(dev_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs

        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)

        total_loss += loss.numpy()[0]
        total_step += 1

        total_time += (time.time() - batch_start_time)
        if step % args.log_freq == 0:
            print("Eval step %d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  (step, loss.numpy()[0] * args.unroll_steps, ppl.numpy()[0],
                   total_time / args.log_freq))
            total_time = 0.0
        batch_start_time = time.time()

    avg_loss = total_loss / total_step
    avg_ppl = math.exp(avg_loss)
    print("Eval - average loss: %.4f - average Perplexity: %.4f" %
          (avg_loss * args.unroll_steps, avg_ppl))
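# A minimal, hypothetical entry point for running eval(args); parse_args is
# assumed to be the same argument parser used by the multi-GPU eval() variant
# further down and is not defined in this snippet.
if __name__ == "__main__":
    args = parse_args()
    eval(args)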
def __init__(self, model, options, vocab, nnvecs=1):

    self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab

    self.model = model
    self.nnvecs = nnvecs

    # Load ELMo if the option is set
    if options.elmo is not None:
        from elmo import ELMo
        self.elmo = ELMo(options.elmo, options.elmo_gamma,
                         options.elmo_learn_gamma)
        self.elmo.init_weights(model)
    else:
        self.elmo = None

    extra_words = 2  # MLP padding vector and OOV vector
    self.words = {word: ind for ind, word in enumerate(words, extra_words)}
    self.word_lookup = self.model.add_lookup_parameters(
        (len(self.words) + extra_words, options.word_emb_size))

    extra_pos = 2  # MLP padding vector and OOV vector
    self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
    self.pos_lookup = self.model.add_lookup_parameters(
        (len(cpos) + extra_pos, options.pos_emb_size))

    self.irels = rels
    self.rels = {rel: ind for ind, rel in enumerate(rels)}

    extra_chars = 1  # OOV vector
    self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
    self.char_lookup = self.model.add_lookup_parameters(
        (len(chars) + extra_chars, options.char_emb_size))

    extra_treebanks = 1  # Padding vector
    self.treebanks = {
        treebank: ind
        for ind, treebank in enumerate(treebanks, extra_treebanks)
    }
    self.treebank_lookup = self.model.add_lookup_parameters(
        (len(treebanks) + extra_treebanks, options.tbank_emb_size))

    # initialise word vectors with external embeddings where they exist
    # This part got ugly - TODO: refactor
    if not options.predict:
        self.external_embedding = defaultdict(lambda: {})

        if options.ext_word_emb_file and options.word_emb_size > 0:
            # Load pre-trained word embeddings
            for lang in langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_word_emb_file,
                    lang=lang,
                    words=self.words.viewkeys())
                self.external_embedding["words"].update(embeddings)

        if options.ext_char_emb_file and options.char_emb_size > 0:
            # Load pre-trained character embeddings
            for lang in langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_char_emb_file,
                    lang=lang,
                    words=self.chars,
                    chars=True)
                self.external_embedding["chars"].update(embeddings)

        if options.ext_emb_dir:
            # For every language, load the data for the word and character
            # embeddings from a directory.
            for lang in langs:
                if options.word_emb_size > 0:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_dir=options.ext_emb_dir,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)
                if options.char_emb_size > 0:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_dir=options.ext_emb_dir,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

        self.init_lookups(options)

    elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
    self.lstm_input_size = (
        options.word_emb_size + elmo_emb_size + options.pos_emb_size +
        options.tbank_emb_size + 2 *
        (options.char_lstm_output_size if options.char_emb_size > 0 else 0))

    print "Word-level LSTM input size: " + str(self.lstm_input_size)

    self.bilstms = []
    if options.no_bilstms > 0:
        self.bilstms.append(
            BiLSTM(self.lstm_input_size,
                   options.lstm_output_size,
                   self.model,
                   dropout_rate=0.33))
        for i in range(1, options.no_bilstms):
            self.bilstms.append(
                BiLSTM(2 * options.lstm_output_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (options.lstm_output_size * 2, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (options.lstm_output_size * 2))
    else:
        self.word2lstm = self.model.add_parameters(
            (self.lstm_input_size, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_input_size))

    self.char_bilstm = BiLSTM(options.char_emb_size,
                              options.char_lstm_output_size,
                              self.model,
                              dropout_rate=0.33)

    self.charPadding = self.model.add_parameters(
        (options.char_lstm_output_size * 2))
def eval():
    paddle.disable_static()
    n_gpus = dist.get_world_size()
    rank = dist.get_rank()

    if n_gpus > 1:
        dist.init_parallel_env()

    args = parse_args()
    if not args.init_from_ckpt:
        raise ValueError('init_from_ckpt should be set when eval.')
    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_gpus > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.eval()

    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
    elmo.set_state_dict(weight_state_dict)
    print("Loaded checkpoint from %s" % args.init_from_ckpt)

    dev_dataset = OneBillionWordDataset(args.dev_data_path,
                                        vocab,
                                        args.batch_size,
                                        args.unroll_steps,
                                        n_gpus,
                                        rank,
                                        mode='test',
                                        shuffle=False,
                                        seed=args.random_seed)

    # FIXME(xiemoyuan): When DataLoader supports setting batch_size to None,
    # set batch_size to None here.
    dev_dataloader = DataLoader(dev_dataset, return_list=True, batch_size=1)

    total_step = total_loss = 0
    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(dev_dataloader, start=1):
        # FIXME(xiemoyuan): When DataLoader supports setting batch_size to
        # None, delete this squeeze.
        for j in range(len(inputs)):
            inputs[j] = paddle.squeeze(inputs[j], axis=0)

        ids, next_ids, ids_reverse, next_ids_reverse = inputs

        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)

        total_loss += loss.numpy()[0]
        total_step += 1

        total_time += (time.time() - batch_start_time)
        if rank == 0:
            if step % args.log_freq == 0:
                print(
                    "Eval step %d - loss: %.4f - Perplexity: %.4f - %.3fs/step"
                    % (step, loss.numpy()[0] * args.unroll_steps,
                       ppl.numpy()[0], total_time / args.log_freq))
                total_time = 0.0
        batch_start_time = time.time()

    avg_loss = total_loss / total_step
    avg_ppl = math.exp(avg_loss)
    if rank == 0:
        print("Eval - average loss: %.4f - average Perplexity: %.4f" %
              (avg_loss * args.unroll_steps, avg_ppl))
import tensorflow as tf
from tensorflow.contrib import seq2seq
from elmo import ELMo
from data import NERData
import os

total_epoch = 5000
hidden_size = 200
vocab_size = 5000
max_length = 128
entity_class = 8
lr = 1e-4
batch_size = 256

ner = NERData(batch_size, max_length)
elmo = ELMo(batch_size, hidden_size, vocab_size)


def network(X):
    w = tf.get_variable("fcn_w", [1, hidden_size, entity_class])
    b = tf.get_variable("fcn_b", [entity_class])
    w_tile = tf.tile(w, [batch_size, 1, 1])
    logists = tf.nn.softmax(tf.nn.xw_plus_b(X, w_tile, b), name="logists")
    return logists


def train():
    X = tf.placeholder(shape=[batch_size, max_length],
                       dtype=tf.int32,
                       name="X")
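    # A hypothetical continuation of train(), sketched under assumptions: the
    # label placeholder Y, the elmo(X) call, and ner.next_batch() are NOT part
    # of the original snippet and are only illustrative.
    Y = tf.placeholder(shape=[batch_size, max_length], dtype=tf.int32, name="Y")
    emb = elmo(X)         # assumed: token ids -> [batch_size, max_length, hidden_size]
    probs = network(emb)  # per-token class probabilities (softmax already applied)

    # Cross-entropy on the softmax output returned by network().
    y_onehot = tf.one_hot(Y, entity_class)
    loss = -tf.reduce_mean(tf.reduce_sum(y_onehot * tf.log(probs + 1e-8), axis=-1))
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(total_epoch):
            x_batch, y_batch = ner.next_batch()  # assumed NERData API
            _, batch_loss = sess.run([train_op, loss],
                                     feed_dict={X: x_batch, Y: y_batch})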
    for _ in range(len(SENT2VEC))
]
for i in range(len(SENT2VEC)):
    s2vsingle[i].load_state(SENT2VEC[i])
    s2vsingle[i].set_w2v_path(PATH_TO_W2V)
    s2vsingle[i] = s2vsingle[i].cuda()
sent2vec = Sent2Vec(s2vsingle, 'concat')

params_model = {'bsize': 64, 'pool_type': 'mean', 'which_layer': 'all',
                'optfile': ELMO_OPTIONS, 'wgtfile': ELMO_WEIGHT}
elmo = ELMo(params_model)
elmo = elmo.cuda()

gensen_1 = GenSenSingle(
    model_folder=FOLDER_PATH,
    filename_prefix=PREFIX1,
    pretrained_emb=PRETRAIN_EMB,
    cuda=True
)
gensen_2 = GenSenSingle(
    model_folder=FOLDER_PATH,
    filename_prefix=PREFIX2,
    pretrained_emb=PRETRAIN_EMB,
    cuda=True
)
gensen = GenSen(gensen_1, gensen_2)
# The list contains the train, valid and test datasets. Each tuple in the list
# holds the dataset of word tokens and the dataset of character tokens.
datasets, field_word, field_char = gen_language_model_corpus(WikiText2)
train_data, valid_data, test_data = datasets
VOCAB_DIM = len(field_char.vocab)
OUTPUT_DIM = len(field_word.vocab)

# OTHER HYPER-PARAMETERS
BATCH_SIZE = 32
N_EPOCHS = 100
CLIP = 1
best_valid_loss = float('inf')
# PAD_IDX = field_word.vocab.stoi["<pad>"]  # PAD token for word, NOT CHAR

model = ELMo(VOCAB_DIM, OUTPUT_DIM, CHAR_EMB_DIM, HID_DIM, PRJ_DIM, FILTERS,
             CHAR_LEN, N_LAYERS).to(DEVICE)

# Initialize
model.init_weights()
print(f'The model has {count_parameters(model):,} trainable parameters')

import time

# criterion = cal_loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

train_losses = []
test_losses = []
for epoch in range(1, N_EPOCHS + 1):
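    # Hypothetical loop body, sketched under assumptions: the train_epoch and
    # evaluate helpers and the 'elmo-best.pt' checkpoint name are not shown in
    # the original snippet and are only illustrative.
    start_time = time.time()

    train_loss = train_epoch(model, train_data, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_data, criterion)

    train_losses.append(train_loss)
    test_losses.append(valid_loss)

    # Keep the checkpoint with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'elmo-best.pt')

    print(f'Epoch {epoch:03d} | {time.time() - start_time:.1f}s | '
          f'train loss {train_loss:.3f} | valid loss {valid_loss:.3f}')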
def train(args):
    paddle.set_device(args.device)
    n_procs = dist.get_world_size()
    rank = dist.get_rank()

    if n_procs > 1:
        dist.init_parallel_env()

    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_procs > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.train()

    global_norm_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.Adagrad(learning_rate=args.lr,
                                         parameters=elmo.parameters(),
                                         initial_accumulator_value=1.0,
                                         grad_clip=global_norm_clip)
    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    if args.init_from_ckpt:
        weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
        opt_state_dict = paddle.load(args.init_from_ckpt + '.pdopt')
        elmo.set_state_dict(weight_state_dict)
        optimizer.set_state_dict(opt_state_dict)
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    train_dataset = OneBillionWordDataset(args.train_data_path,
                                          vocab,
                                          args.batch_size,
                                          args.unroll_steps,
                                          n_procs=n_procs,
                                          rank=rank,
                                          mode='train',
                                          shuffle=True,
                                          seed=args.seed)

    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)

    n_tokens_per_batch = args.batch_size * args.unroll_steps * n_procs
    n_steps_per_epoch = int(train_dataset.number_of_tokens /
                            n_tokens_per_batch)
    n_steps_total = args.epochs * n_steps_per_epoch
    print("Training for %s epochs and %s steps" % (args.epochs, n_steps_total))

    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(train_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs

        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)
        loss *= args.unroll_steps
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        total_time += (time.time() - batch_start_time)
        if step % args.log_freq == 0:
            print("step %d/%d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  (step, n_steps_total, loss.numpy()[0], ppl.numpy()[0],
                   total_time / args.log_freq))
            total_time = 0.0
        if rank == 0 and step % args.save_freq == 0:
            save_params(elmo, optimizer, args.save_dir, step)
        if step == n_steps_total:
            # Training is done.
            if rank == 0:
                save_params(elmo, optimizer, args.save_dir, 'final')
            break
        batch_start_time = time.time()
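# A minimal, hypothetical launcher for train(args); parse_args and the n_gpus
# argument are assumptions, and dist refers to paddle.distributed as used in
# train() above.
if __name__ == "__main__":
    args = parse_args()
    if args.n_gpus > 1:
        # Spawn one worker process per GPU; each worker re-enters train(args).
        dist.spawn(train, args=(args,), nprocs=args.n_gpus)
    else:
        train(args)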
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # Load ELMo model
    params_model = {
        'bsize': 64,
        'pool_type': 'mean',
        'which_layer': 'all',
        'optfile': OPT_PATH,
        'wgtfile': MODEL_PATH
    }
    model = ELMo(params_model)
    params_senteval['elmo'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)

    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion'
    ]
    results_transfer = se.eval(transfer_tasks)

    print('--------------------------------------------')
    print('MR [Dev:%.1f/Test:%.1f]' %
          (results_transfer['MR']['devacc'], results_transfer['MR']['acc']))