def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
    self.embeddingSize = embeddingSize
    self.distinctTagNum = distinctTagNum
    self.numHidden = numHidden
    self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)
    self.words = tf.Variable(self.c2v, name="words")
    layers = [
        {'dilation': 1},
        {'dilation': 1},
        {'dilation': 2},
    ]
    if FLAGS.use_idcnn:
        self.model = IdCNN(layers, 3, FLAGS.num_hidden, FLAGS.embedding_size,
                           FLAGS.max_sentence_len, FLAGS.num_tags)
    else:
        self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                            FLAGS.num_tags)
    self.trains_params = None
    self.inp = tf.placeholder(tf.int32,
                              shape=[None, FLAGS.max_sentence_len],
                              name="input_placeholder")
class BiLSTM_CRF(nn.Module):
    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print("build batched lstmcrf...")
        self.gpu = data.HP_gpu
        ## add two more labels for the down-layer lstm; use the original label size for the CRF
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = BiLSTM(data)
        self.crf = CRF(label_size, self.gpu)

    def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs,
                                word_seq_lengths, char_inputs,
                                char_seq_lengths, char_seq_recover,
                                batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    def get_lstm_features(self, gaz_list, word_inputs, biword_inputs,
                          word_seq_lengths, char_inputs, char_seq_lengths,
                          char_seq_recover):
        return self.lstm.get_lstm_features(gaz_list, word_inputs,
                                           biword_inputs, word_seq_lengths,
                                           char_inputs, char_seq_lengths,
                                           char_seq_recover)
def run_bilstm(embedding_info, train, test, val, dist_func='cosine',
               early_stopping=True, plot=True):
    num_units = 64
    bilstm = BiLSTM(embedding_info, num_units, dist_func)
    adam = Adam(learning_rate=0.00001)
    bilstm.compile(loss='mse', optimizer=adam, metrics=['mse'])
    batch_size = 64
    num_epochs = 100
    trained_model = train_model(bilstm, train, val, early_stopping,
                                batch_size, num_epochs)
    evaluate_model(bilstm, test)
    if plot:
        plot_training(trained_model, 'BiLSTM', 'loss')
def __init__(self, vocab_size, emb_size, weight, hidden_size, out_size):
    """Initialise parameters:
        vocab_size: size of the vocabulary
        emb_size: dimensionality of the word embeddings
        hidden_size: dimensionality of the hidden vectors
        out_size: number of tag classes
    """
    super(BiLSTM_CRF, self).__init__()
    self.bilstm = BiLSTM(vocab_size, emb_size, weight, hidden_size, out_size)
    # The CRF layer just learns one extra transition matrix of shape
    # [out_size, out_size], initialised to a uniform distribution.
    self.transition = nn.Parameter(
        torch.ones(out_size, out_size) * 1 / out_size)
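# A minimal, self-contained sketch (not this project's own CRF code) of how a
# transition matrix like self.transition above is typically used at decode
# time. All names here (viterbi_decode, emissions) are illustrative only.
import torch

def viterbi_decode(emissions, transition):
    # emissions: [seq_len, num_tags] per-step tag scores from the BiLSTM
    # transition: [num_tags, num_tags]; transition[i, j] = score of tag i -> tag j
    seq_len, num_tags = emissions.shape
    score = emissions[0]  # best score of a path ending in each tag so far
    backpointers = []
    for t in range(1, seq_len):
        # total[i, j] = score[i] + transition[i, j] + emissions[t][j]
        total = score.unsqueeze(1) + transition + emissions[t].unsqueeze(0)
        score, best_prev = total.max(dim=0)
        backpointers.append(best_prev)
    # follow the backpointers from the best final tag
    best_tag = int(score.argmax())
    path = [best_tag]
    for best_prev in reversed(backpointers):
        best_tag = int(best_prev[best_tag])
        path.append(best_tag)
    return list(reversed(path))

# usage: 4 time steps, 3 tags, uniform transitions as in the constructor above
emissions = torch.randn(4, 3)
transition = torch.ones(3, 3) / 3
print(viterbi_decode(emissions, transition))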
def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
    self.embeddingSize = embeddingSize
    self.distinctTagNum = distinctTagNum
    self.numHidden = numHidden
    num_shards = FLAGS.num_shards
    self.c2v = self.load_w2v(num_shards, c2vPath, FLAGS.embedding_size)
    self.words = []
    with tf.device("/cpu:0"):
        for i in range(0, num_shards):
            words_i = tf.get_variable(
                name="words-%02d" % i,
                initializer=tf.random_uniform(self.c2v[i].shape,
                                              minval=-0.1,
                                              maxval=0.1),
                trainable=False)
            self.words.append(words_i)
    layers = [
        {'dilation': 1},
        {'dilation': 1},
        {'dilation': 2},
    ]
    if FLAGS.use_idcnn:
        self.model = IdCNN(layers, 3, FLAGS.num_hidden, FLAGS.embedding_size,
                           FLAGS.max_sentence_len, FLAGS.num_tags)
    else:
        self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                            FLAGS.num_tags)
    self.trains_params = None
    self.inp = tf.placeholder(tf.int32,
                              shape=[None, FLAGS.max_sentence_len],
                              name="input_placeholder")
def run(dataDir, fold=5):
    f = open('./config.yml', encoding='utf-8', errors='ignore')
    config = yaml.safe_load(f)
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config['DEVICE'] = DEVICE
    batchSize = config['model']['batchSize']

    optParser = OptionParser()
    optParser.add_option('-m', '--model', action='store', type='string',
                         dest='modelName')
    option, args = optParser.parse_args()
    modelName = config['modelName'] = option.modelName

    # file for the final results
    f = open(os.path.join(dataDir, modelName, 'result.txt'), 'w',
             encoding='utf-8', errors='ignore')

    # test data
    testDataPath = config['data']['testDataPath']
    testDataset = NERTestDataset(testDataPath, config)
    testIter = data.DataLoader(dataset=testDataset, batch_size=batchSize,
                               shuffle=False, num_workers=4,
                               collate_fn=testPad)

    for i in range(fold):
        print('-------------------- fold %d --------------------\n' % (i + 1))

        # validation data
        validDataset = NERDataset(os.path.join(dataDir, str(i) + '.txt'),
                                  config)
        validIter = data.DataLoader(dataset=validDataset,
                                    batch_size=batchSize, shuffle=False,
                                    num_workers=4, collate_fn=pad)

        # training data
        trainPathArr = [
            os.path.join(dataDir, str(j) + '.txt') for j in range(fold)
            if j != i
        ]
        assert len(trainPathArr) == fold - 1
        trainDataset = NERDataset(trainPathArr, config)
        trainIter = data.DataLoader(dataset=trainDataset,
                                    batch_size=batchSize, shuffle=True,
                                    num_workers=4, collate_fn=pad)

        # build the network
        if modelName == 'bilstm':
            net = BiLSTM(config)
        if modelName == 'idcnn':
            net = IDCNN(config)
        if modelName == 'bilstm_attn':
            net = BiLSTM_ATTN(config)
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
        net = net.to(DEVICE)

        config['submitPath'] = os.path.join(dataDir, modelName,
                                            str(i) + '.csv')
        config['modelSavePath'] = os.path.join(dataDir, modelName,
                                               str(i) + '.pkl')
        trainLoss, validLoss, f1Score, accurate, recall = train(
            net, trainIter, validIter, config)

        # actual performance on the validation split
        modelSavePath = config['modelSavePath']
        if os.path.exists(modelSavePath):
            net.load_state_dict(torch.load(modelSavePath))

        # missing-entity ratio before and after filtering out entities
        # that already appear in the training set
        disappear1, disappear2 = test(net, testIter, config)

        f.write('fold %d\n' % (i + 1))
        f.write('trainLoss: %f\n' % trainLoss)
        f.write('validLoss: %f\n' % validLoss)
        f.write('f1Score %f, accurate %f, recall %f\n' %
                (f1Score, accurate, recall))
        f.write('missing-entity ratio on the test set: %f %f\n' %
                (disappear1, disappear2))
        f.write('\n')
    f.close()
print("Embeddings written of the best model") result_list = list() result_list_acc = list() count = 0 try: for Ques_train, Ques_test in kf.split(Dataset): count += 1 print("Fold ", count) if count == 2: #writeContentEmbeddings(model) break # define model model = BiLSTM(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=len(text_field.vocab), label_size=len(label_field.vocab)-1,\ use_gpu=USE_GPU, batch_size=BATCH_SIZE) if USE_GPU: model = model.to(DEVICE) model.embeddings.weight.data.copy_( torch.from_numpy(pretrained_embeddings)) optimizer = optim.Adam(model.parameters(), lr=1e-3) trainset = torch.utils.data.TensorDataset( torch.LongTensor(Dataset[Ques_train]), torch.torch.LongTensor(Dataset[Ques_train])) testset = torch.utils.data.TensorDataset( torch.LongTensor(Dataset[Ques_test]), torch.torch.LongTensor(Dataset[Ques_test])) train_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=64)
def __init__(self, model, options, vocab, nnvecs=1):
    self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab
    self.model = model
    self.nnvecs = nnvecs

    # Load ELMo if the option is set
    if options.elmo is not None:
        from elmo import ELMo
        self.elmo = ELMo(options.elmo, options.elmo_gamma,
                         options.elmo_learn_gamma)
        self.elmo.init_weights(model)
    else:
        self.elmo = None

    extra_words = 2  # MLP padding vector and OOV vector
    self.words = {word: ind for ind, word in enumerate(words, extra_words)}
    self.word_lookup = self.model.add_lookup_parameters(
        (len(self.words) + extra_words, options.word_emb_size))

    extra_pos = 2  # MLP padding vector and OOV vector
    self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
    self.pos_lookup = self.model.add_lookup_parameters(
        (len(cpos) + extra_pos, options.pos_emb_size))

    self.irels = rels
    self.rels = {rel: ind for ind, rel in enumerate(rels)}

    extra_chars = 1  # OOV vector
    self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
    self.char_lookup = self.model.add_lookup_parameters(
        (len(chars) + extra_chars, options.char_emb_size))

    extra_treebanks = 1  # Padding vector
    self.treebanks = {
        treebank: ind
        for ind, treebank in enumerate(treebanks, extra_treebanks)
    }
    self.treebank_lookup = self.model.add_lookup_parameters(
        (len(treebanks) + extra_treebanks, options.tbank_emb_size))

    # initialise word vectors with external embeddings where they exist
    # This part got ugly - TODO: refactor
    if not options.predict:
        self.external_embedding = defaultdict(lambda: {})
        if options.ext_word_emb_file and options.word_emb_size > 0:
            # Load pre-trained word embeddings
            for lang in langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_word_emb_file,
                    lang=lang,
                    words=self.words.viewkeys())
                self.external_embedding["words"].update(embeddings)
        if options.ext_char_emb_file and options.char_emb_size > 0:
            # Load pre-trained character embeddings
            for lang in langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_char_emb_file,
                    lang=lang,
                    words=self.chars,
                    chars=True)
                self.external_embedding["chars"].update(embeddings)
        if options.ext_emb_dir:
            # For every language, load the data for the word and character
            # embeddings from a directory.
            for lang in langs:
                if options.word_emb_size > 0:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_dir=options.ext_emb_dir,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)
                if options.char_emb_size > 0:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_dir=options.ext_emb_dir,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)
        self.init_lookups(options)

    elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
    self.lstm_input_size = (
        options.word_emb_size + elmo_emb_size + options.pos_emb_size +
        options.tbank_emb_size +
        2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
    )
    print("Word-level LSTM input size: " + str(self.lstm_input_size))

    self.bilstms = []
    if options.no_bilstms > 0:
        self.bilstms.append(
            BiLSTM(self.lstm_input_size, options.lstm_output_size,
                   self.model, dropout_rate=0.33))
        for i in range(1, options.no_bilstms):
            self.bilstms.append(
                BiLSTM(2 * options.lstm_output_size,
                       options.lstm_output_size, self.model,
                       dropout_rate=0.33))
        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (options.lstm_output_size * 2, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (options.lstm_output_size * 2))
    else:
        self.word2lstm = self.model.add_parameters(
            (self.lstm_input_size, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_input_size))

    self.char_bilstm = BiLSTM(options.char_emb_size,
                              options.char_lstm_output_size, self.model,
                              dropout_rate=0.33)
    self.charPadding = self.model.add_parameters(
        (options.char_lstm_output_size * 2))
def __init__(self, model, options, vocab, nnvecs):
    self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab
    self.model = model
    self.nnvecs = nnvecs

    extra_words = 2  # MLP padding vector and OOV vector
    self.words = {word: ind for ind, word in enumerate(words, extra_words)}
    self.word_lookup = self.model.add_lookup_parameters(
        (len(self.words) + extra_words, options.word_emb_size))

    extra_pos = 2  # MLP padding vector and OOV vector
    self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
    self.pos_lookup = self.model.add_lookup_parameters(
        (len(cpos) + extra_pos, options.pos_emb_size))

    extra_chars = 1  # OOV vector
    self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
    self.char_lookup = self.model.add_lookup_parameters(
        (len(chars) + extra_chars, options.char_emb_size))

    extra_treebanks = 1  # Padding vector
    self.treebanks = {
        treebank: ind
        for ind, treebank in enumerate(treebanks, extra_treebanks)
    }
    self.treebank_lookup = self.model.add_lookup_parameters(
        (len(treebanks) + extra_treebanks, options.tbank_emb_size))

    # initialise word vectors with external embeddings where they exist
    if (options.ext_emb_dir or options.ext_emb_file) and not options.predict:
        self.external_embedding = defaultdict(lambda: {})
        for lang in langs:
            if options.word_emb_size > 0:
                self.external_embedding["words"].update(
                    utils.get_external_embeddings(options, lang,
                                                  self.words.viewkeys()))
            if options.char_emb_size > 0:
                self.external_embedding["chars"].update(
                    utils.get_external_embeddings(options, lang, self.chars,
                                                  chars=True))
        self.init_lookups(options)

    self.lstm_input_size = options.word_emb_size + options.pos_emb_size + \
        options.tbank_emb_size + \
        2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
    print("Word-level LSTM input size: " + str(self.lstm_input_size))

    self.bilstms = []
    if options.no_bilstms > 0:
        self.bilstms.append(
            BiLSTM(self.lstm_input_size, options.lstm_output_size,
                   self.model, dropout_rate=0.33))
        for i in range(1, options.no_bilstms):
            self.bilstms.append(
                BiLSTM(2 * options.lstm_output_size,
                       options.lstm_output_size, self.model,
                       dropout_rate=0.33))
        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (options.lstm_output_size * 2, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (options.lstm_output_size * 2))
    else:
        self.word2lstm = self.model.add_parameters(
            (self.lstm_input_size, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_input_size))

    self.char_bilstm = BiLSTM(options.char_emb_size,
                              options.char_lstm_output_size, self.model,
                              dropout_rate=0.33)
    self.charPadding = self.model.add_parameters(
        (options.char_lstm_output_size * 2))
if __name__ == "__main__": # Getting the test data from pytorch dataset = SNLI(batch_size, device) out_dim = dataset.out_dim() vocab_size = dataset.vocabulary_size() # from google.colab import drive # drive.mount('/content/gdrive') # Testing the LSTM model # Loading the model using the parameters needed filename = "Models/LSTM/" + '{0}_{1}_{2}_{3}_{4}_{5}_{6}_bidirect.pt'.format(batch_size, embedding_dim, dropout_ratio, hidden_dim, epochs, opt_name, lr) model = BiLSTM(vocab_size, embedding_dim, dropout_ratio, hidden_dim, out_dim, bidirect) model.to(device) model.load_state_dict(torch.load(filename, map_location=torch.device('cpu'))) test_loss, test_accuracy, gt, pred = test(model, dataset) # print("Test loss = {}, Test accuracy = {}".format(test_loss, test_accuracy)) # Writing the output from LSTM onto a text file labels = ['entailment', 'contradiction', 'neutral'] with open("LSTM.txt", 'w') as f: f.write("Loss on Test Data : {}\n".format(test_loss)) f.write("Accuracy on Test Data : {}\n".format(test_accuracy)) f.write("gt_label,pred_label \n") for idx in range(len(gt)): f.write("{},{}\n".format(labels[gt[idx]], labels[pred[idx]]))
        optimizer.step()
        if ktr_in % 10 == 0:
            losses.append(loss)
        ktr_in += 10

xs = np.array(data[390:450], dtype=float)
xs = torch.FloatTensor(xs)
xs = Variable(xs).cuda()
ys = rnn(xs)
yc = torch.FloatTensor(np.array(labels[390:450], dtype=float))
yc = Variable(yc).cuda()
f.write('{}\n'.format(criterion(ys, yc).item()))

f.write("Type: BiLSTM\n")
for ktr in range(3):
    f.write("\nhidden size: {}, num_layers: {}, error: ".format(
        n_shape[0], n_shape[1]))
    rnn = BiLSTM(input_size, n_shape[0], n_shape[1], 1)
    rnn.cuda()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    losses = []
    ktr_in = 0
    for epoch in range(num_epochs):
        for img, lb in train_set:
            img = np.array([img], dtype=float)
            img = torch.FloatTensor(img)
            img = Variable(img).cuda()
            # Forward + Backward + Optimize
            optimizer.zero_grad()
            output = rnn(img)
            crt = torch.FloatTensor(np.array([lb], dtype=float))
test_set = data_set[450:]
for v in np.random.randint(0, 400, 25):
    valid_set.append(data_set[v])
for t in np.random.randint(0, 450, 30):
    test_set.append(data_set[t])

sequence_length = 401
input_size = 3
hidden_size = 16
num_layers = 1
batch_size = 1
num_epochs = 2
learning_rate = 0.0001

for ktr in range(1):
    rnn = BiLSTM(input_size, hidden_size, num_layers, 1)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    losses = []
    ktr_in = 0
    for epoch in range(num_epochs):
        for img, lb in train_set:
            img = np.array([img], dtype=float)
            img = torch.FloatTensor(img)
            img = Variable(img)  # .cuda()
            # Forward + Backward + Optimize
            optimizer.zero_grad()
            output = rnn(img)
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']

    modelName = config['modelName']
    batchSize = config['model']['batchSize']
    epochNum = config['model']['epochNum']
    earlyStop = config['model']['earlyStop']
    learningRate = config['model']['learningRate']
    modelSavePath = config['model']['modelSavePath']

    # GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset, batch_size=batchSize,
                                shuffle=True, num_workers=4, collate_fn=pad)
    validIter = data.DataLoader(dataset=validDataset, batch_size=batchSize,
                                shuffle=False, num_workers=4, collate_fn=pad)
    testIter = data.DataLoader(dataset=testDataset, batch_size=batchSize,
                               shuffle=False, num_workers=4, collate_fn=pad)

    if modelName == 'bilstm':
        net = BiLSTM(config)
        train = bilstmTrain
        eval = bilstmEval
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    if modelName == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        train = bilstmCRFTrain
        eval = bilstmCRFEval
    if modelName == 'transformer_crf':
        net = Transformer_CRF(config)
        train = transformerCRFTrain
        eval = transformerCRFEval
    if modelName == 'cnn':
        net = CNN(config)
        train = cnnTrain
        eval = cnnEval

    net = net.to(DEVICE)

    lossFunction = nn.NLLLoss()
    optimizer = optim.Adam(net.parameters(), lr=learningRate,
                           betas=(0.9, 0.999), eps=1e-08)

    earlyNumber, beforeLoss, maxScore = 0, sys.maxsize, -1

    # start training
    for epoch in range(epochNum):
        print('epoch %d:' % epoch)
        totalLoss = train(net, trainIter, optimizer=optimizer,
                          criterion=lossFunction, DEVICE=DEVICE)
        print('training loss: %f' % totalLoss)

        totalLoss, f1Score = eval(net, validIter, criterion=lossFunction,
                                  DEVICE=DEVICE)
        if f1Score > maxScore:
            maxScore = f1Score
            torch.save(net.state_dict(), modelSavePath)
        print('validation loss: %f  f1Score: %f / %f' %
              (totalLoss, f1Score, maxScore))

        if f1Score < maxScore:
            earlyNumber += 1
            print('earlyStop: %d/%d' % (earlyNumber, earlyStop))
        else:
            earlyNumber = 0
        if earlyNumber >= earlyStop:
            break
        print('\n')

    # load the best model
    net.load_state_dict(torch.load(modelSavePath))
    totalLoss, f1Score = eval(net, testIter, criterion=lossFunction,
                              DEVICE=DEVICE)
    print('test loss: %f, f1Score: %f' % (totalLoss, f1Score))
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']
    batchSize = config['model']['batchSize']

    # GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERTestDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset, batch_size=batchSize,
                                shuffle=True, num_workers=6, collate_fn=pad)
    validIter = data.DataLoader(dataset=validDataset, batch_size=batchSize,
                                shuffle=False, num_workers=6, collate_fn=pad)
    testIter = data.DataLoader(dataset=testDataset, batch_size=batchSize,
                               shuffle=False, num_workers=6,
                               collate_fn=testPad)

    if config['modelName'] == 'bilstm':
        net = BiLSTM(config)
        config['modelSavePath'] = config['data']['BiLSTMSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMSubmitDataPath']
        train = bilstm_train
        test = bilstm_test
    if config['modelName'] == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        config['modelSavePath'] = config['data']['BiLSTMCRFSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMCRFSubmitDataPath']
        train = bilstm_crf_train
        test = bilstm_crf_test
    if config['modelName'] == 'transformer_cnn':
        net = Transformer_CNN(config)
        config['modelSavePath'] = config['data']['TransformerCNNSavePath']
        config['submitDataPath'] = config['data']['TransformerCNNSubmitDataPath']
        modelSavePath = config['modelSavePath']
        train = transformer_cnn_train
        test = transformer_cnn_test

    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)
    net = net.to(DEVICE)

    if os.path.exists(modelSavePath):
        net.load_state_dict(torch.load(modelSavePath))

    # if config['train']:
    #     train(net, trainIter, validIter, config)
    # if config['test']:
    test(net, testIter, config)
train_data = dsets.MNIST(
    root='../LSTM/mnist',
    train=True,
    transform=torchvision.transforms.ToTensor(),
    download=DOWNLOAD_MNIST,
)
test_data = dsets.MNIST(root='../LSTM/mnist', train=False)
train_loader = Data.DataLoader(dataset=train_data,
                               batch_size=BATCH_SIZE, shuffle=True)

with torch.no_grad():
    test_x = Variable(torch.unsqueeze(test_data.data, dim=1)).type(
        torch.FloatTensor) / 255
    test_y = test_data.targets

lstm = BiLSTM(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, 10).to(device)
optimizer = torch.optim.Adam(lstm.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

for epoch in range(EPOCH):
    for step, (x, y) in enumerate(train_loader):
        b_x = Variable(x.view(-1, TIME_STEP, INPUT_SIZE).to(device))
        b_y = Variable(y.to(device))
        output = lstm(b_x)
        loss = loss_func(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
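# Several snippets above call BiLSTM(input_size, hidden_size, num_layers,
# num_classes) without showing the class itself. A minimal sketch of a module
# with that constructor signature, assuming the classifier reads the final
# time step; this is an illustration, not the original projects' definition.
import torch
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        # 2 * hidden_size because forward and backward states are concatenated
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        # x: [batch, seq_len, input_size]
        out, _ = self.lstm(x)          # out: [batch, seq_len, 2 * hidden_size]
        return self.fc(out[:, -1, :])  # classify from the final time step

# usage matching the MNIST loop above: 28 time steps of 28 features, 10 classes
model = BiLSTM(28, 128, 2, 10)
logits = model(torch.randn(4, 28, 28))  # -> [4, 10]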
test_input = input_data[1000:]
test_output = output_data[1000:]
train_lengths = lengths[:1000]
test_lengths = lengths[1000:]

###############
# Build model #
###############
num_classes = 5
data = tf.placeholder(tf.int32, [None, sequence_length])
target = tf.placeholder(tf.float32, [None, sequence_length, num_classes])
model = BiLSTM(data, target, len(vocab) + 1, embedding_size=10, lstm_size=10)

###############
# Train model #
###############
sess = tf.Session()
sess.run(tf.initialize_all_variables())

batch_size = 20
no_of_batches = len(input_data) // batch_size
epochs = 100
for epoch in range(epochs):
    if epoch % 10 == 0:
        print(sess.run(model.cost, feed_dict={
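# The BiLSTM class used above is not shown. A rough sketch of a TF1-style
# sequence tagger with the same constructor arguments (data, target,
# vocab_size, embedding_size, lstm_size); the attribute names (cost,
# prediction, optimize) are assumptions based on how the snippet uses it.
import tensorflow as tf  # TF 1.x

class BiLSTM(object):
    def __init__(self, data, target, vocab_size, embedding_size, lstm_size):
        # data: [batch, seq_len] int32 token ids
        # target: [batch, seq_len, num_classes] one-hot labels
        embeddings = tf.get_variable("embeddings",
                                     [vocab_size, embedding_size])
        inputs = tf.nn.embedding_lookup(embeddings, data)
        fw = tf.nn.rnn_cell.LSTMCell(lstm_size)
        bw = tf.nn.rnn_cell.LSTMCell(lstm_size)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(fw, bw, inputs,
                                                     dtype=tf.float32)
        hidden = tf.concat(outputs, axis=2)  # [batch, seq_len, 2 * lstm_size]
        num_classes = target.get_shape().as_list()[-1]
        logits = tf.layers.dense(hidden, num_classes)
        self.prediction = tf.nn.softmax(logits)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=target,
                                                       logits=logits))
        self.optimize = tf.train.AdamOptimizer().minimize(self.cost)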
def build_model(self):
    self.bilstm = BiLSTM(num_classes, word_embedding_size,
                         elmo_embedding_size, batch_size, epochs,
                         init_learning_rate, decay_rate, decay_steps)
    self.bilstm_parent = BiLSTM(num_classes, word_embedding_size,
                                elmo_embedding_size, batch_size, epochs,
                                init_learning_rate, decay_rate, decay_steps)

    with tf.variable_scope('softmax-1', reuse=tf.AUTO_REUSE):
        softmax_w = tf.get_variable(
            'W',
            shape=[2 * feed_forward_op_size, intermediate_layer_size_1],
            initializer=tf.truncated_normal_initializer(),
            dtype=tf.float32)
        softmax_b = tf.get_variable(
            'b',
            initializer=tf.constant_initializer(0.0),
            shape=[intermediate_layer_size_1],
            dtype=tf.float32)
        self.final_state = tf.concat(
            [self.bilstm.final_state, self.bilstm_parent.final_state], 1)
        self.logit = tf.matmul(self.final_state, softmax_w) + softmax_b
        self.logit = tf.nn.relu(self.logit)

    with tf.variable_scope('softmax-2', reuse=tf.AUTO_REUSE):
        softmax_w = tf.get_variable(
            'W',
            shape=[intermediate_layer_size_1, intermediate_layer_size_2],
            initializer=tf.truncated_normal_initializer(),
            dtype=tf.float32)
        softmax_b = tf.get_variable(
            'b',
            initializer=tf.constant_initializer(0.0),
            shape=[intermediate_layer_size_2],
            dtype=tf.float32)
        self.logit = tf.matmul(self.logit, softmax_w) + softmax_b
        self.logit = tf.nn.relu(self.logit)

    with tf.variable_scope('softmax-3', reuse=tf.AUTO_REUSE):
        softmax_w = tf.get_variable(
            'W',
            shape=[intermediate_layer_size_2, intermediate_layer_size_3],
            initializer=tf.truncated_normal_initializer(),
            dtype=tf.float32)
        softmax_b = tf.get_variable(
            'b',
            initializer=tf.constant_initializer(0.0),
            shape=[intermediate_layer_size_3],
            dtype=tf.float32)
        self.logit = tf.matmul(self.logit, softmax_w) + softmax_b
        self.logit = tf.nn.relu(self.logit)

    with tf.variable_scope('softmax-4', reuse=tf.AUTO_REUSE):
        softmax_w = tf.get_variable(
            'W',
            shape=[intermediate_layer_size_3, num_classes],
            initializer=tf.truncated_normal_initializer(),
            dtype=tf.float32)
        softmax_b = tf.get_variable(
            'b',
            initializer=tf.constant_initializer(0.0),
            shape=[num_classes],
            dtype=tf.float32)
        self.logit = tf.matmul(self.logit, softmax_w) + softmax_b

    self.norm_logit = tf.nn.softmax(self.logit)
    self.predictions = tf.cast(tf.math.argmax(self.norm_logit, axis=1),
                               tf.int64)
    self.accuracy = tf.reduce_mean(
        tf.cast(tf.equal(self.predictions, self.bilstm.y), tf.float32))
def main(argv):
    print("CUDA_VISIBLE_DEVICES=", os.environ['CUDA_VISIBLE_DEVICES'])

    train_dir = FLAGS.train_dir
    dev_dir = FLAGS.dev_dir
    maps_dir = FLAGS.maps_dir

    if train_dir == '':
        print('Must supply input data directory generated from tsv_to_tfrecords.py')
        sys.exit(1)

    # Doesn't work in newer versions of tf. TODO: fix
    # print('\n'.join(sorted(["%s : %s" % (str(k), str(v)) for k, v in FLAGS.__dict__['__flags'].items()])))

    with open(maps_dir + '/label.txt', 'r') as f:
        labels_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        labels_id_str_map = {i: s for s, i in labels_str_id_map.items()}
        labels_size = len(labels_id_str_map)
    with open(maps_dir + '/token.txt', 'r') as f:
        vocab_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        vocab_id_str_map = {i: s for s, i in vocab_str_id_map.items()}
        vocab_size = len(vocab_id_str_map)
    with open(maps_dir + '/shape.txt', 'r') as f:
        shape_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        shape_id_str_map = {i: s for s, i in shape_str_id_map.items()}
        shape_domain_size = len(shape_id_str_map)
    with open(maps_dir + '/char.txt', 'r') as f:
        char_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        char_id_str_map = {i: s for s, i in char_str_id_map.items()}
        char_domain_size = len(char_id_str_map)

    # with open(maps_dir + '/sizes.txt', 'r') as f:
    #     num_train_examples = int(f.readline()[:-1])

    print("num classes: %d" % labels_size)

    size_files = [
        maps_dir + "/" + fname for fname in listdir(maps_dir)
        if fname.find("sizes") != -1
    ]
    num_train_examples = 0
    num_tokens = 0
    for size_file in size_files:
        print(size_file)
        with open(size_file, 'r') as f:
            num_train_examples += int(f.readline()[:-1])
            num_tokens += int(f.readline()[:-1])
    print("num train examples: %d" % num_train_examples)
    print("num train tokens: %d" % num_tokens)

    dev_top_dir = '/'.join(dev_dir.split("/")[:-2]) \
        if dev_dir.find("*") != -1 else dev_dir
    print(dev_top_dir)
    dev_size_files = [
        dev_top_dir + "/" + fname for fname in listdir(dev_top_dir)
        if fname.find("sizes") != -1
    ]
    num_dev_examples = 0
    num_dev_tokens = 0
    for size_file in dev_size_files:
        print(size_file)
        with open(size_file, 'r') as f:
            num_dev_examples += int(f.readline()[:-1])
            num_dev_tokens += int(f.readline()[:-1])
    print("num dev examples: %d" % num_dev_examples)
    print("num dev tokens: %d" % num_dev_tokens)

    # with open(dev_dir + '/sizes.txt', 'r') as f:
    #     num_dev_examples = int(f.readline()[:-1])

    type_set = {}
    type_int_int_map = {}
    outside_set = ["O", "<PAD>", "<S>", "</S>", "<ZERO>"]
    for label, id in labels_str_id_map.items():
        label_type = label if label in outside_set else label[2:]
        if label_type not in type_set:
            type_set[label_type] = len(type_set)
        type_int_int_map[id] = type_set[label_type]
    print(type_set)

    # load embeddings, if given; initialize in range [-.01, .01]
    embeddings_shape = (vocab_size - 1, FLAGS.embed_dim)
    embeddings = tf_utils.embedding_values(embeddings_shape, old=False)
    embeddings_used = 0
    if FLAGS.embeddings != '':
        with open(FLAGS.embeddings, 'r') as f:
            for line in f.readlines():
                split_line = line.strip().split(" ")
                word = split_line[0]
                embedding = split_line[1:]
                if word in vocab_str_id_map:
                    embeddings_used += 1
                    # shift by -1 because we are going to add a 0 constant
                    # vector for the padding later
                    embeddings[vocab_str_id_map[word] - 1] = map(float, embedding)
                elif word.lower() in vocab_str_id_map:
                    embeddings_used += 1
                    embeddings[vocab_str_id_map[word.lower()] - 1] = map(float, embedding)
    print("Loaded %d/%d embeddings (%2.2f%% coverage)" %
          (embeddings_used, vocab_size, embeddings_used / vocab_size * 100))

    layers_map = sorted(json.loads(FLAGS.layers.replace("'", '"')).items()) \
        if FLAGS.model == 'cnn' else None

    pad_width = int(layers_map[0][1]['width'] / 2) if layers_map is not None else 1

    with tf.Graph().as_default():
        train_batcher = Batcher(train_dir, FLAGS.batch_size) \
            if FLAGS.memmap_train else SeqBatcher(train_dir, FLAGS.batch_size)

        dev_batch_size = FLAGS.batch_size  # num_dev_examples
        dev_batcher = SeqBatcher(dev_dir, dev_batch_size, num_buckets=0,
                                 num_epochs=1)
        if FLAGS.ontonotes:
            domain_dev_batchers = {
                domain: SeqBatcher(dev_dir.replace('*', domain),
                                   dev_batch_size, num_buckets=0,
                                   num_epochs=1)
                for domain in ['bc', 'nw', 'bn', 'wb', 'mz', 'tc']
            }

        train_eval_batch_size = FLAGS.batch_size
        train_eval_batcher = SeqBatcher(train_dir, train_eval_batch_size,
                                        num_buckets=0, num_epochs=1)

        char_embedding_model = \
            BiLSTMChar(char_domain_size, FLAGS.char_dim, int(FLAGS.char_tok_dim / 2)) \
            if FLAGS.char_dim > 0 and FLAGS.char_model == "lstm" else \
            (CNNChar(char_domain_size, FLAGS.char_dim, FLAGS.char_tok_dim,
                     layers_map[0][1]['width'])
             if FLAGS.char_dim > 0 and FLAGS.char_model == "cnn" else None)
        char_embeddings = char_embedding_model.outputs \
            if char_embedding_model is not None else None

        if FLAGS.model == 'cnn':
            model = CNN(num_classes=labels_size,
                        vocab_size=vocab_size,
                        shape_domain_size=shape_domain_size,
                        char_domain_size=char_domain_size,
                        char_size=FLAGS.char_tok_dim,
                        embedding_size=FLAGS.embed_dim,
                        shape_size=FLAGS.shape_dim,
                        nonlinearity=FLAGS.nonlinearity,
                        layers_map=layers_map,
                        viterbi=FLAGS.viterbi,
                        projection=FLAGS.projection,
                        loss=FLAGS.loss,
                        margin=FLAGS.margin,
                        repeats=FLAGS.block_repeats,
                        share_repeats=FLAGS.share_repeats,
                        char_embeddings=char_embeddings,
                        embeddings=embeddings)
        elif FLAGS.model == "bilstm":
            model = BiLSTM(num_classes=labels_size,
                           vocab_size=vocab_size,
                           shape_domain_size=shape_domain_size,
                           char_domain_size=char_domain_size,
                           char_size=FLAGS.char_dim,
                           embedding_size=FLAGS.embed_dim,
                           shape_size=FLAGS.shape_dim,
                           nonlinearity=FLAGS.nonlinearity,
                           viterbi=FLAGS.viterbi,
                           hidden_dim=FLAGS.lstm_dim,
                           char_embeddings=char_embeddings,
                           embeddings=embeddings)
        else:
            print(FLAGS.model + ' is not a valid model type')
            sys.exit(1)

        # Define Training procedure
        global_step = tf.Variable(0, name='global_step', trainable=False)

        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr,
                                           beta1=FLAGS.beta1,
                                           beta2=FLAGS.beta2,
                                           epsilon=FLAGS.epsilon,
                                           name="optimizer")

        model_vars = tf.global_variables()

        print("model vars: %d" % len(model_vars))
        print(map(lambda v: v.name, model_vars))

        # todo put in func
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print("Total trainable parameters: %d" % (total_parameters))

        if FLAGS.clip_norm > 0:
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(model.loss, model_vars), FLAGS.clip_norm)
            train_op = optimizer.apply_gradients(zip(grads, model_vars),
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(model.loss,
                                          global_step=global_step,
                                          var_list=model_vars)

        tf.global_variables_initializer()

        opt_vars = [
            optimizer.get_slot(s, n) for n in optimizer.get_slot_names()
            for s in model_vars if optimizer.get_slot(s, n) is not None
        ]
        model_vars += opt_vars

        if FLAGS.load_dir:
            reader = tf.train.NewCheckpointReader(FLAGS.load_dir + ".tf")
            saved_var_map = reader.get_variable_to_shape_map()
            intersect_vars = [
                k for k in tf.global_variables()
                if k.name.split(':')[0] in saved_var_map
                and k.get_shape() == saved_var_map[k.name.split(':')[0]]
            ]
            leftovers = [
                k for k in tf.global_variables()
                if k.name.split(':')[0] not in saved_var_map
                or k.get_shape() != saved_var_map[k.name.split(':')[0]]
            ]
            print("WARNING: Loading pretrained model, but not loading: ",
                  map(lambda v: v.name, leftovers))
            loader = tf.train.Saver(var_list=intersect_vars)
        else:
            loader = tf.train.Saver(var_list=model_vars)

        saver = tf.train.Saver(var_list=model_vars)

        sv = tf.train.Supervisor(
            logdir=FLAGS.model_dir if FLAGS.model_dir != '' else None,
            global_step=global_step,
            saver=None,
            save_model_secs=0,
            save_summaries_secs=0)

        training_start_time = time.time()
        with sv.managed_session(
                FLAGS.master,
                config=tf.ConfigProto(allow_soft_placement=True)) as sess:

            def run_evaluation(eval_batches, extra_text=""):
                predictions = []
                for b, (eval_label_batch, eval_token_batch, eval_shape_batch,
                        eval_char_batch, eval_seq_len_batch,
                        eval_tok_len_batch,
                        eval_mask_batch) in enumerate(eval_batches):
                    batch_size, batch_seq_len = eval_token_batch.shape

                    char_lens = np.sum(eval_tok_len_batch, axis=1)
                    max_char_len = np.max(eval_tok_len_batch)
                    eval_padded_char_batch = np.zeros(
                        (batch_size, max_char_len * batch_seq_len))
                    for b in range(batch_size):
                        char_indices = [
                            item for sublist in [
                                range(i * max_char_len, i * max_char_len + d)
                                for i, d in enumerate(eval_tok_len_batch[b])
                            ] for item in sublist
                        ]
                        eval_padded_char_batch[b, char_indices] = \
                            eval_char_batch[b][:char_lens[b]]

                    char_embedding_feeds = {} if FLAGS.char_dim == 0 else {
                        char_embedding_model.input_chars: eval_padded_char_batch,
                        char_embedding_model.batch_size: batch_size,
                        char_embedding_model.max_seq_len: batch_seq_len,
                        char_embedding_model.token_lengths: eval_tok_len_batch,
                        char_embedding_model.max_tok_len: max_char_len
                    }

                    basic_feeds = {
                        model.input_x1: eval_token_batch,
                        model.input_x2: eval_shape_batch,
                        model.input_y: eval_label_batch,
                        model.input_mask: eval_mask_batch,
                        model.max_seq_len: batch_seq_len,
                        model.batch_size: batch_size,
                        model.sequence_lengths: eval_seq_len_batch
                    }
                    basic_feeds.update(char_embedding_feeds)
                    total_feeds = basic_feeds.copy()

                    if FLAGS.viterbi:
                        preds, transition_params = sess.run(
                            [model.predictions, model.transition_params],
                            feed_dict=total_feeds)
                        viterbi_repad = np.empty((batch_size, batch_seq_len))
                        for batch_idx, (unary_scores, sequence_lens) in \
                                enumerate(zip(preds, eval_seq_len_batch)):
                            viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
                                unary_scores, transition_params)
                            viterbi_repad[batch_idx] = viterbi_sequence
                        predictions.append(viterbi_repad)
                    else:
                        preds, scores = sess.run(
                            [model.predictions, model.unflat_scores],
                            feed_dict=total_feeds)
                        predictions.append(preds)

                if FLAGS.print_preds != '':
                    evaluation.print_conlleval_format(
                        FLAGS.print_preds, eval_batches, predictions,
                        labels_id_str_map, vocab_id_str_map, pad_width)

                # print evaluation
                f1_micro, precision = evaluation.segment_eval(
                    eval_batches, predictions, type_set, type_int_int_map,
                    labels_id_str_map, vocab_id_str_map,
                    outside_idx=map(
                        lambda t: type_set[t] if t in type_set else type_set["O"],
                        outside_set),
                    pad_width=pad_width,
                    start_end=FLAGS.start_end,
                    extra_text="Segment evaluation %s:" % extra_text)
                return f1_micro, precision

            threads = tf.train.start_queue_runners(sess=sess)
            log_every = int(max(100, num_train_examples / 5))

            if FLAGS.load_dir != '':
                print("Deserializing model: " + FLAGS.load_dir + ".tf")
                loader.restore(sess, FLAGS.load_dir + ".tf")

            def get_dev_batches(seq_batcher):
                batches = []
                # load all the dev batches into memory
                done = False
                while not done:
                    try:
                        dev_batch = sess.run(seq_batcher.next_batch_op)
                        dev_label_batch, dev_token_batch, dev_shape_batch, \
                            dev_char_batch, dev_seq_len_batch, \
                            dev_tok_len_batch = dev_batch
                        mask_batch = np.zeros(dev_token_batch.shape)
                        actual_seq_lens = np.add(
                            np.sum(dev_seq_len_batch, axis=1),
                            (2 if FLAGS.start_end else 1) * pad_width *
                            ((dev_seq_len_batch != 0).sum(axis=1) +
                             (0 if FLAGS.start_end else 1)))
                        for i, seq_len in enumerate(actual_seq_lens):
                            mask_batch[i, :seq_len] = 1
                        batches.append(
                            (dev_label_batch, dev_token_batch,
                             dev_shape_batch, dev_char_batch,
                             dev_seq_len_batch, dev_tok_len_batch,
                             mask_batch))
                    except:
                        done = True
                return batches

            dev_batches = get_dev_batches(dev_batcher)
            if FLAGS.ontonotes:
                domain_batches = {
                    domain: get_dev_batches(domain_batcher)
                    for domain, domain_batcher in domain_dev_batchers.iteritems()
                }

            train_batches = []
            if FLAGS.train_eval:
                # load all the train batches into memory
                done = False
                while not done:
                    try:
                        train_batch = sess.run(train_eval_batcher.next_batch_op)
                        train_label_batch, train_token_batch, \
                            train_shape_batch, train_char_batch, \
                            train_seq_len_batch, train_tok_len_batch = train_batch
                        mask_batch = np.zeros(train_token_batch.shape)
                        actual_seq_lens = np.add(
                            np.sum(train_seq_len_batch, axis=1),
                            (2 if FLAGS.start_end else 1) * pad_width *
                            ((train_seq_len_batch != 0).sum(axis=1) +
                             (0 if FLAGS.start_end else 1)))
                        for i, seq_len in enumerate(actual_seq_lens):
                            mask_batch[i, :seq_len] = 1
                        train_batches.append(
                            (train_label_batch, train_token_batch,
                             train_shape_batch, train_char_batch,
                             train_seq_len_batch, train_tok_len_batch,
                             mask_batch))
                    except Exception as e:
                        done = True
            if FLAGS.memmap_train:
                train_batcher.load_and_bucket_data(sess)

            def train(max_epochs, best_score, model_hidden_drop,
                      model_input_drop, until_convergence, max_lower=6,
                      min_iters=20):
                print("Training on %d sentences (%d examples)" %
                      (num_train_examples, num_train_examples))
                start_time = time.time()
                train_batcher._step = 1.0
                converged = False
                examples = 0
                log_every_running = log_every
                epoch_loss = 0.0
                num_lower = 0
                training_iteration = 0
                speed_num = 0.0
                speed_denom = 0.0
                while not sv.should_stop() and \
                        training_iteration < max_epochs and \
                        not (until_convergence and converged):
                    # evaluate
                    if examples >= num_train_examples:
                        training_iteration += 1

                        if FLAGS.train_eval:
                            run_evaluation(
                                train_batches,
                                "TRAIN (iteration %d)" % training_iteration)
                        print()
                        f1_micro, precision = run_evaluation(
                            dev_batches,
                            "TEST (iteration %d)" % training_iteration)
                        print("Avg training speed: %f examples/second" %
                              (speed_num / speed_denom))

                        # keep track of running best / convergence heuristic
                        if f1_micro > best_score:
                            best_score = f1_micro
                            num_lower = 0
                            if FLAGS.model_dir != '' and \
                                    best_score > FLAGS.save_min:
                                save_path = saver.save(
                                    sess, FLAGS.model_dir + ".tf")
                                print("Serialized model: %s" % save_path)
                        else:
                            num_lower += 1
                        if num_lower > max_lower and \
                                training_iteration > min_iters:
                            converged = True

                        # update per-epoch variables
                        log_every_running = log_every
                        examples = 0
                        epoch_loss = 0.0
                        start_time = time.time()

                    if examples > log_every_running:
                        speed_denom += time.time() - start_time
                        speed_num += examples
                        evaluation.print_training_error(
                            examples, start_time, [epoch_loss],
                            train_batcher._step)
                        log_every_running += log_every

                    # Training iteration
                    label_batch, token_batch, shape_batch, char_batch, \
                        seq_len_batch, tok_lengths_batch = \
                        train_batcher.next_batch() if FLAGS.memmap_train \
                        else sess.run(train_batcher.next_batch_op)

                    # make mask out of seq lens
                    batch_size, batch_seq_len = token_batch.shape

                    char_lens = np.sum(tok_lengths_batch, axis=1)
                    max_char_len = np.max(tok_lengths_batch)
                    padded_char_batch = np.zeros(
                        (batch_size, max_char_len * batch_seq_len))
                    for b in range(batch_size):
                        char_indices = [
                            item for sublist in [
                                range(i * max_char_len, i * max_char_len + d)
                                for i, d in enumerate(tok_lengths_batch[b])
                            ] for item in sublist
                        ]
                        padded_char_batch[b, char_indices] = \
                            char_batch[b][:char_lens[b]]

                    max_sentences = max(map(len, seq_len_batch))
                    new_seq_len_batch = np.zeros((batch_size, max_sentences))
                    for i, seq_len_list in enumerate(seq_len_batch):
                        new_seq_len_batch[i, :len(seq_len_list)] = seq_len_list
                    seq_len_batch = new_seq_len_batch
                    num_sentences_batch = np.sum(seq_len_batch != 0, axis=1)

                    mask_batch = np.zeros(
                        (batch_size, batch_seq_len)).astype("int")
                    actual_seq_lens = np.add(
                        np.sum(seq_len_batch, axis=1),
                        (2 if FLAGS.start_end else 1) * pad_width *
                        (num_sentences_batch +
                         (0 if FLAGS.start_end else 1))).astype('int')
                    for i, seq_len in enumerate(actual_seq_lens):
                        mask_batch[i, :seq_len] = 1
                    examples += batch_size

                    # apply word dropout
                    # create word dropout mask
                    word_probs = np.random.random(token_batch.shape)
                    drop_indices = np.where(
                        (word_probs > FLAGS.word_dropout)
                        & (token_batch != vocab_str_id_map["<PAD>"]))
                    token_batch[drop_indices[0], drop_indices[1]] = \
                        vocab_str_id_map["<OOV>"]

                    char_embedding_feeds = {} if FLAGS.char_dim == 0 else {
                        char_embedding_model.input_chars: padded_char_batch,
                        char_embedding_model.batch_size: batch_size,
                        char_embedding_model.max_seq_len: batch_seq_len,
                        char_embedding_model.token_lengths: tok_lengths_batch,
                        char_embedding_model.max_tok_len: max_char_len,
                        char_embedding_model.input_dropout_keep_prob:
                            FLAGS.char_input_dropout
                    }

                    if FLAGS.model == "cnn":
                        cnn_feeds = {
                            model.input_x1: token_batch,
                            model.input_x2: shape_batch,
                            model.input_y: label_batch,
                            model.input_mask: mask_batch,
                            model.max_seq_len: batch_seq_len,
                            model.sequence_lengths: seq_len_batch,
                            model.batch_size: batch_size,
                            model.hidden_dropout_keep_prob: model_hidden_drop,
                            model.input_dropout_keep_prob: model_input_drop,
                            model.middle_dropout_keep_prob: FLAGS.middle_dropout,
                            model.l2_penalty: FLAGS.l2,
                            model.drop_penalty: FLAGS.regularize_drop_penalty,
                        }
                        cnn_feeds.update(char_embedding_feeds)
                        _, loss = sess.run([train_op, model.loss],
                                           feed_dict=cnn_feeds)
                    elif FLAGS.model == "bilstm":
                        lstm_feed = {
                            model.input_x1: token_batch,
                            model.input_x2: shape_batch,
                            model.input_y: label_batch,
                            model.input_mask: mask_batch,
                            model.sequence_lengths: seq_len_batch,
                            model.max_seq_len: batch_seq_len,
                            model.batch_size: batch_size,
                            model.hidden_dropout_keep_prob: FLAGS.hidden_dropout,
                            model.middle_dropout_keep_prob: FLAGS.middle_dropout,
                            model.input_dropout_keep_prob: FLAGS.input_dropout,
                            model.l2_penalty: FLAGS.l2,
                            model.drop_penalty: FLAGS.regularize_drop_penalty
                        }
                        lstm_feed.update(char_embedding_feeds)
                        _, loss = sess.run([train_op, model.loss],
                                           feed_dict=lstm_feed)
                    epoch_loss += loss
                    train_batcher._step += 1
                return best_score, training_iteration, speed_num / speed_denom

            if FLAGS.evaluate_only:
                if FLAGS.train_eval:
                    run_evaluation(train_batches, "(train)")
                print()
                run_evaluation(dev_batches, "(test)")
                if FLAGS.ontonotes:
                    for domain, domain_batch_list in domain_batches.iteritems():
                        print()
                        run_evaluation(domain_batch_list,
                                       "(test - domain: %s)" % domain)
            else:
                best_score, training_iteration, train_speed = train(
                    FLAGS.max_epochs, 0.0, FLAGS.hidden_dropout,
                    FLAGS.input_dropout,
                    until_convergence=FLAGS.until_convergence)
                if FLAGS.model_dir:
                    print("Deserializing model: " + FLAGS.model_dir + ".tf")
                    saver.restore(sess, FLAGS.model_dir + ".tf")

            sv.coord.request_stop()
            sv.coord.join(threads)
            sess.close()

    total_time = time.time() - training_start_time
    if FLAGS.evaluate_only:
        print("Testing time: %d seconds" % (total_time))
    else:
        print("Training time: %d minutes, %d iterations (%3.2f minutes/iteration)" %
              (total_time / 60, training_iteration,
               total_time / (60 * training_iteration)))
        print("Avg training speed: %f examples/second" % (train_speed))
        print("Best dev F1: %2.2f" % (best_score * 100))
        acc_test, pred = sess.run([bi_lstm.accuracy, bi_lstm.label_out],
                                  feed_dict=feed_dict)
        return acc_test, pred, Y

graph = tf.Graph()
with graph.as_default():
    # session_conf = tf.ConfigProto(
    #     allow_soft_placement=allow_soft_placement,
    #     log_device_placement=log_device_placement)
    sess = tf.Session()
    with sess.as_default():
        bi_lstm = BiLSTM(num_hidden=num_hidden,
                         num_classes=num_classes,
                         voc_dim=vocsize,
                         emb_dim=embedding_dim,
                         sent_max_len=max_sent_length,
                         tag_voc_dim=tag_voc_size,
                         tags=True if POS_emb in [1, 2] else False,
                         external=pre_training,
                         update=emb_update)

        saver = tf.train.Saver(tf.all_variables())
        # saver.restore(sess, checkpoint_file)
        # print "Model restored!"

        # load model from last checkpoint
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        saver.restore(sess, checkpoint_file)
        print("Model restored!")

        # Collect the predictions here
        test_tot_acc = []
        preds_test, gold_test = [], []
def main():
    d = Dataset()
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--continue', dest='continue_path',
                        required=False)
    args = parser.parse_args()

    ## build graph
    network = BiLSTM()
    placeholders, loss, viterbi_sequence, label = network.build()
    # loss_reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    ## train config
    global_steps = tf.Variable(0, trainable=False)
    boundaries = [
        config.train_size // config.batch_size * 15,
        config.train_size // config.batch_size * 40
    ]
    values = [0.01, 0.001, 0.0005]
    lr = tf.train.piecewise_constant(global_steps, boundaries, values)
    opt = tf.train.AdamOptimizer(lr)
    # in order to update BN in every iter
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train = opt.minimize(loss)

    ## init tensorboard
    # tf.summary.scalar('loss_regularization', loss_reg)
    # tf.summary.scalar('loss_crossEntropy', loss - loss_reg)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('learning_rate', lr)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(
        os.path.join(config.log_dir, 'tf_log', 'train'),
        tf.get_default_graph())
    test_writer = tf.summary.FileWriter(
        os.path.join(config.log_dir, 'tf_log', 'validation'),
        tf.get_default_graph())

    ## create a session
    tf.set_random_seed(12345)  # ensure consistent results
    global_cnt = 0
    epoch_start = 0
    g_list = tf.global_variables()
    saver = tf.train.Saver(var_list=g_list)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())  # init all variables
        if args.continue_path:  # load a model snapshot
            ckpt = tf.train.get_checkpoint_state(args.continue_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            epoch_start = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[1])
            global_cnt = epoch_start * config.train_size // config.batch_size

        ## training
        for epoch in range(epoch_start + 1, config.nr_epoch + 1):
            for _ in range(config.train_size // config.batch_size):
                global_cnt += 1
                images, labels, seq_len = d.one_batch_train().__next__()
                feed_dict = {
                    placeholders['data']: images,
                    placeholders['label']: labels,
                    global_steps: global_cnt,
                    placeholders['is_training']: True,
                    placeholders['sequence_lengths']: seq_len
                }
                _, loss_v, lr_v, summary, v, y = sess.run(
                    [train, loss, lr, merged, viterbi_sequence, label],
                    feed_dict=feed_dict)

                if global_cnt % config.show_interval == 0:
                    precision, recall = cal(v, y)
                    train_writer.add_summary(summary, global_cnt)
                    print(
                        "e:{},{}/{}".format(
                            epoch,
                            (global_cnt % config.train_size) // config.batch_size,
                            config.train_size // config.batch_size),
                        'loss: {:.3f}'.format(loss_v),
                        'precision: {:.3f}'.format(precision),
                        'recall: {:.3f}'.format(recall))

            ## validation
            if epoch % config.test_interval == 0:
                loss_sum = 0
                for i in range(config.val_size // config.batch_size):
                    images, labels, seq_len = d.one_batch_val().__next__()
                    feed_dict = {
                        placeholders['data']: images,
                        placeholders['label']: labels,
                        global_steps: global_cnt,
                        placeholders['is_training']: False,
                        placeholders['sequence_lengths']: seq_len
                    }
                    loss_v, summary, v, y = sess.run(
                        [loss, merged, viterbi_sequence, label],
                        feed_dict=feed_dict)
                    loss_sum += loss_v
                precision, recall = cal(v, y)
                test_writer.add_summary(summary, global_cnt)
                print("\n**************Validation results****************")
                print(
                    'loss_avg: {:.3f}'.format(
                        loss_sum / (config.val_size // config.batch_size)),
                    'precision: {:.3f}'.format(precision),
                    'recall: {:.3f}'.format(recall),
                )
                print("************************************************\n")

            ## save model
            if epoch % config.snapshot_interval == 0:
                saver.save(sess,
                           os.path.join(config.log_model_dir,
                                        'epoch-{}'.format(epoch)),
                           global_step=global_cnt)

    print('Training is done, exit.')
                embeddings_used += 1
print("Loaded %d/%d embeddings (%2.2f%% coverage)" %
      (embeddings_used, vocab_size, embeddings_used / vocab_size * 100) + '\n')

if char_size > 0:
    char_embedding_model = BiLSTMChar(char_domain_size, char_size,
                                      int(char_tok_size / 2))
    char_embeddings = char_embedding_model.outputs

model = BiLSTM(num_classes_A=labels_cdr_size,
               num_classes_B=labels_bc_size,
               vocab_size=vocab_size,
               shape_domain_size=shape_domain_size,
               char_domain_size=char_domain_size,
               char_size=char_size,
               embedding_size=embedding_size,
               shape_size=shape_size,
               nonlinearity=nonlinearity,
               viterbi=viterbi,
               hidden_dim=hidden_size,
               char_embeddings=char_embeddings,
               embeddings=embeddings)

type_set_A = {}
type_int_int_map_A = {}
type_set_B = {}
type_int_int_map_B = {}
outside_set = ["O", "<PAD>", "<S>", "</S>", "<ZERO>"]
class FeatureExtractor(object):
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()}  # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1 for ind, char in enumerate(ch)}  # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs

        if langs:
            self.langs = {lang: ind + 1 for ind, lang in enumerate(langs)}  # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels

        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)

        lstm_input_size = self.word_emb_size + \
            (self.edim if self.external_embedding is not None else 0) + \
            (self.lang_emb_size if self.multiling else 0) + \
            2 * self.char_lstm_output_size

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size,
                                  self.model, dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                                  self.lstm_output_size, self.model,
                                  dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)

        self.char_bilstm = BiLSTM(self.char_emb_size,
                                  self.char_lstm_output_size, self.model,
                                  dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters(
            (len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters(
            (len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters(
                (len(langs) + 1, self.lang_emb_size))

        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters(
            (self.char_lstm_output_size * 2))

    def Init(self):
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingLangVec = self.langslookup[0] \
            if self.multiling and self.lang_emb_size > 0 else None

        self.paddingVec = dy.tanh(
            self.word2lstm.expr() * dy.concatenate(
                filter(None, [paddingWordVec, evec,
                              self.chPadding.expr(), paddingLangVec])) +
            self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate(
            [self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train):
        for root in sentence:
            wordcount = float(self.wordsCount.get(root.norm, 0))
            noDropFlag = not train or (random.random() <
                                       (wordcount / (0.25 + wordcount)))
            root.wordvec = self.wlookup[
                int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            self.get_char_vector(root, train)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            if self.multiling:
                root.langvec = self.langslookup[self.langs[root.language_id]] \
                    if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            root.vec = dy.concatenate(
                filter(None,
                       [root.wordvec, root.evec, root.chVec, root.langvec]))

        if not self.disableBilstm:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train):
        if root.form == "*root*":
            # no point running a character analysis over this placeholder token
            root.chVec = self.chPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.form:
                char_vecs.append(self.clookup[self.chars.get(char, 0)])
            root.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def get_external_embeddings(self, external_embedding_file):
        external_embedding_fp = codecs.open(external_embedding_file, 'r',
                                            encoding='utf-8')
        external_embedding_fp.readline()
        self.external_embedding = {}
        for line in external_embedding_fp:
            line = line.strip().split()
            self.external_embedding[line[0]] = [float(f) for f in line[1:]]
        external_embedding_fp.close()

        self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)]  # ???
        self.extrnd = {
            word: i + 3 for i, word in enumerate(self.external_embedding)
        }
        self.elookup = self.model.add_lookup_parameters(
            (len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print('Load external embedding. Vector dimensions ' + str(self.edim))
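# get_external_embeddings above skips the first line of the file, i.e. it
# appears to expect word2vec-style text format. A tiny illustrative file
# (values made up; the header giving vocabulary size and dimensionality is
# an assumption based on that convention):
#
#   2 3
#   the 0.1 -0.2 0.3
#   cat 0.4 0.0 -0.1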
def __init__(self, model, options, vocab, nnvecs):
    self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab
    self.model = model
    self.nnvecs = nnvecs

    extra_words = 2  # MLP padding vector and OOV vector
    self.words = {word: ind for ind, word in enumerate(words, extra_words)}
    # why not just len(self.words)?
    self.word_lookup = self.model.add_lookup_parameters(
        (len(self.words) + extra_words, options.word_emb_size))

    extra_pos = 2  # MLP padding vector and OOV vector
    self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
    self.pos_lookup = self.model.add_lookup_parameters(
        (len(cpos) + extra_pos, options.pos_emb_size))

    extra_chars = 1  # OOV vector
    self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
    self.char_lookup = self.model.add_lookup_parameters(
        (len(chars) + extra_chars, options.char_emb_size))

    extra_treebanks = 1  # padding vector
    self.treebanks = {
        treebank: ind
        for ind, treebank in enumerate(treebanks, extra_treebanks)
    }
    self.treebank_lookup = self.model.add_lookup_parameters(
        (len(treebanks) + extra_treebanks, options.tbank_emb_size))

    # initialise word vectors with external embeddings where they exist
    if (options.ext_emb_dir or options.ext_emb_file) and not options.predict:
        self.external_embedding = defaultdict(lambda: {})
        for lang in langs:
            if options.word_emb_size > 0:
                self.external_embedding["words"].update(
                    utils.get_external_embeddings(options, lang,
                                                  self.words.viewkeys()))
            if options.char_emb_size > 0:
                self.external_embedding["chars"].update(
                    utils.get_external_embeddings(options, lang, self.chars,
                                                  chars=True))
        self.init_lookups(options)

    self.lstm_input_size = options.word_emb_size + options.pos_emb_size + \
        options.tbank_emb_size + \
        2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
    print "Word-level LSTM input size: " + str(self.lstm_input_size)

    self.bilstms = []
    if options.no_bilstms > 0:
        if options.unidir_lstm is not None:
            # replace the BiLSTMs with unidirectional ones;
            # it's ugly to still call it bilstm but easier
            if options.unidir_lstm:
                self.bilstms.append(
                    LSTM(self.lstm_input_size,
                         2 * options.lstm_output_size,
                         self.model,
                         dropout_rate=0.33,
                         direction=options.unidir_lstm,
                         layers=options.no_bilstms))
        else:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (options.lstm_output_size * 2, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (options.lstm_output_size * 2))
    else:
        self.word2lstm = self.model.add_parameters(
            (self.lstm_input_size, self.lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_input_size))

    self.char_bilstm = BiLSTM(options.char_emb_size,
                              options.char_lstm_output_size,
                              self.model,
                              dropout_rate=0.33)
    self.charPadding = self.model.add_parameters(
        (options.char_lstm_output_size * 2))

    # recursive composition things
    if options.use_recursive_composition:
        deprel_dir = [(rel, direction) for rel in self.irels
                      for direction in [0, 1]]
        extra_deprel = 1  # padding rel vec; this does not work
        self.ideprel_dir = {
            val: ind
            for ind, val in enumerate(deprel_dir, extra_deprel)
        }
        self.deprel_lookup = self.model.add_lookup_parameters(
            (len(self.ideprel_dir) + extra_deprel, options.deprel_size))
        lstm_out_dim = options.lstm_output_size * 2
        if options.use_recursive_composition == 'RecNN':
            self.hCompos = self.model.add_parameters(
                (lstm_out_dim, lstm_out_dim))
            self.dCompos = self.model.add_parameters(
                (lstm_out_dim, lstm_out_dim))
            self.rCompos = self.model.add_parameters(
                (lstm_out_dim, options.deprel_size))
            self.biasCompos = self.model.add_parameters((lstm_out_dim))
        else:
            compos_in_dim = lstm_out_dim * 2 + options.deprel_size
            self.composLSTM = dy.VanillaLSTMBuilder(1, compos_in_dim,
                                                    lstm_out_dim, self.model)
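# A minimal sketch (not the repo's code) of the RecNN composition that the
# hCompos/dCompos/rCompos/biasCompos parameters above suggest: the head
# vector is updated as tanh(W_h*h + W_d*d + W_r*r + b). DyNet is assumed;
# the dimensions are toy stand-ins for lstm_out_dim and options.deprel_size.
import dynet as dy

pc = dy.ParameterCollection()
D, R = 6, 4
W_h = pc.add_parameters((D, D))
W_d = pc.add_parameters((D, D))
W_r = pc.add_parameters((D, R))
b = pc.add_parameters((D))

def compose(head_vec, dep_vec, rel_vec):
    # new head representation after attaching a dependent with relation rel
    return dy.tanh(W_h.expr() * head_vec + W_d.expr() * dep_vec +
                   W_r.expr() * rel_vec + b.expr())

dy.renew_cg()
h = dy.inputVector([0.1] * D)
d = dy.inputVector([0.2] * D)
r = dy.inputVector([0.3] * R)
print(compose(h, d, r).value())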
def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
    self.model = model
    self.disableBilstm = options.disable_bilstm
    self.multiling = options.use_lembed and options.multiling
    self.lstm_output_size = options.lstm_output_size
    self.char_lstm_output_size = options.char_lstm_output_size
    self.word_emb_size = options.word_emb_size
    self.char_emb_size = options.char_emb_size
    self.lang_emb_size = options.lang_emb_size
    self.wordsCount = words
    self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()
                  }  # +2 for MLP padding vector and OOV vector
    self.chars = {char: ind + 1
                  for ind, char in enumerate(ch)}  # +1 for OOV vector
    self.rels = {word: ind for ind, word in enumerate(rels)}
    self.nnvecs = nnvecs

    if langs:
        self.langs = {lang: ind + 1
                      for ind, lang in enumerate(langs)}  # +1 for padding vector
    else:
        self.langs = None
    self.irels = rels

    self.external_embedding = None
    if options.external_embedding is not None:
        self.get_external_embeddings(options.external_embedding)

    lstm_input_size = self.word_emb_size + \
        (self.edim if self.external_embedding is not None else 0) + \
        (self.lang_emb_size if self.multiling else 0) + \
        2 * self.char_lstm_output_size

    if not self.disableBilstm:
        self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size,
                              self.model, dropout_rate=0.33)
        self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                              self.lstm_output_size, self.model,
                              dropout_rate=0.33)
    else:
        self.lstm_output_size = int(lstm_input_size * 0.5)

    self.char_bilstm = BiLSTM(self.char_emb_size,
                              self.char_lstm_output_size, self.model,
                              dropout_rate=0.33)

    self.clookup = self.model.add_lookup_parameters(
        (len(ch) + 1, self.char_emb_size))
    self.wlookup = self.model.add_lookup_parameters(
        (len(words) + 2, self.word_emb_size))
    if self.multiling and self.lang_emb_size > 0:
        self.langslookup = self.model.add_lookup_parameters(
            (len(langs) + 1, self.lang_emb_size))

    # used in the PaddingVec
    self.word2lstm = self.model.add_parameters(
        (self.lstm_output_size * 2, lstm_input_size))
    self.word2lstmbias = self.model.add_parameters(
        (self.lstm_output_size * 2))
    self.chPadding = self.model.add_parameters(
        (self.char_lstm_output_size * 2))
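# For concreteness, the BiLSTM input width computed above, with hypothetical
# option values (these numbers are illustrative, not the repo's defaults):
word_emb_size = 100
edim = 300                  # external-embedding dim; 0 when none is loaded
lang_emb_size = 12          # only counted in the multilingual setting
char_lstm_output_size = 50  # the char BiLSTM contributes 2x this

lstm_input_size = word_emb_size + edim + lang_emb_size + 2 * char_lstm_output_size
print(lstm_input_size)      # 100 + 300 + 12 + 100 = 512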
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs):
        self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab
        self.model = model
        self.nnvecs = nnvecs

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        if (options.ext_emb_dir or options.ext_emb_file) and not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            for lang in langs:
                if options.word_emb_size > 0:
                    self.external_embedding["words"].update(
                        utils.get_external_embeddings(options, lang,
                                                      self.words.viewkeys()))
                if options.char_emb_size > 0:
                    self.external_embedding["chars"].update(
                        utils.get_external_embeddings(options, lang,
                                                      self.chars, chars=True))
            self.init_lookups(options)

        self.lstm_input_size = options.word_emb_size + options.pos_emb_size + \
            options.tbank_emb_size + \
            2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            # used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[1] if options.word_emb_size > 0 else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr() if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(
            self.word2lstm.expr() *
            dy.concatenate(filter(None, [paddingWordVec, paddingPosVec,
                                         paddingCharVec, paddingTbankVec])) +
            self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else \
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, options,
                          test_embeddings=defaultdict(lambda: {})):
        for root in sentence:
            # all vecs are None by default (possibly a little risky?)
            root.vecs = defaultdict(lambda: None)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 + word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version
                # of the code that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and \
                        treebank_id in utils.reverse_iso_dict and \
                        utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["pos"], root.vecs["char"],
                    root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):
        if root.char_rep == "*root*":
            # no point running a character analysis over this placeholder
            # token: use the padding vector if it's the root token
            return self.charPadding.expr()
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):
        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"][word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"][char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
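# getWordEmbeddings() above drops words during training with probability
# 1 - c/(0.25 + c) for a word of count c, so rare words are often replaced
# by the OOV index. A minimal self-contained sketch of that rule (the toy
# counts are made up):
import random

def dropped(word_count, alpha=0.25):
    return random.random() > word_count / (alpha + word_count)

for word, count in [("the", 5000.0), ("parser", 12.0), ("hapax", 1.0)]:
    rate = sum(dropped(count) for _ in range(100000)) / 100000.0
    print(word, "dropped ~%.3f of the time" % rate)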
class FeatureExtractor(object):
    def __init__(self, model, wordsCount, rels, langs, words, ch, nnvecs, options):
        """Options handling"""
        self.model = model
        if langs:
            self.langs = {lang: ind + 1
                          for ind, lang in enumerate(langs)}  # +1 for padding vector
        else:
            self.langs = None
        self.nnvecs = nnvecs
        self.multiling = options.multiling  # and options.use_lembed
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding, model,
                                         wordsCount)
        self.disable_bilstm = options.disable_bilstm
        self.disable_second_bilstm = options.disable_second_bilstm

        """sharing"""
        self.shareBiLSTM = options.shareBiLSTM
        self.shareWordLookup = options.shareWordLookup
        self.shareCharLookup = options.shareCharLookup
        self.shareCharBiLSTM = options.shareCharBiLSTM
        self.word_lembed = options.lembed_word
        self.char_lembed = options.lembed_char

        """dims"""
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.lang_emb_size = options.lang_emb_size
        lstm_input_size = self.word_emb_size + \
            (self.edim if self.external_embedding is not None else 0) + \
            (self.lang_emb_size if self.word_lembed else 0) + \
            2 * self.char_lstm_output_size

        """UTILS"""
        self.wordsCount = wordsCount
        self.irels = rels
        if self.multiling and not self.shareWordLookup:
            w2i = {}
            for lang in self.langs:
                w2i[lang] = {w: i for i, w in enumerate(words[lang])}
            self.vocab = {}
            for lang in self.langs:
                self.vocab[lang] = {word: ind + 2
                                    for word, ind in w2i[lang].iteritems()}
        else:
            w2i = {w: i for i, w in enumerate(words)}
            self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()
                          }  # +2 for MLP padding vector and OOV vector
        if not self.multiling or self.shareCharLookup:
            self.chars = {char: ind + 1
                          for ind, char in enumerate(ch)}  # +1 for OOV vector
        else:
            self.chars = {}
            for lang in self.langs:
                self.chars[lang] = {char: ind + 1
                                    for ind, char in enumerate(ch[lang])}
        self.rels = {word: ind for ind, word in enumerate(rels)}

        """BILSTMS"""
        # word
        if not self.multiling or self.shareBiLSTM:
            if not self.disable_bilstm:
                self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size,
                                      model, dropout_rate=0.33)
                if not self.disable_second_bilstm:
                    self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                                          self.lstm_output_size, model,
                                          dropout_rate=0.33)
            else:
                self.lstm_output_size = int(lstm_input_size * 0.5)
        else:
            self.bilstm1 = {}
            self.bilstm2 = {}
            for lang in self.langs:
                self.bilstm1[lang] = BiLSTM(lstm_input_size,
                                            self.lstm_output_size, model,
                                            dropout_rate=0.33)
                self.bilstm2[lang] = BiLSTM(2 * self.lstm_output_size,
                                            self.lstm_output_size, model,
                                            dropout_rate=0.33)
        # char
        if self.char_lembed:
            char_in_dims = self.char_emb_size + self.lang_emb_size
        else:
            char_in_dims = self.char_emb_size
        if not self.multiling or self.shareCharBiLSTM:
            self.char_bilstm = BiLSTM(char_in_dims,
                                      self.char_lstm_output_size,
                                      self.model, dropout_rate=0.33)
        else:
            self.char_bilstms = {}
            for lang in self.langs:
                self.char_bilstms[lang] = BiLSTM(char_in_dims,
                                                 self.char_lstm_output_size,
                                                 self.model,
                                                 dropout_rate=0.33)

        """LOOKUPS"""
        if not self.multiling or self.shareCharLookup:
            self.clookup = self.model.add_lookup_parameters(
                (len(ch) + 1, self.char_emb_size))
        else:
            self.clookups = {}
            for lang in self.langs:
                self.clookups[lang] = self.model.add_lookup_parameters(
                    (len(ch[lang]) + 1, self.char_emb_size))
        if not self.multiling or self.shareWordLookup:
            self.wlookup = self.model.add_lookup_parameters(
                (len(words) + 2, self.word_emb_size))
        else:
            self.wlookups = {}
            for lang in self.langs:
                self.wlookups[lang] = self.model.add_lookup_parameters(
                    (len(words[lang]) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = model.add_lookup_parameters(
                (len(langs) + 1, self.lang_emb_size))

        """Padding"""
        self.word2lstm = model.add_parameters(
            (self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = model.add_parameters((self.char_lstm_output_size * 2))

    def get_char_vec(self, word, dropout, lang=None, langvec=None):
        if word.form == "*root*":
            # use the padding vector if it's the root token
            word.chVec = self.chPadding.expr()
        else:
            char_vecs = []
            for char in word.form:
                if lang:
                    cvec = self.clookups[lang][self.chars[lang].get(char, 0)]
                else:
                    cvec = self.clookup[self.chars.get(char, 0)]
                if langvec:
                    char_vecs.append(dy.concatenate([langvec, cvec]))
                else:
                    char_vecs.append(cvec)
            if lang:
                word.chVec = self.char_bilstms[lang].get_sequence_vector(
                    char_vecs, dropout)
            else:
                word.chVec = self.char_bilstm.get_sequence_vector(
                    char_vecs, dropout)

    def Init(self):
        # TODO: this function makes me cry
        # I'm not sure how necessary it is to get different padding vecs
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingLangVec = self.langslookup[0] \
            if self.multiling and self.lang_emb_size > 0 else None
        if not self.multiling or self.shareWordLookup:
            paddingWordVec = self.wlookup[1]
            self.paddingVec = dy.tanh(
                self.word2lstm.expr() *
                dy.concatenate(filter(None, [
                    paddingWordVec, evec, self.chPadding.expr(),
                    paddingLangVec if self.word_lembed else None
                ])) + self.word2lstmbias.expr())
            self.empty = self.paddingVec if self.nnvecs == 1 else \
                dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])
        else:
            paddingWordVecs = {}
            self.paddingVecs = {}
            self.emptyVecs = {}
            for lang in self.langs:
                paddingWordVecs[lang] = self.wlookups[lang][1]
                self.paddingVecs[lang] = dy.tanh(
                    self.word2lstm.expr() *
                    dy.concatenate(filter(None, [
                        paddingWordVecs[lang], evec, self.chPadding.expr(),
                        paddingLangVec if self.word_lembed else None
                    ])) + self.word2lstmbias.expr())
                self.emptyVecs[lang] = self.paddingVecs[lang] \
                    if self.nnvecs == 1 else \
                    dy.concatenate([self.paddingVecs[lang]
                                    for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, get_vectors=False):
        lang = sentence[0].language_id
        for root in sentence:
            # word
            if not self.multiling or self.shareWordLookup:
                wordcount = float(self.wordsCount.get(root.norm, 0))
            else:
                wordcount = float(self.wordsCount[lang].get(root.norm, 0))
            noDropFlag = not train or (random.random() <
                                       (wordcount / (0.25 + wordcount)))
            if not self.multiling or self.shareWordLookup:
                root.wordvec = self.wlookup[
                    int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            else:
                root.wordvec = self.wlookups[lang][
                    int(self.vocab[lang].get(root.norm, 0)) if noDropFlag else 0]
            if self.multiling and self.word_lembed:
                root.langvec = self.langslookup[self.langs[
                    root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            # char
            if not self.multiling or self.shareCharBiLSTM:
                if self.char_lembed:
                    langvec = self.langslookup[self.langs[lang]]
                    self.get_char_vec(root, train, langvec=langvec)
                else:
                    self.get_char_vec(root, train)
            else:
                self.get_char_vec(root, train, lang=lang)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            root.vec = dy.concatenate(filter(None, [root.wordvec, root.evec,
                                                    root.chVec, root.langvec]))

        if not self.multiling or self.shareBiLSTM:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)
        else:
            self.bilstm1[lang].set_token_vecs(sentence, train)
            self.bilstm2[lang].set_token_vecs(sentence, train)

        if get_vectors:
            data_vec = list()
            for i, token in enumerate(sentence):
                if token.form != '*root*':
                    wordvec = token.wordvec.value()
                    if self.external_embedding is not None:
                        wordvec += token.evec.value()
                    data_tuple = (i + 1, token.form, token.cpos, token.feats,
                                  token.chVec.value(), wordvec,
                                  token.vec.value())
                    data_vec.append(data_tuple)
            return data_vec

    def get_external_embeddings(self, external_embedding_file, model, wordsCount):
        # NOTE: this is modified to load fastText embeddings!
        self.external_embedding = {}
        external_embedding_fp = codecs.open(external_embedding_file, 'r',
                                            encoding='utf-8')
        # read first line --- number of tokens and embedding dimension
        self.edim = int(external_embedding_fp.readline().split()[1])
        num_tokens = 0
        for line in external_embedding_fp:
            line = line.strip().split()
            if len(line) != self.edim + 1:
                continue
            if line[0] in wordsCount:
                self.external_embedding[line[0]] = [float(f) for f in line[1:]]
                num_tokens += 1
        external_embedding_fp.close()

        # zero vector used for tokens without an external embedding
        self.noextrn = [0.0 for _ in xrange(self.edim)]
        self.extrnd = {word: i + 3
                       for i, word in enumerate(self.external_embedding)}
        self.elookup = model.add_lookup_parameters(
            (len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print '-' * 100
        print 'Loaded external embedding. Vector dimension:', self.edim, ', number of tokens:', num_tokens
        print '-' * 100
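# Init() above builds each padding vector by projecting a concatenation of
# padding embeddings into the BiLSTM output space: tanh(W*concat(...) + b).
# A minimal DyNet sketch, with toy sizes standing in for lstm_input_size and
# 2*lstm_output_size (the numbers are illustrative):
import dynet as dy

pc = dy.ParameterCollection()
in_dim, out_dim = 5, 4
W = pc.add_parameters((out_dim, in_dim))
b = pc.add_parameters((out_dim))

dy.renew_cg()
# mimic filter(None, ...): absent components are simply skipped
parts = [dy.inputVector([0.0, 0.0]), None, dy.inputVector([0.1, 0.1, 0.1])]
padding_vec = dy.tanh(
    W.expr() * dy.concatenate([p for p in parts if p is not None]) + b.expr())
print(padding_vec.dim())  # ((4,), 1)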
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    data_loader = TextLoader(True, FLAGS.train_path, FLAGS.batch_size,
                             FLAGS.seq_length, None, None, 'utf8', False)
    test_data_loader = TextLoader(False, FLAGS.test_path, FLAGS.batch_size,
                                  FLAGS.seq_length, data_loader.vocab,
                                  data_loader.labels, 'utf8', False)
    tf.logging.info("vocab_size: " + str(data_loader.vocab_size))
    FLAGS.vocab_size = data_loader.vocab_size
    tf.logging.info("label_size: " + str(data_loader.label_size))
    FLAGS.label_size = data_loader.label_size

    bilstm = BiLSTM(FLAGS)
    init = tf.global_variables_initializer()
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        sess.run(init)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
        idx = 0
        test_best_acc = 0
        for epoch in range(FLAGS.num_epcho):  # for each epoch
            data_loader.reset_batch_pointer()
            for train_batch_num in range(data_loader.num_batches):  # for each batch
                input_x, input_y, x_len = data_loader.next_batch()
                feed = {bilstm.input_x: input_x,
                        bilstm.input_y: input_y,
                        bilstm.x_len: x_len,
                        bilstm.dropout_keep_prob: FLAGS.dropout_keep_prob}
                _, global_step_op, train_loss, train_acc = sess.run(
                    [bilstm.train_step, bilstm.global_step, bilstm.loss,
                     bilstm.acc],
                    feed_dict=feed)
                tf.logging.info(
                    "training... global_step = {}, epoch = {}, "
                    "current_batch = {}, train_loss = {:.4f}, "
                    "accuracy = {:.4f}".format(global_step_op, epoch,
                                               train_batch_num, train_loss,
                                               train_acc))
                idx += 1
                if idx % FLAGS.check_every == 0:
                    all_num = 0
                    acc_num = 0
                    test_data_loader.reset_batch_pointer()
                    write_result = []
                    for _ in range(test_data_loader.num_batches):
                        input_x_test, input_y_test, x_len_test = \
                            test_data_loader.next_batch()
                        feed = {bilstm.input_x: input_x_test,
                                bilstm.input_y: input_y_test,
                                bilstm.x_len: x_len_test,
                                bilstm.dropout_keep_prob: 1.0}
                        prediction, arg_index = sess.run(
                            [bilstm.prediction, bilstm.arg_index],
                            feed_dict=feed)
                        all_num = all_num + len(input_y_test)
                        for i, indexs in enumerate(arg_index):
                            pre_label_id = indexs[0]
                            real_label_id = input_y_test[i]
                            if pre_label_id == real_label_id:
                                acc_num = acc_num + 1
                            if real_label_id in test_data_loader.id_2_label:
                                write_str = test_data_loader.id_2_label.get(
                                    real_label_id)
                            else:
                                write_str = "__label__unknown"
                            for index in indexs:
                                cur_label = test_data_loader.id_2_label.get(index)
                                cur_score = prediction[i][index]
                                write_str = write_str + " " + cur_label + \
                                    ":" + str(cur_score)
                            write_str = write_str + "\n"
                            write_result.append(write_str)
                    test_acc = acc_num * 1.0 / all_num
                    tf.logging.info(
                        "testing... global_step = {}, epoch = {}, "
                        "accuracy = {:.4f}, cur_best_acc = {}".format(
                            global_step_op, epoch, test_acc, test_best_acc))
                    if test_best_acc < test_acc:
                        test_best_acc = test_acc
                        # save the model and write the test predictions
                        checkpoint_path = os.path.join(FLAGS.model_path,
                                                       'lstm.ckpt')
                        saver.save(sess, checkpoint_path,
                                   global_step=global_step_op)
                        resultfile = open(FLAGS.result_file, 'w',
                                          encoding='utf-8')
                        for pre_sen in write_result:
                            resultfile.write(pre_sen)
                        tf.logging.info("saved model and wrote results")
                        resultfile.close()
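# The evaluation block above accumulates accuracy incrementally over test
# batches (acc_num / all_num). A minimal sketch of that bookkeeping with
# made-up predictions and labels:
def batch_accuracy(batches):
    all_num, acc_num = 0, 0
    for preds, labels in batches:
        all_num += len(labels)
        acc_num += sum(int(p == y) for p, y in zip(preds, labels))
    return acc_num * 1.0 / all_num

print(batch_accuracy([([1, 0, 2], [1, 1, 2]), ([0], [0])]))  # 0.75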
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs=1):
        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab
        self.model = model
        self.nnvecs = nnvecs

        # load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # this part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            if options.ext_word_emb_file and options.word_emb_size > 0:
                # load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)
            if options.ext_char_emb_file and options.char_emb_size > 0:
                # load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)
            if options.ext_emb_dir:
                # for every language, load the data for the word and
                # character embeddings from a directory
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.viewkeys())
                        self.external_embedding["words"].update(embeddings)
                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)
            self.init_lookups(options)

        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        self.lstm_input_size = (
            options.word_emb_size + elmo_emb_size + options.pos_emb_size +
            options.tbank_emb_size +
            2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0))
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            # used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[1] if options.word_emb_size > 0 else None
        paddingElmoVec = dy.zeros(self.elmo.emb_dim) if self.elmo else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr() if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(
            self.word2lstm.expr() *
            dy.concatenate(filter(None, [paddingWordVec, paddingElmoVec,
                                         paddingPosVec, paddingCharVec,
                                         paddingTbankVec])) +
            self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else \
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, options,
                          test_embeddings=defaultdict(lambda: {})):
        if self.elmo:
            # get the full text of the sentence - excluding root, which is
            # loaded differently for transition- and graph-based parsers
            if options.graph_based:
                sentence_text = " ".join([entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join([entry.form for entry in sentence[:-1]])
            elmo_sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            # all vecs are None by default (possibly a little risky?)
            root.vecs = defaultdict(lambda: None)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 + word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version
                # of the code that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and \
                        treebank_id in utils.reverse_iso_dict and \
                        utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]
            if self.elmo:
                if i < len(sentence) - 1:
                    # don't look up the 'root' word
                    root.vecs["elmo"] = elmo_sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                    root.vecs["char"], root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):
        if root.char_rep == "*root*":
            # no point running a character analysis over this placeholder
            # token: use the padding vector if it's the root token
            return self.charPadding.expr()
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):
        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"][word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"][char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
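# get_char_vector() above resolves every character through a fallback chain:
# trained lookup row -> external test-time embedding -> OOV row 0. A minimal
# pure-Python sketch of that chain (the toy tables are made up):
def char_row(char, trained, external):
    if char in trained:
        return ('trained', trained[char])
    if char in external:
        return ('external', external[char])
    return ('oov', 0)

trained = {'a': 1, 'b': 2}
external = {'c': [0.1, 0.2]}
for ch in 'abcz':
    print(ch, char_row(ch, trained, external))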
def __init__(self, vocab_size, charEmbedding, out_size, crf=True):
    """Train and evaluate the (Bi)LSTM tagger.

    Args:
        vocab_size: size of the vocabulary
        out_size: number of tag types
        crf: whether to add a CRF layer on top
    """
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load model hyperparameters
    self.emb_size = LSTMConfig.emb_size
    self.hidden_size = LSTMConfig.hidden_size
    self.charEmbedding = charEmbedding
    self.crf = crf

    # depending on whether a CRF layer is added, initialise a different
    # model and pick the matching loss function
    if not crf:
        self.model = BiLSTM(vocab_size, self.emb_size, self.hidden_size,
                            out_size).to(self.device)
        self.cal_loss_func = cal_loss
    else:
        print('bilstmcrf')
        self.model = BiLSTM_CRF(vocab_size, self.emb_size,
                                self.charEmbedding, self.hidden_size,
                                out_size).to(self.device)
        self.cal_loss_func = cal_lstm_crf_loss
    print(self.model)

    # load training hyperparameters
    self.epoches = TrainingConfig.epoches
    self.print_step = TrainingConfig.print_step
    self.lr = TrainingConfig.lr
    self.batch_size = TrainingConfig.batch_size

    # split parameters so that weight decay is applied to weights only,
    # not to biases
    weight_p, bias_p = [], []
    for name, p in self.model.named_parameters():
        if 'bias' in name:
            bias_p += [p]
        else:
            weight_p += [p]
    parameter_list = [
        {"params": weight_p, "weight_decay": 0.00005},
        {"params": bias_p, "weight_decay": 0},
    ]

    # initialise the optimizer
    self.optimizer = optim.Adam(parameter_list, lr=0.001)

    # initialise bookkeeping
    self.step = 0
    self.best_val_loss = 1e18
    self.best_model = None
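# The trainer above exempts biases from weight decay by building two
# optimizer parameter groups. A minimal PyTorch sketch of the same split on
# a toy module (the LSTM sizes are arbitrary):
import torch.nn as nn
import torch.optim as optim

model = nn.LSTM(input_size=8, hidden_size=16, bidirectional=True)

weight_p, bias_p = [], []
for name, p in model.named_parameters():
    (bias_p if 'bias' in name else weight_p).append(p)

optimizer = optim.Adam(
    [{"params": weight_p, "weight_decay": 5e-5},
     {"params": bias_p, "weight_decay": 0}],
    lr=0.001)
print(len(optimizer.param_groups))  # 2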