Example #1
 def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
     self.embeddingSize = embeddingSize
     self.distinctTagNum = distinctTagNum
     self.numHidden = numHidden
     self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)
     self.words = tf.Variable(self.c2v, name="words")
     layers = [
         {'dilation': 1},
         {'dilation': 1},
         {'dilation': 2},
     ]
     if FLAGS.use_idcnn:
         self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                            FLAGS.embedding_size, FLAGS.max_sentence_len,
                            FLAGS.num_tags)
     else:
         self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                             FLAGS.num_tags)
     self.trains_params = None
     self.inp = tf.placeholder(tf.int32,
                               shape=[None, FLAGS.max_sentence_len],
                               name="input_placeholder")
     pass
Example #2
class BiLSTM_CRF(nn.Module):
    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print "build batched lstmcrf..."
        self.gpu = data.HP_gpu
        ## add two more label for downlayer lstm, use original label size for CRF
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = BiLSTM(data)
        self.crf = CRF(label_size, self.gpu)

    def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs,
                                word_seq_lengths, char_inputs,
                                char_seq_lengths, char_seq_recover,
                                batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    def get_lstm_features(self, gaz_list, word_inputs, biword_inputs,
                          word_seq_lengths, char_inputs, char_seq_lengths,
                          char_seq_recover):
        return self.lstm.get_lstm_features(gaz_list, word_inputs,
                                           biword_inputs, word_seq_lengths,
                                           char_inputs, char_seq_lengths,
                                           char_seq_recover)
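
The `label_alphabet_size += 2` above gives the down-layer LSTM two extra score columns, which a CRF layer typically reserves for its start/stop transitions, while the CRF itself is built with the original tag count. A minimal standalone sketch of that sizing convention (made-up dimensions, not the project's code):

import torch
import torch.nn as nn

label_size = 10                                   # hypothetical tag-set size
encoder_proj = nn.Linear(200, label_size + 2)     # per-token scores with room for start/stop
features = encoder_proj(torch.randn(4, 30, 200))  # (batch, seq_len, label_size + 2)
print(features.shape)                             # torch.Size([4, 30, 12])
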
Example #3
def run_bilstm(embedding_info,
               train,
               test,
               val,
               dist_func='cosine',
               early_stopping=True,
               plot=True):
    num_units = 64
    bilstm = BiLSTM(embedding_info, num_units, dist_func)
    adam = Adam(learning_rate=0.00001)

    bilstm.compile(loss='mse', optimizer=adam, metrics=['mse'])

    batch_size = 64
    num_epochs = 100

    trained_model = train_model(bilstm, train, val, early_stopping, batch_size,
                                num_epochs)
    evaluate_model(bilstm, test)

    if plot:
        plot_training(trained_model, 'BiLSTM', 'loss')
Example #4
    def __init__(self, vocab_size, emb_size, weight, hidden_size, out_size):
        """初始化参数:
            vocab_size:字典的大小
            emb_size:词向量的维数
            hidden_size:隐向量的维数
            out_size:标注的种类
        """
        super(BiLSTM_CRF, self).__init__()
        self.bilstm = BiLSTM(vocab_size, emb_size, weight, hidden_size,
                             out_size)

        # The CRF essentially just learns one extra transition matrix [out_size, out_size], initialized to a uniform distribution
        self.transition = nn.Parameter(
            torch.ones(out_size, out_size) * 1 / out_size)
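
The comment above notes that the CRF adds a learned transition matrix; the standalone sketch below (made-up tag path, not the project's code) shows how such a matrix scores consecutive tag pairs. In the full model, per-token emission scores from the BiLSTM are added to these transition scores before decoding.

import torch

out_size = 4
transition = torch.ones(out_size, out_size) / out_size   # uniform initialization, as above
tags = torch.tensor([0, 2, 1, 3])                         # a hypothetical tag path
path_score = sum(transition[tags[k], tags[k + 1]] for k in range(len(tags) - 1))
print(path_score)  # tensor(0.7500)
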
Example #5
 def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
     self.embeddingSize = embeddingSize
     self.distinctTagNum = distinctTagNum
     self.numHidden = numHidden
     num_shards = FLAGS.num_shards
     self.c2v = self.load_w2v(num_shards, c2vPath, FLAGS.embedding_size)
     self.words = []
     with tf.device("/cpu:0"):
         for i in range(0, num_shards):
             words_i = tf.get_variable(name="words-%02d" % i,
                                       initializer=tf.random_uniform(
                                           self.c2v[i].shape,
                                           minval=-0.1,
                                           maxval=0.1),
                                       trainable=False)
             self.words.append(words_i)
     layers = [
         {'dilation': 1},
         {'dilation': 1},
         {'dilation': 2},
     ]
     if FLAGS.use_idcnn:
         self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                            FLAGS.embedding_size, FLAGS.max_sentence_len,
                            FLAGS.num_tags)
     else:
         self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                             FLAGS.num_tags)
     self.trains_params = None
     self.inp = tf.placeholder(tf.int32,
                               shape=[None, FLAGS.max_sentence_len],
                               name="input_placeholder")
     pass
Example #6
def run(dataDir, fold=5):
    f = open('./config.yml', encoding='utf-8', errors='ignore')
    config = yaml.safe_load(f)
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config['DEVICE'] = DEVICE
    batchSize = config['model']['batchSize']

    optParser = OptionParser()
    optParser.add_option('-m',
                         '--model',
                         action='store',
                         type='string',
                         dest='modelName')
    option, args = optParser.parse_args()
    modelName = config['modelName'] = option.modelName

    # save the final results
    f = open(os.path.join(dataDir, modelName, 'result.txt'),
             'w',
             encoding='utf-8',
             errors='ignore')

    # test data
    testDataPath = config['data']['testDataPath']
    testDataset = NERTestDataset(testDataPath, config)
    testIter = data.DataLoader(dataset=testDataset,
                               batch_size=batchSize,
                               shuffle=False,
                               num_workers=4,
                               collate_fn=testPad)

    for i in range(fold):
        print('-------------------- Validation fold %d -------------------\n' % (i + 1))
        # validation data
        validDataset = NERDataset(os.path.join(dataDir,
                                               str(i) + '.txt'), config)
        validIter = data.DataLoader(dataset=validDataset,
                                    batch_size=batchSize,
                                    shuffle=False,
                                    num_workers=4,
                                    collate_fn=pad)
        # training data
        trainPathArr = [
            os.path.join(dataDir,
                         str(j) + '.txt') for j in range(fold) if j != i
        ]
        assert len(trainPathArr) == fold - 1

        trainDataset = NERDataset(trainPathArr, config)
        trainIter = data.DataLoader(dataset=trainDataset,
                                    batch_size=batchSize,
                                    shuffle=True,
                                    num_workers=4,
                                    collate_fn=pad)

        # load the network
        if modelName == 'bilstm':
            net = BiLSTM(config)

        if modelName == 'idcnn':
            net = IDCNN(config)

        if modelName == 'bilstm_attn':
            net = BiLSTM_ATTN(config)

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        net = net.to(DEVICE)

        config['submitPath'] = os.path.join(dataDir, modelName,
                                            str(i) + '.csv')
        config['modelSavePath'] = os.path.join(dataDir, modelName,
                                               str(i) + '.pkl')

        trainLoss, validLoss, f1Score, accurate, recall = train(
            net, trainIter, validIter, config)

        # actual performance on the validation set
        modelSavePath = config['modelSavePath']
        if os.path.exists(modelSavePath):
            net.load_state_dict(torch.load(modelSavePath))

        # missing-entity ratio before filtering training-set entities, and after filtering them out
        disappear1, disappear2 = test(net, testIter, config)

        f.write('Validation fold %d\n' % (i + 1))
        f.write('trainLoss: %f\n' % trainLoss)
        f.write('validLoss: %f\n' % validLoss)
        f.write('f1Score %f, accurate %f, recall %f\n' %
                (f1Score, accurate, recall))
        f.write('Missing-entity ratio on the test set: %f %f\n' % (disappear1, disappear2))
        f.write('\n')

    f.close()
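
For reference, the fold split above is the usual leave-one-fold-out arrangement: fold i serves as the validation file and the remaining fold files are used for training. A tiny standalone sketch of just that path logic (hypothetical dataDir; the real fold files come from the project's preprocessing):

import os

dataDir, fold = 'data/folds', 5   # assumed layout: 0.txt ... 4.txt
for i in range(fold):
    validPath = os.path.join(dataDir, str(i) + '.txt')
    trainPathArr = [os.path.join(dataDir, str(j) + '.txt') for j in range(fold) if j != i]
    assert len(trainPathArr) == fold - 1
    print(validPath, trainPathArr)
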
Example #7
    print("Embeddings written of the best model")


result_list = list()
result_list_acc = list()
count = 0
try:
    for Ques_train, Ques_test in kf.split(Dataset):

        count += 1
        print("Fold ", count)
        if count == 2:
            #writeContentEmbeddings(model)
            break
        # define model
        model = BiLSTM(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM,
                       vocab_size=len(text_field.vocab),
                       label_size=len(label_field.vocab) - 1,
                       use_gpu=USE_GPU, batch_size=BATCH_SIZE)
        if USE_GPU:
            model = model.to(DEVICE)
        model.embeddings.weight.data.copy_(
            torch.from_numpy(pretrained_embeddings))
        optimizer = optim.Adam(model.parameters(), lr=1e-3)

        trainset = torch.utils.data.TensorDataset(
            torch.LongTensor(Dataset[Ques_train]),
            torch.LongTensor(Dataset[Ques_train]))
        testset = torch.utils.data.TensorDataset(
            torch.LongTensor(Dataset[Ques_test]),
            torch.LongTensor(Dataset[Ques_test]))

        train_loader = torch.utils.data.DataLoader(dataset=trainset,
                                                   batch_size=64)
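
The `weight.data.copy_` call above is the standard way to load a pre-trained embedding matrix into an nn.Embedding layer in place. A self-contained sketch of that pattern with made-up sizes (the real matrix comes from the project's pre-trained vectors):

import numpy as np
import torch
import torch.nn as nn

pretrained_embeddings = np.random.rand(100, 50).astype('float32')  # vocab 100, dim 50
embedding_layer = nn.Embedding(100, 50)
embedding_layer.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
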
Example #8
    def __init__(self, model, options, vocab, nnvecs=1):

        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        # Load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # This part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})

            if options.ext_word_emb_file and options.word_emb_size > 0:
                # Load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.keys())
                    self.external_embedding["words"].update(embeddings)

            if options.ext_char_emb_file and options.char_emb_size > 0:
                # Load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

            if options.ext_emb_dir:
                # For every language, load the data for the word and character
                # embeddings from a directory.
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.keys())
                        self.external_embedding["words"].update(embeddings)

                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)

            self.init_lookups(options)

        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        self.lstm_input_size = (
            options.word_emb_size + elmo_emb_size + options.pos_emb_size +
            options.tbank_emb_size + 2 *
            (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        )
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))
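
The `lstm_input_size` expression above just concatenates the word, optional ELMo, POS and treebank embeddings with the forward and backward character-LSTM states (hence the factor of 2). A worked example with made-up sizes, purely to illustrate the arithmetic:

word_emb_size, elmo_emb_size, pos_emb_size, tbank_emb_size = 100, 0, 20, 12
char_lstm_output_size, char_emb_size = 50, 24
lstm_input_size = (word_emb_size + elmo_emb_size + pos_emb_size + tbank_emb_size +
                   2 * (char_lstm_output_size if char_emb_size > 0 else 0))
print(lstm_input_size)  # 232
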
Example #9
    def __init__(self, model, options, vocab, nnvecs):

        self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        if (options.ext_emb_dir
                or options.ext_emb_file) and not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            for lang in langs:
                if options.word_emb_size > 0:
                    self.external_embedding["words"].update(
                        utils.get_external_embeddings(options, lang,
                                                      self.words.viewkeys()))
                if options.char_emb_size > 0:
                    self.external_embedding["chars"].update(
                        utils.get_external_embeddings(options,
                                                      lang,
                                                      self.chars,
                                                      chars=True))
            self.init_lookups(options)

        self.lstm_input_size = (
            options.word_emb_size + options.pos_emb_size + options.tbank_emb_size +
            2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0))

        print("Word-level LSTM input size: " + str(self.lstm_input_size))

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))
Example #10
if __name__ == "__main__":

    # Getting the test data from pytorch

    dataset = SNLI(batch_size, device)
    out_dim = dataset.out_dim()
    vocab_size = dataset.vocabulary_size()

    # from google.colab import drive
    # drive.mount('/content/gdrive')
    
    # Testing the LSTM model

    # Loading the model using the parameters needed
    filename = "Models/LSTM/" + '{0}_{1}_{2}_{3}_{4}_{5}_{6}_bidirect.pt'.format(batch_size, embedding_dim, dropout_ratio, hidden_dim, epochs, opt_name, lr)
    model = BiLSTM(vocab_size, embedding_dim, dropout_ratio, hidden_dim, out_dim, bidirect)
    model.to(device)
    model.load_state_dict(torch.load(filename, map_location=torch.device('cpu')))

    test_loss, test_accuracy, gt, pred = test(model, dataset)
    # print("Test loss = {}, Test accuracy = {}".format(test_loss, test_accuracy))

    # Writing the output from LSTM onto a text file

    labels = ['entailment', 'contradiction', 'neutral']
    with open("LSTM.txt", 'w') as f:
        f.write("Loss on Test Data : {}\n".format(test_loss))
        f.write("Accuracy on Test Data : {}\n".format(test_accuracy))
        f.write("gt_label,pred_label \n")
        for idx in range(len(gt)):
            f.write("{},{}\n".format(labels[gt[idx]], labels[pred[idx]]))
Example #11
            optimizer.step()
            if ktr_in % 10 == 0:
                losses.append(loss.item())  # store a plain float instead of the graph-holding tensor
            ktr_in += 10
    xs = np.array(data[390:450], dtype=float)  # np.float was removed in newer NumPy
    xs = torch.FloatTensor(xs)
    xs = Variable(xs).cuda()
    ys = rnn(xs)
    yc = torch.FloatTensor(np.array(labels[390:450], dtype=float))
    yc = Variable(yc).cuda()
    f.write('{}\n'.format(criterion(ys, yc).item()))

f.write("Type: BiLSTM\n")
for ktr in range(3):
    f.write("\nhidden size: {}, num_layers: {}, error: ".format(n_shape[0], n_shape[1]))
    rnn = BiLSTM(input_size, n_shape[0], n_shape[1], 1)
    rnn.cuda()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    losses = []
    ktr_in = 0
    for epoch in range(num_epochs):
        for img, lb in train_set:
            img = np.array([img], dtype=float)
            img = torch.FloatTensor(img)
            img = Variable(img).cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            output = rnn(img)
            crt = torch.FloatTensor(np.array([lb], dtype=float))
Example #12
test_set = data_set[450:]
for v in np.random.randint(0, 400, 25):
    valid_set.append(data_set[v])
for t in np.random.randint(0, 450, 30):
    test_set.append(data_set[t])

sequence_length = 401
input_size = 3
hidden_size = 16
num_layers = 1
batch_size = 1
num_epochs = 2
learning_rate = 0.0001

for ktr in range(1):
    rnn = BiLSTM(input_size, hidden_size, num_layers, 1)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    losses = []
    ktr_in = 0
    for epoch in range(num_epochs):
        for img, lb in train_set:
            img = np.array([img], dtype=float)
            img = torch.FloatTensor(img)
            img = Variable(img)  # .cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            output = rnn(img)
Example #13
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']

    modelName = config['modelName']

    batchSize = config['model']['batchSize']
    epochNum = config['model']['epochNum']
    earlyStop = config['model']['earlyStop']
    learningRate = config['model']['learningRate']
    modelSavePath = config['model']['modelSavePath']

    #GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset,
                                batch_size=batchSize,
                                shuffle=True,
                                num_workers=4,
                                collate_fn=pad)

    validIter = data.DataLoader(dataset=validDataset,
                                batch_size=batchSize,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)

    testIter = data.DataLoader(dataset=testDataset,
                               batch_size=batchSize,
                               shuffle=False,
                               num_workers=4,
                               collate_fn=pad)

    if modelName == 'bilstm':
        net = BiLSTM(config)
        train = bilstmTrain
        eval = bilstmEval
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

    if modelName == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        train = bilstmCRFTrain
        eval = bilstmCRFEval

    if modelName == 'transformer_crf':
        net = Transformer_CRF(config)
        train = transformerCRFTrain
        eval = transformerCRFEval

    if modelName == 'cnn':
        net = CNN(config)
        train = cnnTrain
        eval = cnnEval

    net = net.to(DEVICE)

    lossFunction = nn.NLLLoss()
    optimizer = optim.Adam(net.parameters(),
                           lr=learningRate,
                           betas=(0.9, 0.999),
                           eps=1e-08)

    earlyNumber, beforeLoss, maxScore = 0, sys.maxsize, -1

    # start training
    for epoch in range(epochNum):

        print('Iteration %d:' % epoch)

        totalLoss = train(net,
                          trainIter,
                          optimizer=optimizer,
                          criterion=lossFunction,
                          DEVICE=DEVICE)
        print('Training loss: %f' % totalLoss)

        totalLoss, f1Score = eval(net,
                                  validIter,
                                  criterion=lossFunction,
                                  DEVICE=DEVICE)

        if f1Score > maxScore:
            maxScore = f1Score
            torch.save(net.state_dict(), modelSavePath)

        print('Validation loss: %f   f1Score: %f / %f' % (totalLoss, f1Score, maxScore))

        if f1Score < maxScore:
            earlyNumber += 1
            print('earlyStop: %d/%d' % (earlyNumber, earlyStop))
        else:
            earlyNumber = 0
        if earlyNumber >= earlyStop: break
        print('\n')

    # load the best model
    net.load_state_dict(torch.load(modelSavePath))
    totalLoss, f1Score = eval(net,
                              testIter,
                              criterion=lossFunction,
                              DEVICE=DEVICE)
    print('Test loss: %f, f1Score: %f' % (totalLoss, f1Score))
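
The validation loop above implements a simple patience rule: save the model whenever the validation F1 reaches a new best, count consecutive non-improving epochs otherwise, and stop once the counter reaches earlyStop. A compressed, standalone sketch of just that rule (made-up scores):

earlyStop, earlyNumber, maxScore = 3, 0, -1.0
for f1Score in [0.61, 0.64, 0.63, 0.62, 0.60, 0.59]:
    if f1Score > maxScore:
        maxScore = f1Score        # new best: this is where the model would be saved
    if f1Score < maxScore:
        earlyNumber += 1
    else:
        earlyNumber = 0
    if earlyNumber >= earlyStop:
        break                     # patience exhausted
print('best F1: %f' % maxScore)   # 0.640000
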
Example #14
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']
    batchSize = config['model']['batchSize']

    #GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERTestDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset,
                                batch_size=batchSize,
                                shuffle=True,
                                num_workers=6,
                                collate_fn=pad)

    validIter = data.DataLoader(dataset=validDataset,
                                batch_size=batchSize,
                                shuffle=False,
                                num_workers=6,
                                collate_fn=pad)

    testIter = data.DataLoader(dataset=testDataset,
                               batch_size=batchSize,
                               shuffle=False,
                               num_workers=6,
                               collate_fn=testPad)

    if config['modelName'] == 'bilstm':
        net = BiLSTM(config)
        config['modelSavePath'] = config['data']['BiLSTMSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMSubmitDataPath']
        train = bilstm_train
        test = bilstm_test

    if config['modelName'] == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        config['modelSavePath'] = config['data']['BiLSTMCRFSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMCRFSubmitDataPath']
        train = bilstm_crf_train
        test = bilstm_crf_test

    if config['modelName'] == 'transformer_cnn':
        net = Transformer_CNN(config)
        config['modelSavePath'] = config['data']['TransformerCNNSavePath']
        config['submitDataPath'] = config['data'][
            'TransformerCNNSubmitDataPath']
        modelSavePath = config['modelSavePath']
        train = transformer_cnn_train
        test = transformer_cnn_test

    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    net = net.to(DEVICE)

    if os.path.exists(modelSavePath):
        net.load_state_dict(torch.load(modelSavePath))

    #if config['train']:
    #train(net, trainIter, validIter, config)

    #if config['test']:
    test(net, testIter, config)
Example #15
File: train.py  Project: hlr7999/ML
train_data = dsets.MNIST(
    root = '../LSTM/mnist',
    train = True,
    transform = torchvision.transforms.ToTensor(),
    download = DOWNLOAD_MNIST,
)
test_data = dsets.MNIST(root='../LSTM/mnist', train=False)

train_loader = Data.DataLoader(dataset=train_data,
                               batch_size=BATCH_SIZE, shuffle=True)

with torch.no_grad():
    test_x = Variable(torch.unsqueeze(test_data.data, dim=1)).type(torch.FloatTensor)/255
test_y = test_data.targets

lstm = BiLSTM(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, 10).to(device)

optimizer = torch.optim.Adam(lstm.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

for epoch in range(EPOCH):
    for step, (x, y) in enumerate(train_loader):
        b_x = Variable(x.view(-1, TIME_STEP, INPUT_SIZE).to(device))
        b_y = Variable(y.to(device))
 
        output = lstm(b_x)
        loss = loss_func(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
 
Example #16
test_input = input_data[1000:]
test_output = output_data[1000:]

train_lengths = lengths[:1000]
test_lengths = lengths[1000:]

###############
# Build model #
###############

num_classes = 5
data = tf.placeholder(tf.int32, [None, sequence_length])
target = tf.placeholder(tf.float32, [None, sequence_length, num_classes])

model = BiLSTM(data, target, len(vocab) + 1, embedding_size=10, lstm_size=10)

###############
# Train model #
###############

sess = tf.Session()
sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated

batch_size = 20
no_of_batches = len(input_data) // batch_size
epochs = 100
for epoch in range(epochs):
    if epoch % 10 == 0:
        print(sess.run(model.cost,
                       feed_dict={
Example #17
 def build_model(self):
     self.bilstm = BiLSTM(num_classes, word_embedding_size,
                          elmo_embedding_size, batch_size, epochs,
                          init_learning_rate, decay_rate, decay_steps)
     self.bilstm_parent = BiLSTM(num_classes, word_embedding_size,
                                 elmo_embedding_size, batch_size, epochs,
                                 init_learning_rate, decay_rate,
                                 decay_steps)
     with tf.variable_scope('softmax-1', reuse=tf.AUTO_REUSE):
         softmax_w = tf.get_variable(
             'W',
             shape=[2 * feed_forward_op_size, intermediate_layer_size_1],
             initializer=tf.truncated_normal_initializer(),
             dtype=tf.float32)
         softmax_b = tf.get_variable(
             'b',
             initializer=tf.constant_initializer(0.0),
             shape=[intermediate_layer_size_1],
             dtype=tf.float32)
     self.final_state = tf.concat(
         [self.bilstm.final_state, self.bilstm_parent.final_state], 1)
     self.logit = tf.matmul(self.final_state, softmax_w) + softmax_b
     self.logit = tf.nn.relu(self.logit)
     with tf.variable_scope('softmax-2', reuse=tf.AUTO_REUSE):
         softmax_w = tf.get_variable(
             'W',
             shape=[intermediate_layer_size_1, intermediate_layer_size_2],
             initializer=tf.truncated_normal_initializer(),
             dtype=tf.float32)
         softmax_b = tf.get_variable(
             'b',
             initializer=tf.constant_initializer(0.0),
             shape=[intermediate_layer_size_2],
             dtype=tf.float32)
     self.logit = tf.matmul(self.logit, softmax_w) + softmax_b
     self.logit = tf.nn.relu(self.logit)
     with tf.variable_scope('softmax-3', reuse=tf.AUTO_REUSE):
         softmax_w = tf.get_variable(
             'W',
             shape=[intermediate_layer_size_2, intermediate_layer_size_3],
             initializer=tf.truncated_normal_initializer(),
             dtype=tf.float32)
         softmax_b = tf.get_variable(
             'b',
             initializer=tf.constant_initializer(0.0),
             shape=[intermediate_layer_size_3],
             dtype=tf.float32)
     self.logit = tf.matmul(self.logit, softmax_w) + softmax_b
     self.logit = tf.nn.relu(self.logit)
     with tf.variable_scope('softmax-4', reuse=tf.AUTO_REUSE):
         softmax_w = tf.get_variable(
             'W',
             shape=[intermediate_layer_size_3, num_classes],
             initializer=tf.truncated_normal_initializer(),
             dtype=tf.float32)
         softmax_b = tf.get_variable(
             'b',
             initializer=tf.constant_initializer(0.0),
             shape=[num_classes],
             dtype=tf.float32)
     self.logit = tf.matmul(self.logit, softmax_w) + softmax_b
     self.norm_logit = tf.nn.softmax(self.logit)
     self.predictions = tf.cast(tf.math.argmax(self.norm_logit, axis=1),
                                tf.int64)
     self.accuracy = tf.reduce_mean(
         tf.cast(tf.equal(self.predictions, self.bilstm.y), tf.float32))
Example #18
def main(argv):
    print("CUDA_VISIBLE_DEVICES=", os.environ['CUDA_VISIBLE_DEVICES'])

    train_dir = FLAGS.train_dir
    dev_dir = FLAGS.dev_dir
    maps_dir = FLAGS.maps_dir

    if train_dir == '':
        print(
            'Must supply input data directory generated from tsv_to_tfrecords.py'
        )
        sys.exit(1)

    # Doesn't work in newer versions of tf. TODO: fix

    # print('\n'.join(sorted(["%s : %s" % (str(k), str(v)) for k, v in FLAGS.__dict__['__flags'].items()])))

    with open(maps_dir + '/label.txt', 'r') as f:
        labels_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        labels_id_str_map = {i: s for s, i in labels_str_id_map.items()}
        labels_size = len(labels_id_str_map)
    with open(maps_dir + '/token.txt', 'r') as f:
        vocab_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        vocab_id_str_map = {i: s for s, i in vocab_str_id_map.items()}
        vocab_size = len(vocab_id_str_map)
    with open(maps_dir + '/shape.txt', 'r') as f:
        shape_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        shape_id_str_map = {i: s for s, i in shape_str_id_map.items()}
        shape_domain_size = len(shape_id_str_map)
    with open(maps_dir + '/char.txt', 'r') as f:
        char_str_id_map = {
            l.split('\t')[0]: int(l.split('\t')[1].strip())
            for l in f.readlines()
        }
        char_id_str_map = {i: s for s, i in char_str_id_map.items()}
        char_domain_size = len(char_id_str_map)

    # with open(maps_dir + '/sizes.txt', 'r') as f:
    #     num_train_examples = int(f.readline()[:-1])

    print("num classes: %d" % labels_size)

    size_files = [
        maps_dir + "/" + fname for fname in listdir(maps_dir)
        if fname.find("sizes") != -1
    ]
    num_train_examples = 0
    num_tokens = 0
    for size_file in size_files:
        print(size_file)
        with open(size_file, 'r') as f:
            num_train_examples += int(f.readline()[:-1])
            num_tokens += int(f.readline()[:-1])

    print("num train examples: %d" % num_train_examples)
    print("num train tokens: %d" % num_tokens)

    dev_top_dir = '/'.join(
        dev_dir.split("/")[:-2]) if dev_dir.find("*") != -1 else dev_dir
    print(dev_top_dir)
    dev_size_files = [
        dev_top_dir + "/" + fname for fname in listdir(dev_top_dir)
        if fname.find("sizes") != -1
    ]
    num_dev_examples = 0
    num_dev_tokens = 0
    for size_file in dev_size_files:
        print(size_file)
        with open(size_file, 'r') as f:
            num_dev_examples += int(f.readline()[:-1])
            num_dev_tokens += int(f.readline()[:-1])

    print("num dev examples: %d" % num_dev_examples)
    print("num dev tokens: %d" % num_dev_tokens)

    # with open(dev_dir + '/sizes.txt', 'r') as f:
    #     num_dev_examples = int(f.readline()[:-1])

    type_set = {}
    type_int_int_map = {}
    outside_set = ["O", "<PAD>", "<S>", "</S>", "<ZERO>"]
    for label, id in labels_str_id_map.items():
        label_type = label if label in outside_set else label[2:]
        if label_type not in type_set:
            type_set[label_type] = len(type_set)
        type_int_int_map[id] = type_set[label_type]
    print(type_set)

    # load embeddings, if given; initialize in range [-.01, .01]
    embeddings_shape = (vocab_size - 1, FLAGS.embed_dim)
    embeddings = tf_utils.embedding_values(embeddings_shape, old=False)
    embeddings_used = 0
    if FLAGS.embeddings != '':
        with open(FLAGS.embeddings, 'r') as f:
            for line in f.readlines():
                split_line = line.strip().split(" ")
                word = split_line[0]
                embedding = split_line[1:]
                if word in vocab_str_id_map:
                    embeddings_used += 1
                    # shift by -1 because we are going to add a 0 constant vector for the padding later
                    # wrap in list(): in Python 3, map() returns an iterator that
                    # cannot be assigned into a NumPy row
                    embeddings[vocab_str_id_map[word] - 1] = list(
                        map(float, embedding))
                elif word.lower() in vocab_str_id_map:
                    embeddings_used += 1
                    embeddings[vocab_str_id_map[word.lower()] - 1] = list(
                        map(float, embedding))
    print("Loaded %d/%d embeddings (%2.2f%% coverage)" %
          (embeddings_used, vocab_size, embeddings_used / vocab_size * 100))

    layers_map = sorted(json.loads(FLAGS.layers.replace(
        "'", '"')).items()) if FLAGS.model == 'cnn' else None

    pad_width = int(layers_map[0][1]['width'] /
                    2) if layers_map is not None else 1

    with tf.Graph().as_default():
        train_batcher = Batcher(
            train_dir, FLAGS.batch_size) if FLAGS.memmap_train else SeqBatcher(
                train_dir, FLAGS.batch_size)

        dev_batch_size = FLAGS.batch_size  # num_dev_examples
        dev_batcher = SeqBatcher(dev_dir,
                                 dev_batch_size,
                                 num_buckets=0,
                                 num_epochs=1)
        if FLAGS.ontonotes:
            domain_dev_batchers = {
                domain: SeqBatcher(dev_dir.replace('*', domain),
                                   dev_batch_size,
                                   num_buckets=0,
                                   num_epochs=1)
                for domain in ['bc', 'nw', 'bn', 'wb', 'mz', 'tc']
            }

        train_eval_batch_size = FLAGS.batch_size
        train_eval_batcher = SeqBatcher(train_dir,
                                        train_eval_batch_size,
                                        num_buckets=0,
                                        num_epochs=1)

        char_embedding_model = BiLSTMChar(char_domain_size, FLAGS.char_dim, int(FLAGS.char_tok_dim/2)) \
            if FLAGS.char_dim > 0 and FLAGS.char_model == "lstm" else \
            (CNNChar(char_domain_size, FLAGS.char_dim, FLAGS.char_tok_dim, layers_map[0][1]['width'])
                if FLAGS.char_dim > 0 and FLAGS.char_model == "cnn" else None)
        char_embeddings = char_embedding_model.outputs if char_embedding_model is not None else None

        if FLAGS.model == 'cnn':
            model = CNN(num_classes=labels_size,
                        vocab_size=vocab_size,
                        shape_domain_size=shape_domain_size,
                        char_domain_size=char_domain_size,
                        char_size=FLAGS.char_tok_dim,
                        embedding_size=FLAGS.embed_dim,
                        shape_size=FLAGS.shape_dim,
                        nonlinearity=FLAGS.nonlinearity,
                        layers_map=layers_map,
                        viterbi=FLAGS.viterbi,
                        projection=FLAGS.projection,
                        loss=FLAGS.loss,
                        margin=FLAGS.margin,
                        repeats=FLAGS.block_repeats,
                        share_repeats=FLAGS.share_repeats,
                        char_embeddings=char_embeddings,
                        embeddings=embeddings)
        elif FLAGS.model == "bilstm":
            model = BiLSTM(num_classes=labels_size,
                           vocab_size=vocab_size,
                           shape_domain_size=shape_domain_size,
                           char_domain_size=char_domain_size,
                           char_size=FLAGS.char_dim,
                           embedding_size=FLAGS.embed_dim,
                           shape_size=FLAGS.shape_dim,
                           nonlinearity=FLAGS.nonlinearity,
                           viterbi=FLAGS.viterbi,
                           hidden_dim=FLAGS.lstm_dim,
                           char_embeddings=char_embeddings,
                           embeddings=embeddings)
        else:
            print(FLAGS.model + ' is not a valid model type')
            sys.exit(1)

        # Define Training procedure
        global_step = tf.Variable(0, name='global_step', trainable=False)

        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr,
                                           beta1=FLAGS.beta1,
                                           beta2=FLAGS.beta2,
                                           epsilon=FLAGS.epsilon,
                                           name="optimizer")

        model_vars = tf.global_variables()

        print("model vars: %d" % len(model_vars))
        print([v.name for v in model_vars])

        # todo put in func
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print("Total trainable parameters: %d" % (total_parameters))

        if FLAGS.clip_norm > 0:
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(model.loss, model_vars), FLAGS.clip_norm)
            train_op = optimizer.apply_gradients(zip(grads, model_vars),
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(model.loss,
                                          global_step=global_step,
                                          var_list=model_vars)

        tf.global_variables_initializer()

        opt_vars = [
            optimizer.get_slot(s, n) for n in optimizer.get_slot_names()
            for s in model_vars if optimizer.get_slot(s, n) is not None
        ]
        model_vars += opt_vars

        if FLAGS.load_dir:
            reader = tf.train.NewCheckpointReader(FLAGS.load_dir + ".tf")
            saved_var_map = reader.get_variable_to_shape_map()
            intersect_vars = [
                k for k in tf.global_variables()
                if k.name.split(':')[0] in saved_var_map
                and k.get_shape() == saved_var_map[k.name.split(':')[0]]
            ]
            leftovers = [
                k for k in tf.global_variables()
                if k.name.split(':')[0] not in saved_var_map
                or k.get_shape() != saved_var_map[k.name.split(':')[0]]
            ]
            print("WARNING: Loading pretrained model, but not loading: ",
                  map(lambda v: v.name, leftovers))
            loader = tf.train.Saver(var_list=intersect_vars)

        else:
            loader = tf.train.Saver(var_list=model_vars)

        saver = tf.train.Saver(var_list=model_vars)

        sv = tf.train.Supervisor(
            logdir=FLAGS.model_dir if FLAGS.model_dir != '' else None,
            global_step=global_step,
            saver=None,
            save_model_secs=0,
            save_summaries_secs=0)

        training_start_time = time.time()
        with sv.managed_session(
                FLAGS.master,
                config=tf.ConfigProto(allow_soft_placement=True)) as sess:

            def run_evaluation(eval_batches, extra_text=""):
                predictions = []
                for b, (eval_label_batch, eval_token_batch, eval_shape_batch,
                        eval_char_batch, eval_seq_len_batch,
                        eval_tok_len_batch,
                        eval_mask_batch) in enumerate(eval_batches):
                    batch_size, batch_seq_len = eval_token_batch.shape

                    char_lens = np.sum(eval_tok_len_batch, axis=1)
                    max_char_len = np.max(eval_tok_len_batch)
                    eval_padded_char_batch = np.zeros(
                        (batch_size, max_char_len * batch_seq_len))
                    for b in range(batch_size):
                        char_indices = [
                            item for sublist in [
                                range(i * max_char_len, i * max_char_len + d)
                                for i, d in enumerate(eval_tok_len_batch[b])
                            ] for item in sublist
                        ]
                        eval_padded_char_batch[
                            b,
                            char_indices] = eval_char_batch[b][:char_lens[b]]

                    char_embedding_feeds = {} if FLAGS.char_dim == 0 else {
                        char_embedding_model.input_chars:
                        eval_padded_char_batch,
                        char_embedding_model.batch_size: batch_size,
                        char_embedding_model.max_seq_len: batch_seq_len,
                        char_embedding_model.token_lengths: eval_tok_len_batch,
                        char_embedding_model.max_tok_len: max_char_len
                    }

                    basic_feeds = {
                        model.input_x1: eval_token_batch,
                        model.input_x2: eval_shape_batch,
                        model.input_y: eval_label_batch,
                        model.input_mask: eval_mask_batch,
                        model.max_seq_len: batch_seq_len,
                        model.batch_size: batch_size,
                        model.sequence_lengths: eval_seq_len_batch
                    }

                    basic_feeds.update(char_embedding_feeds)
                    total_feeds = basic_feeds.copy()

                    if FLAGS.viterbi:
                        preds, transition_params = sess.run(
                            [model.predictions, model.transition_params],
                            feed_dict=total_feeds)

                        viterbi_repad = np.empty((batch_size, batch_seq_len))
                        for batch_idx, (unary_scores,
                                        sequence_lens) in enumerate(
                                            zip(preds, eval_seq_len_batch)):
                            viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
                                unary_scores, transition_params)
                            viterbi_repad[batch_idx] = viterbi_sequence
                        predictions.append(viterbi_repad)
                    else:
                        preds, scores = sess.run(
                            [model.predictions, model.unflat_scores],
                            feed_dict=total_feeds)
                        predictions.append(preds)

                if FLAGS.print_preds != '':
                    evaluation.print_conlleval_format(
                        FLAGS.print_preds, eval_batches, predictions,
                        labels_id_str_map, vocab_id_str_map, pad_width)

                # print evaluation
                f1_micro, precision = evaluation.segment_eval(
                    eval_batches,
                    predictions,
                    type_set,
                    type_int_int_map,
                    labels_id_str_map,
                    vocab_id_str_map,
                    outside_idx=[
                        type_set[t] if t in type_set else type_set["O"]
                        for t in outside_set
                    ],
                    pad_width=pad_width,
                    start_end=FLAGS.start_end,
                    extra_text="Segment evaluation %s:" % extra_text)

                return f1_micro, precision

            threads = tf.train.start_queue_runners(sess=sess)
            log_every = int(max(100, num_train_examples / 5))

            if FLAGS.load_dir != '':
                print("Deserializing model: " + FLAGS.load_dir + ".tf")
                loader.restore(sess, FLAGS.load_dir + ".tf")

            def get_dev_batches(seq_batcher):
                batches = []
                # load all the dev batches into memory
                done = False
                while not done:
                    try:
                        dev_batch = sess.run(seq_batcher.next_batch_op)
                        dev_label_batch, dev_token_batch, dev_shape_batch, dev_char_batch, dev_seq_len_batch, dev_tok_len_batch = dev_batch
                        mask_batch = np.zeros(dev_token_batch.shape)
                        actual_seq_lens = np.add(
                            np.sum(dev_seq_len_batch,
                                   axis=1), (2 if FLAGS.start_end else 1) *
                            pad_width * ((dev_seq_len_batch != 0).sum(axis=1) +
                                         (0 if FLAGS.start_end else 1)))
                        for i, seq_len in enumerate(actual_seq_lens):
                            mask_batch[i, :seq_len] = 1
                        batches.append(
                            (dev_label_batch, dev_token_batch, dev_shape_batch,
                             dev_char_batch, dev_seq_len_batch,
                             dev_tok_len_batch, mask_batch))
                    except Exception:  # queue exhausted (e.g. tf.errors.OutOfRangeError)
                        done = True
                return batches

            dev_batches = get_dev_batches(dev_batcher)
            if FLAGS.ontonotes:
                domain_batches = {
                    domain: get_dev_batches(domain_batcher)
                    for domain, domain_batcher in
                    domain_dev_batchers.items()
                }

            train_batches = []
            if FLAGS.train_eval:
                # load all the train batches into memory
                done = False
                while not done:
                    try:
                        train_batch = sess.run(
                            train_eval_batcher.next_batch_op)
                        train_label_batch, train_token_batch, train_shape_batch, train_char_batch, train_seq_len_batch, train_tok_len_batch = train_batch
                        mask_batch = np.zeros(train_token_batch.shape)
                        actual_seq_lens = np.add(
                            np.sum(train_seq_len_batch, axis=1),
                            (2 if FLAGS.start_end else 1) * pad_width *
                            ((train_seq_len_batch != 0).sum(axis=1) +
                             (0 if FLAGS.start_end else 1)))
                        for i, seq_len in enumerate(actual_seq_lens):
                            mask_batch[i, :seq_len] = 1
                        train_batches.append(
                            (train_label_batch, train_token_batch,
                             train_shape_batch, train_char_batch,
                             train_seq_len_batch, train_tok_len_batch,
                             mask_batch))
                    except Exception as e:
                        done = True
            if FLAGS.memmap_train:
                train_batcher.load_and_bucket_data(sess)

            def train(max_epochs,
                      best_score,
                      model_hidden_drop,
                      model_input_drop,
                      until_convergence,
                      max_lower=6,
                      min_iters=20):
                print("Training on %d sentences (%d examples)" %
                      (num_train_examples, num_train_examples))
                start_time = time.time()
                train_batcher._step = 1.0
                converged = False
                examples = 0
                log_every_running = log_every
                epoch_loss = 0.0
                num_lower = 0
                training_iteration = 0
                speed_num = 0.0
                speed_denom = 0.0
                while not sv.should_stop(
                ) and training_iteration < max_epochs and not (
                        until_convergence and converged):
                    # evaluate
                    if examples >= num_train_examples:
                        training_iteration += 1

                        if FLAGS.train_eval:
                            run_evaluation(
                                train_batches,
                                "TRAIN (iteration %d)" % training_iteration)
                        print()
                        f1_micro, precision = run_evaluation(
                            dev_batches,
                            "TEST (iteration %d)" % training_iteration)
                        print("Avg training speed: %f examples/second" %
                              (speed_num / speed_denom))

                        # keep track of running best / convergence heuristic
                        if f1_micro > best_score:
                            best_score = f1_micro
                            num_lower = 0
                            if FLAGS.model_dir != '' and best_score > FLAGS.save_min:
                                save_path = saver.save(sess,
                                                       FLAGS.model_dir + ".tf")
                                print("Serialized model: %s" % save_path)
                        else:
                            num_lower += 1
                        if num_lower > max_lower and training_iteration > min_iters:
                            converged = True

                        # update per-epoch variables
                        log_every_running = log_every
                        examples = 0
                        epoch_loss = 0.0
                        start_time = time.time()

                    if examples > log_every_running:
                        speed_denom += time.time() - start_time
                        speed_num += examples
                        evaluation.print_training_error(
                            examples, start_time, [epoch_loss],
                            train_batcher._step)
                        log_every_running += log_every

                    # Training iteration

                    label_batch, token_batch, shape_batch, char_batch, seq_len_batch, tok_lengths_batch = \
                        train_batcher.next_batch() if FLAGS.memmap_train else sess.run(train_batcher.next_batch_op)

                    # make mask out of seq lens
                    batch_size, batch_seq_len = token_batch.shape

                    char_lens = np.sum(tok_lengths_batch, axis=1)
                    max_char_len = np.max(tok_lengths_batch)
                    padded_char_batch = np.zeros(
                        (batch_size, max_char_len * batch_seq_len))
                    for b in range(batch_size):
                        char_indices = [
                            item for sublist in [
                                range(i * max_char_len, i * max_char_len + d)
                                for i, d in enumerate(tok_lengths_batch[b])
                            ] for item in sublist
                        ]
                        padded_char_batch[
                            b, char_indices] = char_batch[b][:char_lens[b]]

                    max_sentences = max(map(len, seq_len_batch))
                    new_seq_len_batch = np.zeros((batch_size, max_sentences))
                    for i, seq_len_list in enumerate(seq_len_batch):
                        new_seq_len_batch[i, :len(seq_len_list)] = seq_len_list
                    seq_len_batch = new_seq_len_batch
                    num_sentences_batch = np.sum(seq_len_batch != 0, axis=1)

                    mask_batch = np.zeros(
                        (batch_size, batch_seq_len)).astype("int")
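                    # true padded length per example: sum of sentence lengths plus pad_width slots around/between sentences (doubled when explicit start/end markers are used)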
                    actual_seq_lens = np.add(
                        np.sum(seq_len_batch, axis=1),
                        (2 if FLAGS.start_end else 1) * pad_width *
                        (num_sentences_batch +
                         (0 if FLAGS.start_end else 1))).astype('int')
                    for i, seq_len in enumerate(actual_seq_lens):
                        mask_batch[i, :seq_len] = 1
                    examples += batch_size

                    # apply word dropout
                    # create word dropout mask
                    word_probs = np.random.random(token_batch.shape)
                    drop_indices = np.where(
                        (word_probs > FLAGS.word_dropout)
                        & (token_batch != vocab_str_id_map["<PAD>"]))
                    token_batch[drop_indices[0],
                                drop_indices[1]] = vocab_str_id_map["<OOV>"]

                    char_embedding_feeds = {} if FLAGS.char_dim == 0 else {
                        char_embedding_model.input_chars:
                        padded_char_batch,
                        char_embedding_model.batch_size:
                        batch_size,
                        char_embedding_model.max_seq_len:
                        batch_seq_len,
                        char_embedding_model.token_lengths:
                        tok_lengths_batch,
                        char_embedding_model.max_tok_len:
                        max_char_len,
                        char_embedding_model.input_dropout_keep_prob:
                        FLAGS.char_input_dropout
                    }

                    if FLAGS.model == "cnn":
                        cnn_feeds = {
                            model.input_x1: token_batch,
                            model.input_x2: shape_batch,
                            model.input_y: label_batch,
                            model.input_mask: mask_batch,
                            model.max_seq_len: batch_seq_len,
                            model.sequence_lengths: seq_len_batch,
                            model.batch_size: batch_size,
                            model.hidden_dropout_keep_prob: model_hidden_drop,
                            model.input_dropout_keep_prob: model_input_drop,
                            model.middle_dropout_keep_prob:
                            FLAGS.middle_dropout,
                            model.l2_penalty: FLAGS.l2,
                            model.drop_penalty: FLAGS.regularize_drop_penalty,
                        }
                        cnn_feeds.update(char_embedding_feeds)
                        _, loss = sess.run([train_op, model.loss],
                                           feed_dict=cnn_feeds)
                    elif FLAGS.model == "bilstm":
                        lstm_feed = {
                            model.input_x1: token_batch,
                            model.input_x2: shape_batch,
                            model.input_y: label_batch,
                            model.input_mask: mask_batch,
                            model.sequence_lengths: seq_len_batch,
                            model.max_seq_len: batch_seq_len,
                            model.batch_size: batch_size,
                            model.hidden_dropout_keep_prob:
                            FLAGS.hidden_dropout,
                            model.middle_dropout_keep_prob:
                            FLAGS.middle_dropout,
                            model.input_dropout_keep_prob: FLAGS.input_dropout,
                            model.l2_penalty: FLAGS.l2,
                            model.drop_penalty: FLAGS.regularize_drop_penalty
                        }
                        lstm_feed.update(char_embedding_feeds)
                        _, loss = sess.run([train_op, model.loss],
                                           feed_dict=lstm_feed)
                    epoch_loss += loss
                    train_batcher._step += 1
                return best_score, training_iteration, speed_num / speed_denom

            if FLAGS.evaluate_only:
                if FLAGS.train_eval:
                    run_evaluation(train_batches, "(train)")
                print()
                run_evaluation(dev_batches, "(test)")
                if FLAGS.ontonotes:
                    for domain, batches in domain_batches.iteritems():
                        print()
                        run_evaluation(batches, "(test - domain: %s)" % domain)

            else:
                best_score, training_iteration, train_speed = train(
                    FLAGS.max_epochs,
                    0.0,
                    FLAGS.hidden_dropout,
                    FLAGS.input_dropout,
                    until_convergence=FLAGS.until_convergence)
                if FLAGS.model_dir:
                    print("Deserializing model: " + FLAGS.model_dir + ".tf")
                    saver.restore(sess, FLAGS.model_dir + ".tf")

            sv.coord.request_stop()
            sv.coord.join(threads)
            sess.close()

            total_time = time.time() - training_start_time
            if FLAGS.evaluate_only:
                print("Testing time: %d seconds" % (total_time))
            else:
                print(
                    "Training time: %d minutes, %d iterations (%3.2f minutes/iteration)"
                    % (total_time / 60, training_iteration, total_time /
                       (60 * training_iteration)))
                print("Avg training speed: %f examples/second" % (train_speed))
                print("Best dev F1: %2.2f" % (best_score * 100))
Ejemplo n.º 19
0
        acc_test, pred = sess.run([bi_lstm.accuracy, bi_lstm.label_out], feed_dict=feed_dict)
        return acc_test, pred, Y


graph = tf.Graph()
with graph.as_default():
    # session_conf = tf.ConfigProto(
    #   allow_soft_placement=allow_soft_placement,
    #   log_device_placement=log_device_placement)
    sess = tf.Session()
    with sess.as_default():
        bi_lstm = BiLSTM(
                num_hidden=num_hidden,
                num_classes=num_classes,
                voc_dim=vocsize,
                emb_dim=embedding_dim,
                sent_max_len=max_sent_length,
                tag_voc_dim=tag_voc_size,
                tags=POS_emb in [1, 2],
                external=pre_training,
                update=emb_update)
        saver = tf.train.Saver(tf.all_variables())
        # saver.restore(sess, checkpoint_file)
        # print "Model restored!"

        # load model from last checkpoint
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        saver.restore(sess,checkpoint_file)
        print "Model restored!"
        # Collect the predictions here
        test_tot_acc = []
        preds_test, gold_test = [],[]
Ejemplo n.º 20
0
def main():
    d = Dataset()
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--continue',
                        dest='continue_path',
                        required=False)
    args = parser.parse_args()

    ## build graph
    network = BiLSTM()
    placeholders, loss, viterbi_sequence, label = network.build()

    # loss_reg = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    ## train config
    global_steps = tf.Variable(0, trainable=False)
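    # piecewise-constant learning rate: 0.01 for the first 15 epochs, 0.001 until epoch 40, then 0.0005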
    boundaries = [
        config.train_size // config.batch_size * 15,
        config.train_size // config.batch_size * 40
    ]
    values = [0.01, 0.001, 0.0005]
    lr = tf.train.piecewise_constant(global_steps, boundaries, values)
    opt = tf.train.AdamOptimizer(lr)
    # in order to update BN in every iter
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train = opt.minimize(loss)

    ## init tensorboard
    # tf.summary.scalar('loss_regularization', loss_reg)
    # tf.summary.scalar('loss_crossEntropy', loss - loss_reg)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('learning_rate', lr)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(
        os.path.join(config.log_dir, 'tf_log', 'train'),
        tf.get_default_graph())
    test_writer = tf.summary.FileWriter(
        os.path.join(config.log_dir, 'tf_log', 'validation'),
        tf.get_default_graph())

    ## create a session
    tf.set_random_seed(12345)  # ensure consistent results
    global_cnt = 0
    epoch_start = 0
    g_list = tf.global_variables()
    saver = tf.train.Saver(var_list=g_list)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())  # init all variables
        if args.continue_path:  # load a model snapshot
            ckpt = tf.train.get_checkpoint_state(args.continue_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
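            # checkpoints below are saved as 'epoch-<epoch>-<global_step>', so the epoch number can be recovered from the filename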
            epoch_start = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[1])
            global_cnt = epoch_start * config.train_size // config.batch_size

        ## training
        for epoch in range(epoch_start + 1, config.nr_epoch + 1):
            for _ in range(config.train_size // config.batch_size):
                global_cnt += 1
                images, labels, seq_len = d.one_batch_train().__next__()
                feed_dict = {
                    placeholders['data']: images,
                    placeholders['label']: labels,
                    global_steps: global_cnt,
                    placeholders['is_training']: True,
                    placeholders['sequence_lengths']: seq_len
                }
                _, loss_v, lr_v, summary, v, y = sess.run(
                    [train, loss, lr, merged, viterbi_sequence, label],
                    feed_dict=feed_dict)

                if global_cnt % config.show_interval == 0:
                    precision, recall = cal(v, y)
                    train_writer.add_summary(summary, global_cnt)
                    print(
                        "e:{},{}/{}".format(
                            epoch, (global_cnt % config.train_size) //
                            config.batch_size,
                            config.train_size // config.batch_size),
                        'loss: {:.3f}'.format(loss_v),
                        'precision: {:.3f}'.format(precision),
                        'recall: {:.3f}'.format(recall))

            ## validation
            if epoch % config.test_interval == 0:
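                # average the loss over all validation batches; the precision/recall printed below come from the last batch only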
                loss_sum = 0
                for i in range(config.val_size // config.batch_size):
                    images, labels, seq_len = d.one_batch_val().__next__()
                    feed_dict = {
                        placeholders['data']: images,
                        placeholders['label']: labels,
                        global_steps: global_cnt,
                        placeholders['is_training']: False,
                        placeholders['sequence_lengths']: seq_len
                    }
                    loss_v, summary, v, y = sess.run(
                        [loss, merged, viterbi_sequence, label],
                        feed_dict=feed_dict)
                    loss_sum += loss_v

                    precision, recall = cal(v, y)
                test_writer.add_summary(summary, global_cnt)
                print("\n**************Validation results****************")
                print(
                    'loss_avg: {:.3f}'.format(
                        loss_sum / (config.val_size // config.batch_size)),
                    'precision: {:.3f}'.format(precision),
                    'recall: {:.3f}'.format(recall),
                )
                print("************************************************\n")

            ## save model
            if epoch % config.snapshot_interval == 0:
                saver.save(sess,
                           os.path.join(config.log_model_dir,
                                        'epoch-{}'.format(epoch)),
                           global_step=global_cnt)

        print('Training is done, exit.')
Ejemplo n.º 21
0
        embeddings_used += 1
print("Loaded %d/%d embeddings (%2.2f%% coverage)" %
      (embeddings_used, vocab_size, embeddings_used / vocab_size * 100) + '\n')

if char_size > 0:
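    # character-level BiLSTM over tokens; hidden size is char_tok_size/2, presumably so the concatenated forward/backward states give char_tok_size dimensions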
    char_embedding_model = BiLSTMChar(char_domain_size, char_size,
                                      int(char_tok_size / 2))
    char_embeddings = char_embedding_model.outputs

model = BiLSTM(num_classes_A=labels_cdr_size,
               num_classes_B=labels_bc_size,
               vocab_size=vocab_size,
               shape_domain_size=shape_domain_size,
               char_domain_size=char_domain_size,
               char_size=char_size,
               embedding_size=embedding_size,
               shape_size=shape_size,
               nonlinearity=nonlinearity,
               viterbi=viterbi,
               hidden_dim=hidden_size,
               char_embeddings=char_embeddings,
               embeddings=embeddings)

type_set_A = {}
type_int_int_map_A = {}

type_set_B = {}
type_int_int_map_B = {}

outside_set = ["O", "<PAD>", "<S>", "</S>", "<ZERO>"]
Ejemplo n.º 22
0
class FeatureExtractor(object):
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2
                      for word, ind in w2i.iteritems()
                      }  # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1
                      for ind, char in enumerate(ch)}  # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs
        if langs:
            self.langs = {lang: ind + 1
                          for ind, lang in enumerate(langs)
                          }  # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)

        lstm_input_size = self.word_emb_size + (self.edim if self.external_embedding is\
                                                not None else 0) + (self.lang_emb_size if
                                                                    self.multiling else 0) + 2 * self.char_lstm_output_size

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size,
                                  self.lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                                  self.lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)

        self.char_bilstm = BiLSTM(self.char_emb_size,
                                  self.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters(
            (len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters(
            (len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters(
                (len(langs) + 1, self.lang_emb_size))

        #used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters(
            (self.char_lstm_output_size * 2))

    def Init(self):
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingLangVec = self.langslookup[
            0] if self.multiling and self.lang_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() * dy.concatenate(
            filter(
                None,
                [paddingWordVec, evec,
                 self.chPadding.expr(), paddingLangVec])) +
                                  self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate(
            [self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train):
        for root in sentence:
            wordcount = float(self.wordsCount.get(root.norm, 0))
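            # frequency-based word dropout (training only): keep the real form with probability count/(0.25+count), otherwise use index 0 (OOV)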
            noDropFlag = not train or (random.random() <
                                       (wordcount / (0.25 + wordcount)))
            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)
                                            ) if noDropFlag else 0]
            self.get_char_vector(root, train)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            if self.multiling:
                root.langvec = self.langslookup[self.langs[
                    root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            root.vec = dy.concatenate(
                filter(None,
                       [root.wordvec, root.evec, root.chVec, root.langvec]))
        if not self.disableBilstm:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train):
        if root.form == "*root*":  # no point running a character analysis over this placeholder token
            root.chVec = self.chPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.form:
                char_vecs.append(self.clookup[self.chars.get(char, 0)])
            root.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def get_external_embeddings(self, external_embedding_file):
        external_embedding_fp = codecs.open(external_embedding_file,
                                            'r',
                                            encoding='utf-8')
        external_embedding_fp.readline()
        self.external_embedding = {}
        for line in external_embedding_fp:
            line = line.strip().split()
            self.external_embedding[line[0]] = [float(f) for f in line[1:]]

        external_embedding_fp.close()

        self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)]  #???
        self.extrnd = {
            word: i + 3
            for i, word in enumerate(self.external_embedding)
        }
        self.elookup = self.model.add_lookup_parameters(
            (len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print 'Load external embedding. Vector dimensions', self.edim
Ejemplo n.º 23
0
    def __init__(self, model, options, vocab, nnvecs):

        self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        #why not just len(self.words)?
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        if (options.ext_emb_dir
                or options.ext_emb_file) and not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            for lang in langs:
                if options.word_emb_size > 0:
                    self.external_embedding["words"].update(
                        utils.get_external_embeddings(options, lang,
                                                      self.words.viewkeys()))
                if options.char_emb_size > 0:
                    self.external_embedding["chars"].update(
                        utils.get_external_embeddings(options,
                                                      lang,
                                                      self.chars,
                                                      chars=True))
            self.init_lookups(options)

        self.lstm_input_size = options.word_emb_size + options.pos_emb_size + options.tbank_emb_size +\
            2* (options.char_lstm_output_size if options.char_emb_size > 0 else 0)

        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            if options.unidir_lstm is not None:
                #replace the BiLSTMs with unidirectional ones
                #it's ugly to still call it bilstm but easier
                if options.unidir_lstm:
                    self.bilstms.append(
                        LSTM(self.lstm_input_size,
                             2 * options.lstm_output_size,
                             self.model,
                             dropout_rate=0.33,
                             direction=options.unidir_lstm,
                             layers=options.no_bilstms))
            else:
                self.bilstms.append(
                    BiLSTM(self.lstm_input_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
                for i in range(1, options.no_bilstms):
                    self.bilstms.append(
                        BiLSTM(2 * options.lstm_output_size,
                               options.lstm_output_size,
                               self.model,
                               dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

        #recursive composition things
        if options.use_recursive_composition:
            deprel_dir = [(rel, direction) for rel in self.irels
                          for direction in [0, 1]]
            extra_deprel = 1  # padding rel vec
            #this does not work
            self.ideprel_dir = {
                val: ind
                for ind, val in enumerate(deprel_dir, extra_deprel)
            }
            self.deprel_lookup = self.model.add_lookup_parameters(
                (len(self.ideprel_dir) + extra_deprel, options.deprel_size))
            lstm_out_dim = options.lstm_output_size * 2
            if options.use_recursive_composition == 'RecNN':
                self.hCompos = self.model.add_parameters(
                    (lstm_out_dim, lstm_out_dim))
                self.dCompos = self.model.add_parameters(
                    (lstm_out_dim, lstm_out_dim))
                self.rCompos = self.model.add_parameters(
                    (lstm_out_dim, options.deprel_size))
                self.biasCompos = self.model.add_parameters((lstm_out_dim))
            else:
                compos_in_dim = lstm_out_dim * 2 + options.deprel_size
                self.composLSTM = dy.VanillaLSTMBuilder(
                    1, compos_in_dim, lstm_out_dim, self.model)
Ejemplo n.º 24
0
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2
                      for word, ind in w2i.iteritems()
                      }  # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1
                      for ind, char in enumerate(ch)}  # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs
        if langs:
            self.langs = {lang: ind + 1
                          for ind, lang in enumerate(langs)
                          }  # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)

        lstm_input_size = self.word_emb_size + (self.edim if self.external_embedding is\
                                                not None else 0) + (self.lang_emb_size if
                                                                    self.multiling else 0) + 2 * self.char_lstm_output_size

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size,
                                  self.lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                                  self.lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)

        self.char_bilstm = BiLSTM(self.char_emb_size,
                                  self.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters(
            (len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters(
            (len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters(
                (len(langs) + 1, self.lang_emb_size))

        #used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters(
            (self.char_lstm_output_size * 2))
Ejemplo n.º 25
0
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs):

        self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        if (options.ext_emb_dir
                or options.ext_emb_file) and not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            for lang in langs:
                if options.word_emb_size > 0:
                    self.external_embedding["words"].update(
                        utils.get_external_embeddings(options, lang,
                                                      self.words.viewkeys()))
                if options.char_emb_size > 0:
                    self.external_embedding["chars"].update(
                        utils.get_external_embeddings(options,
                                                      lang,
                                                      self.chars,
                                                      chars=True))
            self.init_lookups(options)

        self.lstm_input_size = options.word_emb_size + options.pos_emb_size + options.tbank_emb_size +\
            2* (options.char_lstm_output_size if options.char_emb_size > 0 else 0)

        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[
            1] if options.word_emb_size > 0 else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr(
        ) if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[
            0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *\
            dy.concatenate(filter(None,[paddingWordVec,
                                        paddingPosVec,
                                        paddingCharVec,
                                        paddingTbankVec])) + self.word2lstmbias.expr())

        self.empty = self.paddingVec if self.nnvecs == 1 else\
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):
        for root in sentence:
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if not treebank_id in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["pos"], root.vecs["char"],
                    root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):

        if root.char_rep == "*root*":  # no point running a character analysis over this placeholder token
            return self.charPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
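            # per-character lookup: trained embedding if known, external test-time embedding as fallback, otherwise the OOV row (index 0)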
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(
                        dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):

        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"]
                       [word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"]
                       [char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
Ejemplo n.º 26
0
class FeatureExtractor(object):
    def __init__(self, model, wordsCount, rels, langs, words, ch, nnvecs, options):
        """
        Options handling
        """
        self.model = model
        if langs:
            self.langs = {lang: ind+1 for ind, lang in enumerate(langs)} # +1 for padding vector
        else:
            self.langs = None
        self.nnvecs = nnvecs
        self.multiling = options.multiling #and options.use_lembed
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding, model, wordsCount)
        self.disable_bilstm = options.disable_bilstm
        self.disable_second_bilstm = options.disable_second_bilstm

        """sharing"""
        self.shareBiLSTM = options.shareBiLSTM
        self.shareWordLookup = options.shareWordLookup
        self.shareCharLookup = options.shareCharLookup
        self.shareCharBiLSTM = options.shareCharBiLSTM
        self.word_lembed = options.lembed_word
        self.char_lembed = options.lembed_char

        """dims"""
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.lang_emb_size = options.lang_emb_size

        lstm_input_size = self.word_emb_size + (self.edim if self.external_embedding is\
                          not None else 0) + (self.lang_emb_size if self.word_lembed else 0)\
                          + 2 * self.char_lstm_output_size

        """UTILS"""
        self.wordsCount = wordsCount
        self.irels = rels

        if self.multiling and not self.shareWordLookup:
            w2i = {}
            for lang in self.langs:
                 w2i[lang] = {w: i for i, w in enumerate(words[lang])}
            self.vocab = {}
            for lang in self.langs:
                self.vocab[lang] = {word: ind+2 for word, ind in w2i[lang].iteritems()}

        else:
            w2i = {w: i for i, w in enumerate(words)}
            self.vocab = {word: ind+2 for word, ind in w2i.iteritems()} # +2 for MLP padding vector and OOV vector

        if not self.multiling or self.shareCharLookup:
            self.chars = {char: ind+1 for ind, char in enumerate(ch)} # +1 for OOV vector
        else:
            self.chars = {}
            for lang in self.langs:
                self.chars[lang] = {char: ind+1 for ind, char in enumerate(ch[lang])}
        self.rels = {word: ind for ind, word in enumerate(rels)}

        """BILSTMS"""
        #word
        if not self.multiling or self.shareBiLSTM:
            if not self.disable_bilstm:
                self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                      dropout_rate=0.33)
                if not self.disable_second_bilstm:
                    self.bilstm2 = BiLSTM(2* self.lstm_output_size, self.lstm_output_size, model,
                                          dropout_rate=0.33)
            else:
                self.lstm_output_size = int(lstm_input_size * 0.5)
        else:
            self.bilstm1= {}
            self.bilstm2= {}
            for lang in self.langs:
                self.bilstm1[lang] = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                      dropout_rate=0.33)
                self.bilstm2[lang] = BiLSTM(2* self.lstm_output_size, self.lstm_output_size, model,
                                            dropout_rate=0.33)

        #char
        if self.char_lembed:
            char_in_dims = self.char_emb_size + self.lang_emb_size
        else:
            char_in_dims = self.char_emb_size

        if not self.multiling or self.shareCharBiLSTM:
            self.char_bilstm = BiLSTM(char_in_dims,self.char_lstm_output_size,self.model,dropout_rate=0.33)
        else:
            self.char_bilstms = {}
            for lang in self.langs:
                self.char_bilstms[lang] = BiLSTM(char_in_dims,self.char_lstm_output_size,self.model,dropout_rate=0.33)

        """LOOKUPS"""
        if not self.multiling or self.shareCharLookup:
            self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        else:
            self.clookups = {}
            for lang in self.langs:
                self.clookups[lang] = self.model.add_lookup_parameters((len(ch[lang]) + 1, self.char_emb_size))

        if not self.multiling or self.shareWordLookup:
            self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        else:
            self.wlookups = {}
            for lang in self.langs:
                self.wlookups[lang] = self.model.add_lookup_parameters((len(words[lang]) + 2, self.word_emb_size))

        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))


        """Padding"""
        self.word2lstm = model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = model.add_parameters((self.lstm_output_size *2))
        self.chPadding = model.add_parameters((self.char_lstm_output_size *2))

    def get_char_vec(self,word,dropout,lang=None,langvec=None):
        if word.form == "*root*":
            word.chVec = self.chPadding.expr() # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in word.form:
                if lang:
                    cvec = self.clookups[lang][self.chars[lang].get(char,0)]
                else:
                    cvec = self.clookup[self.chars.get(char,0)]
                if langvec:
                    char_vecs.append(dy.concatenate([langvec,cvec]))
                else:
                    char_vecs.append(cvec)
            if lang:
                word.chVec = self.char_bilstms[lang].get_sequence_vector(char_vecs,dropout)
            else:
                word.chVec = self.char_bilstm.get_sequence_vector(char_vecs,dropout)

    def Init(self):
        #TODO: This function makes me cry
        #I'm not sure how necessary it is to get different padding vecs
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None
        if not self.multiling or self.shareWordLookup:
            paddingWordVec = self.wlookup[1]
            #import ipdb;ipdb.set_trace()
            self.paddingVec = dy.tanh(self.word2lstm.expr() * dy.concatenate(filter(None,
                                                                              [paddingWordVec,
                                                                               evec,
                                                                               self.chPadding.expr(),
                                                                               paddingLangVec if self.word_lembed else None]))
                                                                              + self.word2lstmbias.expr() )
            self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])
        else:
            paddingWordVecs = {}
            self.paddingVecs = {}
            self.emptyVecs = {}
            for lang in self.langs:
                paddingWordVecs[lang] = self.wlookups[lang][1]
                self.paddingVecs[lang] = dy.tanh(self.word2lstm.expr() * dy.concatenate(filter(None,
                                                                                        [paddingWordVecs[lang],
                                                                                         evec,
                                                                                         self.chPadding.expr(),
                                                                                         paddingLangVec if self.word_lembed else None]))
                                                                                          + self.word2lstmbias.expr() )
                self.emptyVecs[lang] = self.paddingVecs[lang] if self.nnvecs == 1 else dy.concatenate([self.paddingVecs[lang] for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, get_vectors=False):

        lang = sentence[0].language_id

        for root in sentence:
            #word
            if not self.multiling or self.shareWordLookup:
                wordcount = float(self.wordsCount.get(root.norm, 0))
            else:
                wordcount = float(self.wordsCount[lang].get(root.norm, 0))

            noDropFlag =  not train or (random.random() < (wordcount/(0.25+wordcount)))
            if not self.multiling or self.shareWordLookup:
                root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            else:
                root.wordvec = self.wlookups[lang][int(self.vocab[lang].get(root.norm, 0)) if noDropFlag else 0]

            if self.multiling and self.word_lembed:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            #char
            if not self.multiling or self.shareCharBiLSTM:
                if self.char_lembed:
                    langvec = self.langslookup[self.langs[lang]]
                    self.get_char_vec(root,train, langvec=langvec)
                else:
                    self.get_char_vec(root,train)

            else:
                self.get_char_vec(root,train, lang=lang)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            root.vec = dy.concatenate(filter(None, [root.wordvec,
                                                    root.evec,
                                                    root.chVec,
                                                    root.langvec]))

        if not self.multiling or self.shareBiLSTM:
            self.bilstm1.set_token_vecs(sentence,train)
            self.bilstm2.set_token_vecs(sentence,train)
        else:
            self.bilstm1[lang].set_token_vecs(sentence,train)
            self.bilstm2[lang].set_token_vecs(sentence,train)

        if get_vectors:
            data_vec = list()
            for i, token in enumerate(sentence):
                if token.form != '*root*':
                    wordvec = token.wordvec.value()
                    if self.external_embedding is not None:
                        wordvec += token.evec.value()
                    data_tuple = (i+1, token.form, token.cpos, token.feats, token.chVec.value(), wordvec, token.vec.value())
                    data_vec.append(data_tuple)
            return data_vec 


    def get_external_embeddings(self, external_embedding_file, model, wordsCount):
        
        # NOTE: this is modified to load fastText embeddings!
        self.external_embedding = {}
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')

        # read first line --- number of tokens and embedding dimension
        self.edim = int(external_embedding_fp.readline().split()[1])
        num_tokens = 0

        for line in external_embedding_fp:
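            # keep only well-formed vectors (edim + 1 fields) whose word appears in the training vocabulary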
            line = line.strip().split()
            if len(line) != self.edim + 1: 
                continue
            else:
                if line[0] in wordsCount:
                    self.external_embedding[line[0]] = [float(f) for f in line[1:]]
                    num_tokens += 1


        external_embedding_fp.close()
        # self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)] #???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print '-' * 100
        print 'Load external embedding. Vector dimensions:', self.edim, ', number of tokens:', num_tokens
        print '-' * 100
Ejemplo n.º 27
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    data_loader = TextLoader(True, FLAGS.train_path, FLAGS.batch_size, FLAGS.seq_length, None, None, 'utf8', False)
    test_data_loader = TextLoader(False, FLAGS.test_path, FLAGS.batch_size, FLAGS.seq_length, data_loader.vocab, data_loader.labels, 'utf8', False)
    tf.logging.info("vocab_size: " + str(data_loader.vocab_size))
    FLAGS.vocab_size = data_loader.vocab_size
    tf.logging.info("label_size: " + str(data_loader.label_size))
    FLAGS.label_size = data_loader.label_size
    bilstm = BiLSTM(FLAGS)
    init = tf.global_variables_initializer()
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(init)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
        idx = 0
        test_best_acc = 0
        for epcho in range(FLAGS.num_epcho):  #for each epoch
            data_loader.reset_batch_pointer()
            for train_batch_num in range(data_loader.num_batches):  #for each batch
                input_x, input_y, x_len = data_loader.next_batch()
                feed = {bilstm.input_x:input_x, bilstm.input_y:input_y, bilstm.x_len:x_len, bilstm.dropout_keep_prob:FLAGS.dropout_keep_prob}
                _, global_step_op, train_loss, train_acc = sess.run(
                    [bilstm.train_step, bilstm.global_step, bilstm.loss, bilstm.acc], feed_dict=feed)
                tf.logging.info("training...........global_step = {}, epoch = {}, current_batch = {}, "
                                "train_loss = {:.4f}, accuracy = {:.4f}".format(global_step_op, epcho, train_batch_num, train_loss, train_acc))
                idx += 1
                if idx % FLAGS.check_every == 0:
                    test_acc = 0
                    all_num = 0
                    acc_num = 0
                    test_data_loader.reset_batch_pointer()
                    write_result = []
                    for _ in range(test_data_loader.num_batches):
                        input_x_test, input_y_test, x_len_test = test_data_loader.next_batch()
                        feed = {bilstm.input_x: input_x_test, bilstm.input_y: input_y_test, bilstm.x_len: x_len_test, bilstm.dropout_keep_prob: 1.0}
                        prediction, arg_index = sess.run([bilstm.prediction, bilstm.arg_index], feed_dict=feed)
                        all_num = all_num + len(input_y_test)
                        write_str = ""
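                        # one output line per example: the gold label first, then each of the top predicted labels with its score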
                        for i, indexs in enumerate(arg_index):
                            pre_label_id = indexs[0]
                            real_label_id = input_y_test[i]
                            if pre_label_id == real_label_id:
                                acc_num = acc_num + 1
                            if real_label_id in test_data_loader.id_2_label:
                                write_str = test_data_loader.id_2_label.get(real_label_id)
                            else:
                                write_str = "__label__unknown"
                            for index in indexs:
                                cur_label = test_data_loader.id_2_label.get(index)
                                cur_score = prediction[i][index]
                                write_str = write_str + " " + cur_label + ":" + str(cur_score)
                            write_str = write_str + "\n"
                            write_result.append(write_str)
                    test_acc = acc_num * 1.0 / all_num
                    tf.logging.info("testing...........global_step = {}, epoch = {}, accuracy = {:.4f}, cur_best_acc = {}".format(global_step_op, epcho, test_acc, test_best_acc))
                    if test_best_acc < test_acc:
                        test_best_acc = test_acc
                        # save_model
                        checkpoint_path = os.path.join(FLAGS.model_path, 'lstm.ckpt')
                        saver.save(sess, checkpoint_path, global_step=global_step_op)
                        with open(FLAGS.result_file, 'w', encoding='utf-8') as resultfile:
                            for pre_sen in write_result:
                                resultfile.write(pre_sen)
                        tf.logging.info("saved best model and wrote prediction results")
Ejemplo n.º 28
0
    def __init__(self, model, wordsCount, rels, langs, words, ch, nnvecs, options):
        """
        Options handling
        """
        self.model = model
        if langs:
            self.langs = {lang: ind+1 for ind, lang in enumerate(langs)} # +1 for padding vector
        else:
            self.langs = None
        self.nnvecs = nnvecs
        self.multiling = options.multiling #and options.use_lembed
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding, model, wordsCount)
        self.disable_bilstm = options.disable_bilstm
        self.disable_second_bilstm = options.disable_second_bilstm

        """sharing"""
        self.shareBiLSTM = options.shareBiLSTM
        self.shareWordLookup = options.shareWordLookup
        self.shareCharLookup = options.shareCharLookup
        self.shareCharBiLSTM = options.shareCharBiLSTM
        self.word_lembed = options.lembed_word
        self.char_lembed = options.lembed_char

        """dims"""
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.lang_emb_size = options.lang_emb_size

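        # Word-level BiLSTM input: word embedding, plus optional external and language embeddings, plus the char BiLSTM output (forward + backward).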
        lstm_input_size = (self.word_emb_size
                           + (self.edim if self.external_embedding is not None else 0)
                           + (self.lang_emb_size if self.word_lembed else 0)
                           + 2 * self.char_lstm_output_size)

        """UTILS"""
        self.wordsCount = wordsCount
        self.irels = rels

        if self.multiling and not self.shareWordLookup:
            w2i = {}
            for lang in self.langs:
                w2i[lang] = {w: i for i, w in enumerate(words[lang])}
            self.vocab = {}
            for lang in self.langs:
                self.vocab[lang] = {word: ind+2 for word, ind in w2i[lang].iteritems()}

        else:
            w2i = {w: i for i, w in enumerate(words)}
            self.vocab = {word: ind+2 for word, ind in w2i.iteritems()} # +2 for MLP padding vector and OOV vector

        if not self.multiling or self.shareCharLookup:
            self.chars = {char: ind+1 for ind, char in enumerate(ch)} # +1 for OOV vector
        else:
            self.chars = {}
            for lang in self.langs:
                self.chars[lang] = {char: ind+1 for ind, char in enumerate(ch[lang])}
        self.rels = {word: ind for ind, word in enumerate(rels)}

        """BILSTMS"""
        #word
        if not self.multiling or self.shareBiLSTM:
            if not self.disable_bilstm:
                self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                      dropout_rate=0.33)
                if not self.disable_second_bilstm:
                    self.bilstm2 = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size, model,
                                          dropout_rate=0.33)
            else:
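                # BiLSTM disabled: halve the recorded output size so that 2 * lstm_output_size matches the raw input dimension.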
                self.lstm_output_size = int(lstm_input_size * 0.5)
        else:
            self.bilstm1 = {}
            self.bilstm2 = {}
            for lang in self.langs:
                self.bilstm1[lang] = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                            dropout_rate=0.33)
                self.bilstm2[lang] = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size, model,
                                            dropout_rate=0.33)

        #char
        if self.char_lembed:
            char_in_dims = self.char_emb_size + self.lang_emb_size
        else:
            char_in_dims = self.char_emb_size

        if not self.multiling or self.shareCharBiLSTM:
            self.char_bilstm = BiLSTM(char_in_dims, self.char_lstm_output_size, self.model, dropout_rate=0.33)
        else:
            self.char_bilstms = {}
            for lang in self.langs:
                self.char_bilstms[lang] = BiLSTM(char_in_dims, self.char_lstm_output_size, self.model, dropout_rate=0.33)

        """LOOKUPS"""
        if not self.multiling or self.shareCharLookup:
            self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        else:
            self.clookups = {}
            for lang in self.langs:
                self.clookups[lang] = self.model.add_lookup_parameters((len(ch[lang]) + 1, self.char_emb_size))

        if not self.multiling or self.shareWordLookup:
            self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        else:
            self.wlookups = {}
            for lang in self.langs:
                self.wlookups[lang] = self.model.add_lookup_parameters((len(words[lang]) + 2, self.word_emb_size))

        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))


        """Padding"""
        self.word2lstm = model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = model.add_parameters((self.char_lstm_output_size * 2))
Ejemplo n.º 29
0
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs=1):

        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        # Load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # This part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})

            if options.ext_word_emb_file and options.word_emb_size > 0:
                # Load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)

            if options.ext_char_emb_file and options.char_emb_size > 0:
                # Load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

            if options.ext_emb_dir:
                # For every language, load the data for the word and character
                # embeddings from a directory.
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.viewkeys())
                        self.external_embedding["words"].update(embeddings)

                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)

            self.init_lookups(options)

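        # Word-level BiLSTM input: word + ELMo + POS + treebank embeddings, plus the char BiLSTM output when char embeddings are used.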
        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        self.lstm_input_size = (
            options.word_emb_size + elmo_emb_size + options.pos_emb_size +
            options.tbank_emb_size + 2 *
            (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        )
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[
            1] if options.word_emb_size > 0 else None
        paddingElmoVec = dy.zeros(self.elmo.emb_dim) if self.elmo else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr(
        ) if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[
            0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *\
            dy.concatenate(filter(None,[paddingWordVec,
                                        paddingElmoVec,
                                        paddingPosVec,
                                        paddingCharVec,
                                        paddingTbankVec])) + self.word2lstmbias.expr())

        self.empty = self.paddingVec if self.nnvecs == 1 else\
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):

        if self.elmo:
            # Get full text of sentence - excluding root, which is loaded differently
            # for transition and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[:-1]])

            elmo_sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
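                    # Word dropout: replace the word with the OOV vector with probability 0.25 / (0.25 + word_count).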
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                        utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]
            if self.elmo:
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["elmo"] = elmo_sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                    root.vecs["char"], root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):

        if root.char_rep == "*root*":  # no point running a character analysis over this placeholder token
            return self.charPadding.expr(
            )  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(
                        dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):

        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"]
                       [word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"]
                       [char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
Ejemplo n.º 30
0
    def __init__(self, vocab_size, charEmbedding, out_size, crf=True):
        """功能:对LSTM的模型进行训练与测试
           参数:
            vocab_size:词典大小
            out_size:标注种类
            crf选择是否添加CRF层"""
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Load model hyperparameters
        self.emb_size = LSTMConfig.emb_size
        self.hidden_size = LSTMConfig.hidden_size
        self.charEmbedding = charEmbedding
        self.crf = crf
        # Initialize a different model and loss function depending on whether a CRF layer is added
        if not crf:
            self.model = BiLSTM(vocab_size, self.emb_size, self.hidden_size,
                                out_size).to(self.device)
            self.cal_loss_func = cal_loss
        else:
            print('bilstmcrf')
            self.model = BiLSTM_CRF(vocab_size, self.emb_size,
                                    self.charEmbedding, self.hidden_size,
                                    out_size).to(self.device)
            self.cal_loss_func = cal_lstm_crf_loss
        print(self.model)
        #
        # preTrainModel=torch.load('/home/huang/Desktop/named_entity_recognition/ckpts/language_model.ckpt')
        # model_dict = self.model.state_dict()

        # pretrained_dict = {'bilstm.'+k: v for k, v in preTrainModel.items() if 'bilstm.'+k in model_dict}  # filter out unnecessary keys
        # model_dict.update(pretrained_dict)
        # self.model.load_state_dict(model_dict)

        # Load training hyperparameters:
        self.epoches = TrainingConfig.epoches
        self.print_step = TrainingConfig.print_step
        self.lr = TrainingConfig.lr
        self.batch_size = TrainingConfig.batch_size
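        # Separate weights from biases so that weight decay is applied to weights only.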
        weight_p, bias_p = [], []
        for name, p in self.model.named_parameters():
            if 'bias' in name:
                bias_p += [p]
            else:
                weight_p += [p]
        parameter_list = [
            # {"params": self.model.bilstm.embed.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.preTrainembedding.parameters(), "lr": 0.00001},                          \
            #                   {"params": self.model.bilstm.lstm.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.conv1.parameters(), "lr": 0.001}, \
            #                   {"params": self.model.bilstm.conv2.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.conv3.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.dense1.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.dense2.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.dense3.parameters(), "lr": 0.001},
            #                   {"params": self.model.bilstm.lin.parameters(), "lr": 0.001},
            #                   {"params": self.model.transition, "lr": 0.001},
            {
                "params": weight_p,
                "weight_decay": 0.00005
            },
            {
                'params': bias_p,
                'weight_decay': 0
            }
        ]
        # Initialize the optimizer
        self.optimizer = optim.Adam(parameter_list, lr=0.001)

        # Initialize other tracking state
        self.step = 0
        self.best_val_loss = 1e18
        self.best_model = None