Example #1
def train(config):
    # tag_length should not include start/end tags
    model = BiLSTM_CRF(config)
    optimizer = optim.Adam(model.parameters(), config.lr)
    # f1 score of validation dataset
    valid_f1 = -1000
    stop = False
    start_t = time.time()
    for epoch in range(config.n_epoch):
        # alternatively model.zero_grad(), since all of the model's parameters are registered in the optimizer
        if stop:
            break
        optimizer.zero_grad()

        _, batch_inputs, batch_outputs, masks, length = random_batch(embeddings, x_train, y_train, config.batch_size)

        loss = model.neg_log_likelihood(batch_inputs, batch_outputs, masks, length)

        loss.backward()
        optimizer.step()

        if (epoch + 1) % config.eval_freq == 0:
            print('Epoch: {:04d}, loss: {:.4f}, seconds: {:.4f}'.format(epoch, loss.item(), time.time() - start_t))
            entities, new_valid_f1, prec, recall = get_f1(model, config, test=False)
            print('[Validation]f1 score from {:.6f} to {:.6f}'.format(valid_f1, new_valid_f1))
            print('[Validation]precision: {}, recall: {}\n'.format(prec, recall))
            if epoch > 100000 and (abs(new_valid_f1 - valid_f1) < 0.001 or new_valid_f1 < valid_f1):
                stop = True
            if new_valid_f1 > valid_f1:
                valid_f1 = new_valid_f1

    torch.save(model.state_dict(), config.model_save_path)
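The loop above depends on globals (embeddings, x_train, y_train) and a project-specific Config object. As a self-contained sketch of the same zero_grad / backward / step pattern on a stand-in model (nothing here is from the original project):

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Stand-in model and data; the real code uses BiLSTM_CRF and random_batch.
toy_model = nn.Linear(8, 2)
toy_optimizer = optim.Adam(toy_model.parameters(), lr=1e-3)
toy_inputs = torch.randn(4, 8)
toy_targets = torch.randint(0, 2, (4,))

for step in range(3):
    toy_optimizer.zero_grad()                 # clear accumulated gradients
    logits = toy_model(toy_inputs)
    loss = F.cross_entropy(logits, toy_targets)
    loss.backward()                           # backpropagate
    toy_optimizer.step()                      # update parameters
    print('step {}, loss {:.4f}'.format(step, loss.item()))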
Example #2
def singel_predict(model_path, content, char_to_id_json_path, batch_size,
                   embedding_dim, hidden_dim, num_layers, sentence_length,
                   offset, target_type_list, tag2id):

    char_to_id = json.load(
        open(char_to_id_json_path, mode="r", encoding="utf-8"))
    # convert the string into a list of ids from the character-to-id table
    char_ids = content_to_id(content, char_to_id)
    # arrange the ids into batch_size * sentence_length tensors
    # and build the list of model inputs
    model_inputs_list, model_input_map_list = build_model_input_list(
        content, char_ids, batch_size, sentence_length, offset)
    # build the model
    model = BiLSTM_CRF(vocab_size=len(char_to_id),
                       tag_to_ix=tag2id,
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       batch_size=batch_size,
                       num_layers=num_layers,
                       sequence_length=sentence_length)
    # load the saved model parameters
    model.load_state_dict(torch.load(model_path))

    tag_id_dict = {
        v: k
        for k, v in tag2id.items() if k[2:] in target_type_list
    }
    # list of extracted entities to return
    entities = []
    with torch.no_grad():
        for step, model_inputs in enumerate(model_inputs_list):
            prediction_value = model(model_inputs)
            # iterate over the prediction for each line
            for line_no, line_value in enumerate(prediction_value):
                # entity currently being assembled
                entity = None
                # iterate over the predicted tag of every character in the line
                for char_idx, tag_id in enumerate(line_value):
                    # the predicted tag_id belongs to one of the target tag ids
                    if tag_id in tag_id_dict:
                        # take the first character of the matched tag name, i.e. B or I
                        tag_index = tag_id_dict[tag_id][0]
                        # look up the actual character at this position
                        current_char = model_input_map_list[step][line_no][
                            char_idx]
                        # a tag starting with B marks the beginning of an entity
                        if tag_index == "B":
                            entity = current_char
                        # a tag starting with I extends the current entity
                        elif tag_index == "I" and entity:
                            entity += current_char
                    # when the current tag is O and an entity is pending, record it
                    if tag_id == tag2id["O"] and entity:
                        # the previous character closed a target entity, so add it to the list
                        entities.append(entity)
                        # reset the entity
                        entity = None
    return entities
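The nested loop above merges B/I/O tags into entity strings character by character. The same idea as a minimal, stand-alone sketch on plain lists (the tags and characters are made up):

def decode_bio(chars, tags):
    """Merge B-/I- runs into entity strings; 'O' closes the current entity."""
    entities, entity = [], None
    for ch, tag in zip(chars, tags):
        if tag.startswith("B"):
            if entity:
                entities.append(entity)
            entity = ch
        elif tag.startswith("I") and entity:
            entity += ch
        else:  # "O", or an I without a preceding B
            if entity:
                entities.append(entity)
            entity = None
    if entity:
        entities.append(entity)
    return entities

print(decode_bio(list("头痛和发热"), ["B", "I", "O", "B", "I"]))  # ['头痛', '发热']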
Example #3
def get_f1(model, config, test=True):
    if test:
        model = BiLSTM_CRF(config)
        model.load_state_dict(torch.load(config.model_save_path))
        model.eval()
        x, y = x_test, y_test
    else:
        x, y = x_valid, y_valid

    n_batch = math.ceil(len(x) / config.batch_size)
    entity_pred, entity_true = [], []

    for i in range(n_batch):
        start = i * config.batch_size
        end = (i + 1) * config.batch_size if i != (n_batch - 1) else len(x)
        batch_ids, batch_inputs, batch_outputs, masks, length = random_batch(embeddings,
                                                                             x[start:end],
                                                                             y[start:end],
                                                                             end - start,
                                                                             False)
        scores, sequences = model(batch_inputs, masks, length)
        entity_pred += retrieve_entity(batch_ids, sequences, masks, id2tag, id2word)
        entity_true += retrieve_entity(batch_ids, batch_outputs.numpy(), masks, id2tag, id2word)

    union = [i for i in entity_pred if i in entity_true]
    precision = float(len(union)) / len(entity_pred) if entity_pred else 0.0
    recall = float(len(union)) / len(entity_true) if entity_true else 0.0
    f1_score = 2 * precision * recall / (precision + recall) if len(union) != 0 else 0.0

    return entity_pred, f1_score, precision, recall
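For reference, the same entity-level precision/recall/F1 arithmetic on toy lists (the real entries are tuples produced by retrieve_entity; these strings are only illustrative):

entity_pred = ["ORG:acme", "PER:alice", "LOC:paris"]
entity_true = ["PER:alice", "LOC:paris", "LOC:berlin"]

union = [e for e in entity_pred if e in entity_true]   # 2 matches
precision = len(union) / len(entity_pred)              # 2/3
recall = len(union) / len(entity_true)                 # 2/3
f1 = 2 * precision * recall / (precision + recall) if union else 0.0
print(precision, recall, round(f1, 4))                 # 0.666..., 0.666..., 0.6667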
Example #4
def main(_):
    word2idx = pickle_load(FLAGS.word2idx_path)
    word2vec = pickle_load(FLAGS.word2vec_path)

    embedding_size = list(word2vec.values())[0].shape[0]
    word_embeddings = np.zeros([len(word2idx) + 1,
                                embedding_size])  # index 0 for UNK token
    for word in word2idx.keys():
        word_embeddings[word2idx[word]] = word2vec[word]

    FLAGS.n_words = word_embeddings.shape[0]
    FLAGS.embedding_size = word_embeddings.shape[1]

    train_data = read_data(FLAGS.train_data_path, word2idx,
                           FLAGS.max_sequence_len)
    valid_data = read_data(FLAGS.valid_data_path, word2idx,
                           FLAGS.max_sequence_len)

    graph = tf.Graph()
    with tf.Session(graph=graph) as sess:
        model = BiLSTM_CRF(FLAGS, sess, word_embeddings)
        model.build_model()
        model.train(train_data, valid_data)
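The embedding-matrix construction above reserves index 0 for the UNK token. A tiny self-contained check of that pattern with made-up vocabularies:

import numpy as np

word2idx = {"hello": 1, "world": 2}                     # index 0 left for UNK
word2vec = {"hello": np.array([0.1, 0.2]), "world": np.array([0.3, 0.4])}

embedding_size = list(word2vec.values())[0].shape[0]
word_embeddings = np.zeros([len(word2idx) + 1, embedding_size])
for word, idx in word2idx.items():
    word_embeddings[idx] = word2vec[word]

print(word_embeddings.shape)   # (3, 2); row 0 stays all-zero for UNK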
Example #5
if __name__ == '__main__':

    char_embedding_mat = np.load('data/char_embedding_matrix.npy')

    X_test = np.load('data/X_test.npy')
    y_test = np.load('data/y_test.npy')
    true_path = 'res/true.txt'
    predict_path = 'res/predict.txt'

    ner_model = BiLSTM_CRF(n_input=500,
                           n_vocab=char_embedding_mat.shape[0],
                           n_embed=200,
                           embedding_mat=char_embedding_mat,
                           keep_prob=0.5,
                           n_lstm=150,
                           keep_prob_lstm=0.5,
                           n_entity=13,
                           optimizer='adam',
                           batch_size=512,
                           epochs=100)

    model_file = 'checkpoints/bilstm_crf_weights_best.hdf5'
    ner_model.model.load_weights(model_file)

    y_pred = ner_model.model.predict(X_test[:, :])

    char2vec, n_char, n_embed, char2index = p.get_char2object()

    index2char = {i: w for w, i in char2index.items()}
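ner_model.model.predict above returns one score per tag per time step; turning that into tag indices is an argmax over the last axis. A toy sketch (the shapes and the tag map are assumptions, not values from this project):

import numpy as np

index2tag = {0: "O", 1: "B", 2: "I"}                      # hypothetical tag map
y_pred = np.array([[[0.7, 0.2, 0.1],                      # one sentence, 3 time steps
                    [0.1, 0.8, 0.1],
                    [0.2, 0.1, 0.7]]])

tag_indices = y_pred.argmax(axis=-1)                      # shape (1, 3)
print([[index2tag[i] for i in row] for row in tag_indices])  # [['O', 'B', 'I']]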
Example #6
X = np.load('data/train.npy')
y = np.load('data/y.npy')

X_train = X[:600]
y_train = y[:600]

# ner_model = BiLSTM_CRF(n_input=200, n_vocab=char_embedding_mat.shape[0],
#                        n_embed=100, embedding_mat=char_embedding_mat,
#                        keep_prob=0.5, n_lstm=100, keep_prob_lstm=0.8,
#                        n_entity=7, optimizer='adam', batch_size=64, epochs=500)
ner_model = BiLSTM_CRF(n_input=300,
                       n_vocab=char_embedding_mat.shape[0],
                       n_embed=100,
                       embedding_mat=char_embedding_mat,
                       keep_prob=0.5,
                       n_lstm=256,
                       keep_prob_lstm=0.6,
                       n_entity=3,
                       optimizer='adam',
                       batch_size=16,
                       epochs=500)

cp_folder, cp_file = 'checkpoints', 'bilstm_crf_weights_best_attention_experiment2.hdf5'
log_filepath = 'logs/bilstm_crf_summaries'

cb = [
    ModelCheckpoint(os.path.join(cp_folder, cp_file),
                    monitor='val_loss',
                    verbose=1,
                    save_best_only=True,
                    save_weights_only=True,
Example #7
    x_sent[0, :x_sent_1.shape[0]] = x_sent_1
    x_sent[1, :x_sent_2.shape[0]] = x_sent_2

    # create a batch of 2 samples with their proper padding
    x_tags = torch.full((2, max_sent_size), Const.PAD_TAG_ID, dtype=torch.long)
    x_tags[0, :x_tags_1.shape[0]] = x_tags_1
    x_tags[1, :x_tags_2.shape[0]] = x_tags_2

    # mask tensor with shape (batch_size, max_sent_size)
    mask = (x_tags != Const.PAD_TAG_ID).float()

    # get a reversed dict mapping int to str
    ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

    # see bilstm_crf.py
    model = BiLSTM_CRF(len(word_to_ix), len(tag_to_ix))
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    # Check predictions before training
    print('Predictions before training:')
    with torch.no_grad():
        scores, seqs = model(x_sent, mask=mask)
        for score, seq in zip(scores, seqs):
            str_seq = " ".join(ids_to_tags(seq, ix_to_tag))
            print('%.2f: %s' % (score.item(), str_seq))

    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(
            300):  # normally you would NOT do 300 epochs, it is toy data

        # Step 1. Remember that Pytorch accumulates gradients.
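The snippet starts from pre-built x_sent_1 / x_tags_1 tensors that are not shown. A self-contained sketch of the same padding-and-mask pattern with made-up PAD ids and sequences:

import torch

PAD_TOKEN_ID, PAD_TAG_ID = 0, -1                      # assumed padding ids
seqs = [torch.tensor([5, 3, 8]), torch.tensor([7, 2])]
tags = [torch.tensor([1, 2, 0]), torch.tensor([1, 0])]
max_len = max(s.shape[0] for s in seqs)

x_sent = torch.full((len(seqs), max_len), PAD_TOKEN_ID, dtype=torch.long)
x_tags = torch.full((len(seqs), max_len), PAD_TAG_ID, dtype=torch.long)
for i, (s, t) in enumerate(zip(seqs, tags)):
    x_sent[i, :s.shape[0]] = s
    x_tags[i, :t.shape[0]] = t

mask = (x_tags != PAD_TAG_ID).float()                 # (batch_size, max_len)
print(x_sent)
print(mask)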
Example #8
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']

    modelName = config['modelName']

    batchSize = config['model']['batchSize']
    epochNum = config['model']['epochNum']
    earlyStop = config['model']['earlyStop']
    learningRate = config['model']['learningRate']
    modelSavePath = config['model']['modelSavePath']

    #GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset,
                                batch_size=batchSize,
                                shuffle=True,
                                num_workers=4,
                                collate_fn=pad)

    validIter = data.DataLoader(dataset=validDataset,
                                batch_size=batchSize,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)

    testIter = data.DataLoader(dataset=testDataset,
                               batch_size=batchSize,
                               shuffle=False,
                               num_workers=4,
                               collate_fn=pad)

    if modelName == 'bilstm':
        net = BiLSTM(config)
        train = bilstmTrain
        eval = bilstmEval
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

    if modelName == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        train = bilstmCRFTrain
        eval = bilstmCRFEval

    if modelName == 'transformer_crf':
        net = Transformer_CRF(config)
        train = transformerCRFTrain
        eval = transformerCRFEval

    if modelName == 'cnn':
        net = CNN(config)
        train = cnnTrain
        eval = cnnEval

    net = net.to(DEVICE)

    lossFunction = nn.NLLLoss()
    optimizer = optim.Adam(net.parameters(),
                           lr=learningRate,
                           betas=(0.9, 0.999),
                           eps=1e-08)

    earlyNumber, beforeLoss, maxScore = 0, sys.maxsize, -1

    # start training
    for epoch in range(epochNum):

        print('Iteration %d: ' % epoch)

        totalLoss = train(net,
                          trainIter,
                          optimizer=optimizer,
                          criterion=lossFunction,
                          DEVICE=DEVICE)
        print('Training loss: %f' % totalLoss)

        totalLoss, f1Score = eval(net,
                                  validIter,
                                  criterion=lossFunction,
                                  DEVICE=DEVICE)

        if f1Score > maxScore:
            maxScore = f1Score
            torch.save(net.state_dict(), modelSavePath)

        print('Validation loss: %f   f1Score: %f / %f' % (totalLoss, f1Score, maxScore))

        if f1Score < maxScore:
            earlyNumber += 1
            print('earlyStop: %d/%d' % (earlyNumber, earlyStop))
        else:
            earlyNumber = 0
        if earlyNumber >= earlyStop: break
        print('\n')

    # load the best model
    net.load_state_dict(torch.load(modelSavePath))
    totalLoss, f1Score = eval(net,
                              testIter,
                              criterion=lossFunction,
                              DEVICE=DEVICE)
    print('Test loss: %f, f1Score: %f' % (totalLoss, f1Score))
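The DataLoaders above rely on a collate_fn named pad that is not shown in this example. One plausible shape for such a collate function, written as a sketch (an assumption about the project, not its actual code):

import torch

def pad_batch(batch, pad_id=0, pad_tag_id=0):
    """batch: list of (token_ids, tag_ids) pairs of varying length."""
    max_len = max(len(tokens) for tokens, _ in batch)
    token_tensor = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
    tag_tensor = torch.full((len(batch), max_len), pad_tag_id, dtype=torch.long)
    lengths = torch.zeros(len(batch), dtype=torch.long)
    for i, (tokens, tags) in enumerate(batch):
        token_tensor[i, :len(tokens)] = torch.tensor(tokens)
        tag_tensor[i, :len(tags)] = torch.tensor(tags)
        lengths[i] = len(tokens)
    return token_tensor, tag_tensor, lengths

print(pad_batch([([4, 5, 6], [1, 2, 0]), ([7, 8], [1, 0])]))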
Example #9
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']
    batchSize = config['model']['batchSize']

    #GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERTestDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset,
                                batch_size=batchSize,
                                shuffle=True,
                                num_workers=6,
                                collate_fn=pad)

    validIter = data.DataLoader(dataset=validDataset,
                                batch_size=batchSize,
                                shuffle=False,
                                num_workers=6,
                                collate_fn=pad)

    testIter = data.DataLoader(dataset=testDataset,
                               batch_size=batchSize,
                               shuffle=False,
                               num_workers=6,
                               collate_fn=testPad)

    if config['modelName'] == 'bilstm':
        net = BiLSTM(config)
        config['modelSavePath'] = config['data']['BiLSTMSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMSubmitDataPath']
        train = bilstm_train
        test = bilstm_test

    if config['modelName'] == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        config['modelSavePath'] = config['data']['BiLSTMCRFSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMCRFSubmitDataPath']
        train = bilstm_crf_train
        test = bilstm_crf_test

    if config['modelName'] == 'transformer_cnn':
        net = Transformer_CNN(config)
        config['modelSavePath'] = config['data']['TransformerCNNSavePath']
        config['submitDataPath'] = config['data'][
            'TransformerCNNSubmitDataPath']
        modelSavePath = config['modelSavePath']
        train = transformer_cnn_train
        test = transformer_cnn_test

    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    net = net.to(DEVICE)

    if os.path.exists(modelSavePath):
        net.load_state_dict(torch.load(modelSavePath))

    #if config['train']:
    #train(net, trainIter, validIter, config)

    #if config['test']:
    test(net, testIter, config)
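One caveat with the DataParallel branch above: a model wrapped in nn.DataParallel stores its weights under a 'module.' key prefix, so a checkpoint saved with the wrapper will not load into the bare model (and vice versa) without adjusting the keys. A small self-contained sketch of the usual workaround:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
wrapped = nn.DataParallel(net)

saved = wrapped.state_dict()                 # keys look like 'module.weight', 'module.bias'
plain = {k.replace('module.', '', 1): v for k, v in saved.items()}
net.load_state_dict(plain)                   # loads cleanly without the prefix
print(list(saved)[:2], '->', list(plain)[:2])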
Example #10
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id,
          model_saved_path, train_log_path, validate_log_path,
          train_history_image_path):
    '''
    data_loader: dataset loader, already constructed earlier via load_dataset
    data_size:   number of samples in the training and validation sets
    batch_size:  number of samples per batch
    embedding_dim:  dimension of the character embeddings
    hidden_dim:     dimension of the hidden layer
    sentence_length:  maximum text length
    num_layers:       number of stacked LSTM layers
    epochs:           number of training epochs
    learning_rate:    learning rate
    tag2id:           mapping from tag to id
    model_saved_path: path where the trained model is saved
    train_log_path:   path of the training log
    validate_log_path:  path of the validation log
    train_history_image_path:  path where the training history plots are saved
    '''
    # load the Chinese character-to-id table into memory
    char2id = json.load(
        open("./data/char_to_id.json", mode="r", encoding="utf-8"))
    # initialize the BiLSTM_CRF model
    model = BiLSTM_CRF(vocab_size=len(char2id),
                       tag_to_ix=tag2id,
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       batch_size=batch_size,
                       num_layers=num_layers,
                       sequence_length=sentence_length)

    # Define the optimizer. SGD would also work (in PyTorch, the optimizers that
    # support sparse Embedding gradients are SGD and SparseAdam).
    # Parameters:
    # lr:          optimizer learning rate
    # momentum:    momentum factor that accelerates gradient descent
    # optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.85, weight_decay=1e-4)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=learning_rate,
                           betas=(0.9, 0.999),
                           eps=1e-8,
                           weight_decay=1e-4)

    # Optionally set a learning-rate schedule for the optimizer.
    # Parameters:
    # optimizer:    the optimizer to schedule
    # step_size:    update frequency, i.e. how many epochs between learning-rate updates
    # gamma:        decay factor applied to the learning rate at each update
    #               (relative to the previous epoch's value), default 0.1
    #   For example, with initial lr = 0.5, step_size = 20, gamma = 0.1:
    #              lr = 0.5     if epoch < 20
    #              lr = 0.05    if 20 <= epoch < 40
    #              lr = 0.005   if 40 <= epoch < 60
    # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)

    # containers for the training loss, accuracy, recall and F1 metrics
    train_loss_list = []
    train_acc_list = []
    train_recall_list = []
    train_f1_list = []
    train_log_file = open(train_log_path, mode="w", encoding="utf-8")
    # containers for the validation loss, accuracy, recall and F1 metrics
    validate_loss_list = []
    validate_acc_list = []
    validate_recall_list = []
    validate_f1_list = []
    validate_log_file = open(validate_log_path, mode="w", encoding="utf-8")
    # build the id-to-tag mapping from tag2id
    id2tag = {v: k for k, v in tag2id.items()}
    # build the id-to-character mapping from char2id
    id2char = {v: k for k, v in char2id.items()}

    # loop for the configured number of epochs
    for epoch in range(epochs):
        # print the current epoch before the progress bar appears
        tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
        # counters for the correctly predicted, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # per-batch step counter, cumulative loss, accuracy and f1 value
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        # training part of the current epoch
        for inputs, labels in tqdm(data_loader["train"]):
            # wrap the data in Variable (kept for compatibility with older PyTorch)
            inputs, labels = Variable(inputs), Variable(labels)
            # zero the optimizer before computing gradients, otherwise they accumulate
            optimizer.zero_grad()
            # calls neg_log_likelihood() of the BiLSTM_CRF class
            loss = model.neg_log_likelihood(inputs, labels)
            # take the loss of the current step as a plain number, not a tensor
            step_loss = loss.data
            # accumulate the per-step loss
            total_loss += step_loss
            # decode the best path list; this calls forward() of the BiLSTM_CRF class
            best_path_list = model(inputs)
            # evaluation metrics: batch accuracy, recall, F1 and the matching entity counts
            step_acc, step_recall, f1_score, acc_entities_length, \
            predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                     labels.tolist(),
                                                                     best_path_list,
                                                                     id2char,
                                                                     id2tag)
            # per-step training log text (kept disabled)
            '''
            log_text = "Epoch: %s | Step: %s " \
                       "| loss: %.5f " \
                       "| acc: %.5f " \
                       "| recall: %.5f " \
                       "| f1 score: %.5f" % \
                       (epoch, step, step_loss, step_acc, step_recall,f1_score)
            '''

            # accumulate the correct, predicted and gold entity counts
            total_acc_entities_length += acc_entities_length
            total_predict_entities_length += predict_entities_length
            total_gold_entities_length += gold_entities_length

            # backpropagate the loss
            loss.backward()
            # optimizer.step() applies the gradients and updates the parameters
            optimizer.step()
            # write the training log
            # train_log_file.write(log_text + "\n")
            step += 1
        # mean loss of the epoch (total loss divided by the number of samples)
        epoch_loss = total_loss / data_size["train"]
        # precision of the epoch (default to 0 so the values are always defined)
        total_acc, total_recall = 0, 0
        if total_predict_entities_length > 0:
            total_acc = total_acc_entities_length / total_predict_entities_length
        # recall of the epoch
        if total_gold_entities_length > 0:
            total_recall = total_acc_entities_length / total_gold_entities_length
        # F1 of the epoch
        total_f1 = 0
        if total_acc + total_recall != 0:
            total_f1 = 2 * total_acc * total_recall / (total_acc +
                                                       total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss,
                                               total_acc,
                                               total_recall,
                                               total_f1)

        # update the learning rate after this epoch; must come after the optimizer step
        # scheduler.step()

        # record the epoch's training loss (for plotting), precision, recall and f1
        train_loss_list.append(epoch_loss)
        train_acc_list.append(total_acc)
        train_recall_list.append(total_recall)
        train_f1_list.append(total_f1)
        train_log_file.write(log_text + "\n")

        # reset the counters for the correct, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # reset the per-batch step counter, cumulative loss, accuracy and f1 value
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        # validation part of the current epoch
        with torch.no_grad():
            for inputs, labels in tqdm(data_loader["validation"]):
                # wrap the data in Variable
                inputs, labels = Variable(inputs), Variable(labels)
                # calls neg_log_likelihood of the BiLSTM_CRF class,
                # which returns the CRF negative log-likelihood
                try:
                    loss = model.neg_log_likelihood(inputs, labels)
                except:
                    continue
                # take the loss of the current step as a plain number, not a tensor
                step_loss = loss.data
                # accumulate the per-step loss
                total_loss += step_loss
                # decode the best path list; this calls forward() of the BiLSTM_CRF class
                best_path_list = model(inputs)
                # evaluation metrics: batch accuracy, recall, F1 and the matching entity counts
                step_acc, step_recall, f1_score, acc_entities_length, \
                predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                         labels.tolist(),
                                                                         best_path_list,
                                                                         id2char,
                                                                         id2tag)

                # per-step validation log text (kept disabled)
                '''
                log_text = "Epoch: %s | Step: %s " \
                           "| loss: %.5f " \
                           "| acc: %.5f " \
                           "| recall: %.5f " \
                           "| f1 score: %.5f" % \
                           (epoch, step, step_loss, step_acc, step_recall,f1_score)
                '''

                # accumulate the correct, predicted and gold entity counts
                total_acc_entities_length += acc_entities_length
                total_predict_entities_length += predict_entities_length
                total_gold_entities_length += gold_entities_length

                # write the validation log
                # validate_log_file.write(log_text + "\n")
                step += 1

            # mean validation loss of the epoch (total loss divided by the number of samples)
            epoch_loss = total_loss / data_size["validation"]
            # precision over all validation batches (default to 0 so the values are defined)
            total_acc, total_recall = 0, 0
            if total_predict_entities_length > 0:
                total_acc = total_acc_entities_length / total_predict_entities_length
            # recall over all validation batches
            if total_gold_entities_length > 0:
                total_recall = total_acc_entities_length / total_gold_entities_length
            # F1 over all validation batches
            total_f1 = 0
            if total_acc + total_recall != 0.0:
                total_f1 = 2 * total_acc * total_recall / (total_acc +
                                                           total_recall)
            log_text = "Epoch: %s " \
                       "| mean loss: %.5f " \
                       "| total acc: %.5f " \
                       "| total recall: %.5f " \
                       "| total f1 score: %.5f" % (epoch, epoch_loss,
                                                   total_acc,
                                                   total_recall,
                                                   total_f1)

            # record the epoch's validation loss (for plotting), precision, recall and f1
            validate_loss_list.append(epoch_loss)
            validate_acc_list.append(total_acc)
            validate_recall_list.append(total_recall)
            validate_f1_list.append(total_f1)
            validate_log_file.write(log_text + "\n")

    # save the model
    torch.save(model.state_dict(), model_saved_path)

    # save the loss history as an image
    save_train_history_image(train_loss_list, validate_loss_list,
                             train_history_image_path, "Loss")
    # save the accuracy history as an image
    save_train_history_image(train_acc_list, validate_acc_list,
                             train_history_image_path, "Acc")
    # save the recall history as an image
    save_train_history_image(train_recall_list, validate_recall_list,
                             train_history_image_path, "Recall")
    # save the F1 history as an image
    save_train_history_image(train_f1_list, validate_f1_list,
                             train_history_image_path, "F1")
    print("train Finished".center(100, "-"))
Example #11
        F1 = 2 * P * R / (P + R + 1e-18)
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss_ += (loss.item() / len(sentence))
        count += 1
        average_loss = loss_ / (count + 1e-18)
        print('\rtesting:%d/%d\t loss:%0.5f \t' %
              (count, len(valdata), average_loss),
              end='',
              flush=True)
    return F1, average_loss


#%%

# define the model and the optimizer
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
# model.load_state_dict(torch.load('../model_dict/params.pkl'))
model.cuda()  # move the model to the GPU
#%%
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# optimizer = optim.Adam(model.parameters(),lr = 1e-4, amsgrad=True, weight_decay=1e-5)

#%%
# start training
for epoch in range(
        0, 10):  # again, normally you would NOT do 300 epochs, it is toy data
    inter = 0
    out_entity_num = 0
    target_entity_num = 0
    loss_ = 0
    for iter, (sentence, tags) in enumerate(train):
Example #12
            pbar(step, {'loss': loss.item(), "acc": acc.acc})
            step += 1


def test(model, test_data):
    # Check predictions after training
    acc = Acc()
    with torch.no_grad():
        for test_sentence, test_tag in test_data:
            precheck_sent = prepare_sequence(test_sentence, word_to_ix)
            predict = model(precheck_sent)
            predict_tags = np.array(predict[1])
            predict_tags = torch.from_numpy(predict_tags)
            precheck_tags = torch.tensor([tag_to_ix[t] for t in test_tag],
                                         dtype=torch.long)
            acc.update(predict_tags, precheck_tags)
        print("acc:{%.4f}" % acc.acc)


# We got it!
if __name__ == '__main__':
    tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
    training_data = get_train_data()
    word_to_ix = get_word_to_ix(training_data=training_data)
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
    test_data = training_data
    test(model, test_data)
    train(model, training_data)
    test(model, test_data)
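prepare_sequence is not defined in this snippet; in the PyTorch tutorial this example follows, it is usually a small helper along these lines (treat the exact form as an assumption):

import torch

def prepare_sequence(seq, to_ix):
    """Map a list of tokens to a LongTensor of vocabulary indices."""
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

word_to_ix = {"the": 0, "dog": 1, "barked": 2}
print(prepare_sequence(["the", "dog", "barked"], word_to_ix))  # tensor([0, 1, 2])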
Example #13
def create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    label_ids,
    num_labels,
    use_one_hot_embeddings,
    label_weights,
    cell="lstm",
    num_layers=1,
    dropout_rate=0.1,
):
    """
    Create a model architecture.

    Args:
        bert_config:
        is_training:
        input_ids:
        input_mask:
        segment_ids:
        label_ids: [batch_size=32, ]
        num_labels: int, number of classes
        use_one_hot_embeddings:
        label_weights: [batch_size=32, num_labels=2]

    """
    # output_layer: [batch_size=32, hidden_size=768]
    # get the corresponding embedding input of shape [batch_size, seq_length, embedding_size]
    # with tf.Session():
    #     print(
    #         "{} input_ids={}".format(input_ids.shape, tf.reshape(input_ids, [-1, FLAGS.max_token_length]).eval().shape))

    # with tf.Session() as sess:
    #     input_ids = torch.tensor(sess.run(input_ids).eval(), dtype=torch.long)
    #     sess.close()

    input_ids = tf.reshape(input_ids, [-1, FLAGS.max_token_length])
    _, embedding = model(input_ids)

    print("input_ids={}".format(input_ids.shape))
    # hidden_size: 768

    print("{} embedding".format(embedding.shape))

    hidden_size = embedding.shape[-1].value

    max_seq_length = embedding.shape[1].value

    # compute the true sequence lengths
    used = tf.sign(tf.abs(input_ids))
    true_lengths = tf.reduce_sum(
        used, reduction_indices=1)  # vector of size [batch_size] with the true length of each sequence in the batch

    print("{} true_lengths".format(true_lengths))
    # add the Bi-LSTM + CRF layer
    blstm_crf = BiLSTM_CRF(embedding_inputs=embedding,
                           hidden_units_num=hidden_size,
                           cell_type=cell,
                           num_layers=num_layers,
                           dropout_rate=dropout_rate,
                           initializers=initializers,
                           num_labels=num_labels,
                           sequence_length=max_seq_length,
                           tag_indices=label_ids,
                           sequence_lengths=true_lengths,
                           is_training=is_training)

    (loss, per_example_loss, logits,
     probabilities) = blstm_crf.add_blstm_crf_layer(crf_only=True)

    return (loss, per_example_loss, logits, probabilities)
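The true-length trick above (sign of the absolute ids, summed per row) assumes 0 is the padding id. The same computation in plain NumPy, as a quick check:

import numpy as np

input_ids = np.array([[12, 7, 3, 0, 0],
                      [5, 9, 0, 0, 0]])     # 0 is the padding id

true_lengths = np.sum(np.sign(np.abs(input_ids)), axis=1)
print(true_lengths)                          # [3 2]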
Example #14
        if index % log_interval == 0:
            print('Epoch:{0}-{1}/{2}, Batch Loss:{3}'.format(
                epoch, index, batches,
                loss.item() / sources.size(0)))

        loss.backward()
        total_loss += loss.item()
        op.step()

    return total_loss


if __name__ == '__main__':
    epochs = 1
    embedding_dim = hidden_dim = 300

    dataset = QuestionTag()
    data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

    model = BiLSTM_CRF(len(dataset.word2idx), dataset.tag2idx, embedding_dim,
                       hidden_dim).cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    start = time.time()
    for epoch in range(1, epochs + 1):
        loss = train(epoch, data_loader, model, optimizer)
        print('Epoch:{0}, Loss:{1}, Elapsed-time:{2}'.format(
            epoch, loss / len(dataset), round(time.time() - start, 2)))
        print('#' * 100)

    torch.save(model, './check_point/model.pkl')
Example #15
def main():
    EPOCHS = 500
    EARLY_STOP_EPOCHS = 5
    SPLIT_WORDS = 'first'

    # Init Train Dataset
    posdataset = POSTrainDataset(data_dir,
                                 dataset_choice=dataset,
                                 unk_chance=0)
    loader = DataLoader(posdataset)

    # Criterion to for loss (weighted)
    weighted_loss = torch.ones(len(posdataset.ttoi))

    for key in posdataset.entities:
        weighted_loss[posdataset.ttoi[key]] = posdataset.entities[key]
    weighted_loss = weighted_loss.to(device)
    criterion = nn.CrossEntropyLoss(weight=weighted_loss)

    if model_architecture == "bertlstm":
        model = BERT_BiLSTM_CRF(len(posdataset.wtoi), posdataset.ttoi,
                                HIDDEN_DIM, N_LAYERS)
        # No grad Bert layer
        for p in model.model.parameters():
            p.requires_grad = False
    elif model_architecture == "bilstm":
        EMBEDDING_SIZE = 256
        model = BiLSTM_CRF(len(posdataset.wtoi), posdataset.ttoi,
                           EMBEDDING_SIZE, HIDDEN_DIM, N_LAYERS, DROPOUT)

    model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=ADAMEPS)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, SCHEDULER_GAMMA)

    model_name = f"{model_architecture}_h{HIDDEN_DIM}_n{N_LAYERS}_lr{LEARNING_RATE:f}_d{DROPOUT}_{dataset}"
    print(f"Running for {model_name}")

    # Implement Early stop
    early_stop = 0

    # Model training and eval
    best_loss = sys.maxsize
    train_losses = []
    eval_losses = []
    eval_recall = []
    eval_precision = []
    for epoch in range(1, EPOCHS + 1):
        start = time.time()

        scheduler.step()
        # Toggle Train set
        posdataset.train = True
        trainloss = train(loader, model, optimizer, criterion, device,
                          SPLIT_WORDS)
        train_losses.append(trainloss)
        # Toggle Validation Set
        posdataset.train = False
        loss, accuracy, precision, recall = eval(loader, model, criterion,
                                                 device, SPLIT_WORDS)
        eval_losses.append(loss)
        eval_recall.append(recall)
        eval_precision.append(precision)

        time_taken = time.time() - start

        print(
            'Epoch {}, Training Loss: {}, Evaluation Loss: {}, Evaluation Accuracy: {}, Evaluation Precision: {}, Evaluation Recall: {}, time taken: {:.3f}s'
            .format(epoch, trainloss, loss, accuracy, precision, recall,
                    time_taken),
            flush=True)

        # Check if current loss is better than previous
        if loss < best_loss:
            best_loss = loss
            torch.save(model, output_model_dir / '{}.pt'.format(model_name))
            early_stop = 0

        # If loss has stagnated, early stop
        else:
            early_stop += 1
            if early_stop >= EARLY_STOP_EPOCHS:
                print('Early Stopping')
                break

    # Plot respective graphs for visualisation
    plt.figure()
    plt.title('{} Model Training'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.plot(train_losses)
    plt.savefig('{}_Training.png'.format(model_name))

    plt.figure()
    plt.title('{} Model Evaluation'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.plot(eval_losses)
    plt.savefig('{}_EvalLoss.png'.format(model_name))

    plt.figure()
    plt.title('{} Model Evaluation'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Precision')
    plt.plot(eval_precision)
    plt.savefig('{}_EvalPrec.png'.format(model_name))

    plt.figure()
    plt.title('{} Model Evaluation'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Recall')
    plt.plot(eval_recall)
    plt.savefig('{}_EvalRecall.png'.format(model_name))
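The weighted nn.CrossEntropyLoss above scales each tag's contribution by a per-class weight derived from posdataset.entities. A minimal self-contained sketch of that mechanism with made-up weights:

import torch
import torch.nn as nn

num_tags = 4
class_weights = torch.tensor([1.0, 2.0, 2.0, 0.5])      # hypothetical per-tag weights
criterion = nn.CrossEntropyLoss(weight=class_weights)

logits = torch.randn(6, num_tags)                        # 6 tokens, num_tags scores each
targets = torch.randint(0, num_tags, (6,))
print(criterion(logits, targets))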
Example #16
    'Drug': 5,
    'Frequency': 6,
    'Amount': 7,
    'Method': 8,
    'Treatment': 9,
    'Operation': 10,
    'Anatomy': 11,
    'Level': 12,
    'Duration': 13,
    'SideEff': 14,
    'O': 15,
    START_TAG: 16,
    STOP_TAG: 17
}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
model.load_state_dict(
    torch.load('../model_dict/hidim512_1_0.6659692762422823_params.pkl'))
model.cuda()  # move the model to the GPU

base_path = '/home/lingang/chris/knowledge_graph'
# base_path = '/media/chris/D/challenge/knowledge_graph_rename'

file_txt_path = os.path.join(base_path, 'dataset', 'test_data')

result_dict = {}

with torch.no_grad():
    for p in os.listdir(file_txt_path):
        p1 = os.path.join(file_txt_path, p)
        output = []