Example No. 1
def train_and_val():
    embedding_dim = 100
    hidden_dim = 100
    model_load_path = None
    best_model_save_path = 'model/model_100_best_0223.pth'
    max_score = 0
    stop_epoch = 30
    unimprove_time = 0
    val_json_path = '/home/agwave/Data/resume/val_0222.json'
    val_pdf_dir = '/home/agwave/Data/resume/val_0222/'

    training_data = get_data_from_data_txt(TRAIN_WORD_TO_TAG_PATH)
    with open('supporting_document/train_word_to_tag_0223.json', 'r') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38}
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    start_epoch = 0
    if model_load_path is not None:
        print('load model...')
        checkpoint = torch.load(model_load_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    preliminary_score = get_score_by_model(model, val_json_path, val_pdf_dir)
    print('preliminary score:', preliminary_score)

    for epoch in range(start_epoch, stop_epoch):
        print("---------------------")
        print("running epoch : ", epoch)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, val_json_path, val_pdf_dir)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        if cur_epoch_score > max_score:
            unimprove_time = 0
            max_score = cur_epoch_score
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }, best_model_save_path)
            print('save best model successfully.')
        else:
            break
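Examples No. 1 and No. 2 both call prepare_sequence to turn a token list into a LongTensor of vocabulary indices, but the helper is not shown in these snippets. The sketch below follows the standard PyTorch BiLSTM-CRF tutorial version they appear to rely on.

import torch

def prepare_sequence(seq, to_ix):
    # Map each token to its vocabulary index and return a 1-D LongTensor.
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)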
Example No. 2
def train_all_data():
    embedding_dim = 100
    hidden_dim = 100
    stop_epoch = 1
    model_1_epoch = 'model/model_1_epoch_lr0001.pth'

    training_data = get_data_from_data_txt(DATA_PERFECT_PATH)
    word_to_ix = get_word_to_ix(training_data, min_word_freq=1)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti':46, 'c-comp': 47}

    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(stop_epoch):
        print("---------------------")
        print("running epoch : ", epoch + 1)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, TRAIN_JSON_PATH, TRAIN_PDF_DIR)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        print()
        if epoch == stop_epoch - 1:  # range() is 0-based, so save after the final epoch
            torch.save({
                'model_state_dict': model.state_dict()
            }, model_1_epoch)
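Example No. 2 builds its vocabulary with get_word_to_ix(training_data, min_word_freq=1), which is not shown. A minimal sketch, assuming a frequency cutoff and a reserved '<unk>' slot, might look like this:

from collections import Counter

def get_word_to_ix(training_data, min_word_freq=1):
    # Count token frequencies over all (sentence, tags) pairs.
    counter = Counter(w for sentence, _ in training_data for w in sentence)
    word_to_ix = {'<unk>': 0}  # hypothetical unknown-word index
    for word, freq in counter.items():
        if freq >= min_word_freq:
            word_to_ix[word] = len(word_to_ix)
    return word_to_ix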
Example No. 3
    best_dev_F, new_dev_F, save = evaluating_batch(model, dev_batched,
                                                   best_dev_F)

    if not disable_flag:
        if not early_stopping.early_stop:
            early_stopping(-new_dev_F, model, optimizer)
        else:
            print("Early stopping, now introduce adv examples")
            parameters['launch_epoch'] = epoch
            disable_flag = 1
            sample_count = len(train_batched)

    else:
        if save:
            torch.save(model.state_dict(), model_name)
            best_idx = epoch

    best_test_F, new_test_F, _ = evaluating_batch(model, test_batched,
                                                  best_test_F)

    all_F.append([0.0, new_dev_F, new_test_F])

    sys.stdout.flush()
    print('Epoch %d : train/dev/test : %.2f / %.2f / %.2f - %d' %
          (epoch, new_train_F, new_dev_F, new_test_F, best_idx))
    model.train(True)
    adjust_learning_rate(optimizer,
                         lr=learning_rate /
                         (1 + 0.05 * sample_count / len(train_data)))
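Example No. 3 decays the learning rate each epoch through adjust_learning_rate, whose definition is not included. A common implementation simply overwrites the rate on every parameter group:

def adjust_learning_rate(optimizer, lr):
    # Set the same learning rate on every parameter group of the optimizer.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr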
Example No. 4
optimizer = optim.Adam(model.parameters(),
                       lr=0.001,
                       betas=(0.9, 0.999),
                       eps=1e-08,
                       weight_decay=0)

best_score = 0
for epoch in range(Config.epochs):
    model.train()
    total_loss = 0
    for batch_sentence, batch_label, batch_length in train_dataloder:

        model.zero_grad()

        batch_sentence, batch_label, batch_length, _ = sort_batch_data(
            batch_sentence, batch_label, batch_length)
        if Config.use_gpu:
            batch_sentence = batch_sentence.cuda()
            batch_label = batch_label.cuda()
        loss = model.neg_log_likehood(batch_sentence, batch_label,
                                      batch_length)
        loss.backward()
        optimizer.step()
        total_loss += loss.cpu().item()
    epoch_score = eval(eval_dataset, model)
    if epoch_score > best_score:
        best_score = epoch_score
        torch.save(model.state_dict(), 'model_best.pth')
    print('loss:{0}, epoch_score:{1}, best_score:{2}'.format(
        total_loss / len(train_dataset), epoch_score, best_score))
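Example No. 4 calls sort_batch_data before computing the loss, presumably so the padded batch is ordered by length for packing inside the model. The helper is not shown; a sketch under that assumption (the fourth return value being the sort permutation) is:

import torch

def sort_batch_data(batch_sentence, batch_label, batch_length):
    # Sort a padded batch by sequence length, longest first, and return
    # the permutation so callers can restore the original order if needed.
    sorted_length, sorted_idx = torch.sort(batch_length, descending=True)
    return batch_sentence[sorted_idx], batch_label[sorted_idx], sorted_length, sorted_idx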
Example No. 5
    print(epoch)
    for i in range(len(batch_data)):
        model.zero_grad()
        sen_batch = []
        tag_batch = []
        for data in batch_data[len(batch_data) - 1 - i]:
            sen_batch.append(get_idxseq(data[0], word_to_ix))
            tag_batch.append([tag_to_ix[t] for t in data[1]])
        loss = model.neg_log(
            sen_batch, tag_batch,
            torch.tensor(mask[len(batch_data) - 1 - i],
                         dtype=torch.long).cuda())
        loss.backward()
        print(loss)
        optimizer.step()
    torch.save(model.state_dict(), './params5.pkl')
print(crf.transitions.data)
'''
model.load_state_dict(torch.load('./params3.pkl'))
print(crf.transitions.data)
with codecs.open('./newtrain.txt', encoding='UTF-8') as f:
    train = f.readlines()
    for i in range(20):
        if i % 2 != 0:
            sen = train[i].strip(line_end)
            sen = get_idxseq(sen, word_to_ix)
            print(model(torch.tensor(sen, dtype=torch.long).cuda())[0])
'''

with codecs.open("./test.txt", 'r', encoding='UTF-8') as f:
    test_data = f.readlines()
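Example No. 5 relies on get_idxseq to map a raw character/word sequence to vocabulary ids. It is not defined in the snippet; a hypothetical version with an '<unk>' fallback could be:

def get_idxseq(seq, word_to_ix, unk='<unk>'):
    # Look up each token, falling back to the unknown-word index (assumed to be 0).
    return [word_to_ix.get(w, word_to_ix.get(unk, 0)) for w in seq]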
Example No. 6
                                                 batch_first=True).to(device)
                            labs = pad_sequence(labs,
                                                batch_first=True).to(device)
                            lens = torch.tensor(lens).to(device)
                            lens, idx = torch.sort(lens, descending=True)
                            sents = sents[idx]
                            labs = labs[idx]
                            score, preds = model(sents, lens)
                            for i, l in enumerate(lens):
                                true_labels.append(
                                    seqid2text(labs[i, :l], ix_to_lab))
                                pred_labels.append(
                                    seqid2text(preds[i, :l], ix_to_lab))
                        f1 = f1_score(true_labels, pred_labels)
                        if (f1 > best_f1):
                            torch.save(model.state_dict(),
                                       "models/model-27-02-20")
                            best_f1 = f1

                        print("Accuracy: {:.4f}".format(
                            accuracy_score(true_labels, pred_labels)))
                        print("F1 score: {:.4f}".format(f1))
                        print(classification_report(true_labels, pred_labels))
                        model.train(True)
    if args.do_test:
        with torch.no_grad():
            print("Evaluation on test set")
            model.load_state_dict(
                torch.load("models/model-27-02-20", map_location=device))
            model.eval()
            true_labels = []
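Example No. 6 converts id tensors back to tag strings with seqid2text before scoring them. The helper is not shown; a minimal sketch:

def seqid2text(id_seq, ix_to_lab):
    # Convert a 1-D tensor (or list) of label ids into a list of label strings.
    return [ix_to_lab[int(i)] for i in id_seq]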
Example No. 7
def main():
    parser = argparse.ArgumentParser()
    # parameters
    parser.add_argument("--epoch",
                        default=100,
                        type=int,
                        help="the number of epoches needed to train")
    parser.add_argument("--lr",
                        default=1e-3,
                        type=float,
                        help="the learning rate")
    parser.add_argument("--train_data_path",
                        default='data/train.tsv',
                        type=str,
                        help="train dataset path")
    parser.add_argument("--dev_data_path",
                        default=None,
                        type=str,
                        help="dev dataset path")
    parser.add_argument("--test_data_path",
                        default='data/test.tsv',
                        type=str,
                        help="test dataset path")
    parser.add_argument("--train_batch_size",
                        default=128,
                        type=int,
                        help="the batch size")
    parser.add_argument("--dev_batch_size",
                        default=64,
                        type=int,
                        help="the batch size")
    parser.add_argument("--test_batch_size",
                        default=64,
                        type=int,
                        help="the batch size")
    parser.add_argument("--embedding_path",
                        default='data/sgns.renmin.bigram-char',
                        type=str,
                        help="pre-trained word embeddings path")
    parser.add_argument("--embedding_size",
                        default=300,
                        type=int,
                        help="the word embedding size")
    parser.add_argument("--hidden_size",
                        default=512,
                        type=int,
                        help="the hidden size")
    parser.add_argument("--fine_tuning",
                        default=True,
                        type=bool,
                        help="whether fine-tune word embeddings")
    parser.add_argument("--early_stopping",
                        default=15,
                        type=int,
                        help="Tolerance for early stopping (# of epochs).")
    parser.add_argument("--load_model",
                        default='results/20_Model_best.pt',
                        help="load pretrained model for testing")
    args = parser.parse_args()

    if not args.train_data_path:
        logger.info("please input train dataset path")
        exit()
    if not (args.dev_data_path or args.test_data_path):
        logger.info("please input dev or test dataset path")
        exit()

    TEXT, LABEL, vocab_size, word_embeddings, train_iter, dev_iter, test_iter, tag_dict = \
        dataset.load_dataset(args.train_data_path, args.dev_data_path,
                             args.test_data_path, args.embedding_path,
                             args.train_batch_size, args.dev_batch_size,
                             args.test_batch_size)

    idx_tag = {}
    for tag in tag_dict:
        idx_tag[tag_dict[tag]] = tag

    model = BiLSTM_CRF(args.embedding_size, args.hidden_size, vocab_size,
                       tag_dict, word_embeddings)
    if torch.cuda.is_available():
        model = model.cuda()

    # cost_test = []
    # start = time.perf_counter()
    # train_dev_size = len(train_iter)
    # train_size = int(train_dev_size*0.9)
    train_data, dev_data = dataset.train_dev_split(train_iter, 0.9)
    # for batch in train_data:
    #     print(batch)
    #     exit()

    # train_data = lambda: islice(train_iter,0,train_size)
    # dev_data = lambda: islice(train_iter,train_size,train_dev_size)
    # train_data = islice(train_iter,0,train_size)
    # dev_data = islice(train_iter,train_size,train_dev_size)
    if args.load_model:
        model.load_state_dict(torch.load(args.load_model, map_location='cpu'))
        # p, r, f1, eval_loss, all_assess = eval_model(model, dev_data, idx_tag)
        # logger.info('Eval Loss:%.4f, Eval P:%.4f, Eval R:%.4f, Eval F1:%.4f', \
        #                             eval_loss, p, r, f1)
        p, r, f1, eval_loss, all_assess = eval_model(model, test_iter, idx_tag)
        logger.info('LOC Test P:%.4f, Test R:%.4f, Test F1:%.4f', \
                all_assess['LOC']['P'], all_assess['LOC']['R'], all_assess['LOC']['F'])
        logger.info('PER Test P:%.4f, Test R:%.4f, Test F1:%.4f', \
                all_assess['PER']['P'], all_assess['PER']['R'], all_assess['PER']['F'])
        logger.info('ORG Test P:%.4f, Test R:%.4f, Test F1:%.4f', \
                all_assess['ORG']['P'], all_assess['ORG']['R'], all_assess['ORG']['F'])
        logger.info('Micro_AVG Test P:%.4f, Test R:%.4f, Test F1:%.4f', \
                                    p, r, f1)
        return

    best_score = 0.0
    for epoch in range(args.epoch):
        # train_data_ = copy.deepcopy(train_data)
        # dev_data_ = copy.deepcopy(dev_data)
        # train_model(model, train_data_, dev_data_, epoch, args.lr, idx_tag)
        train_loss, p, r, f1, eval_loss = train_model(model, train_data,
                                                      dev_data, epoch, args.lr,
                                                      idx_tag)

        logger.info('Epoch:%d, Training Loss:%.4f', epoch, train_loss)
        logger.info('Epoch:%d, Eval Loss:%.4f, Eval P:%.4f, Eval R:%.4f, Eval F1:%.4f', \
                                    epoch, eval_loss, p, r, f1)
        # p, r, f1, eval_loss, all_assess = eval_model(model,  test_iter, idx_tag)
        # logger.info('Test Loss:%.4f, Test P:%.4f, Test R:%.4f, Test F1:%.4f', \
        #                             eval_loss, p, r, f1)
        if f1 > best_score:
            best_score = f1
            torch.save(
                model.state_dict(),
                'results/%d_%s_%s.pt' % (epoch, 'Model', str(best_score)))
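A note on the --fine_tuning flag in Example No. 7: argparse's type=bool converts any non-empty string to True, so passing "False" on the command line does not disable fine-tuning. A small explicit parser (not part of the original snippet) avoids the pitfall:

import argparse

def str2bool(v):
    # Explicit boolean parsing; type=bool would treat the string "False" as True.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# Usage: parser.add_argument("--fine_tuning", default=True, type=str2bool,
#                            help="whether to fine-tune word embeddings")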
Example No. 8
                y_pre.append(y_hat)
                y_true.append(y.squeeze(1).tolist())

            y_pre = [[idx2label[idx] for idx in y_pre_idx] for y_pre_idx in y_pre]
            y_true = [[idx2label[idx] for idx in y_true_idx] for y_true_idx in y_true]
            #print(y_pre[10],y_true[10])

            # evaluation metrics
            P = precision_score(y_true, y_pre)
            R = recall_score(y_true, y_pre)
            F1 = f1_score(y_true, y_pre)

            if F1 > best_f1:
                best_f1 = F1
                torch.save(model.state_dict(), args.model_path)

            print('train_step %d, train_loss %.4f, P %.3f, R %.3f, F1 %.4f' % (steps, train_loss_sum / n, P, R, F1))



# print('test')
# model = BiLSTM_CRF(args, label2idx, weight,device).to(device)
# model.load_state_dict(torch.load('./model/best_f1.bin'))
# model.eval()
# m = 0
# y_pre,y_true=[],[]
# for batch in valid_iter:
#     X, y = batch.TEXT, batch.LABEL
#     X, y = X.to(device).long(), y.to(device).long()
#     _, y_hat = model(X)
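The precision_score, recall_score and f1_score calls in Example No. 8 take lists of tag-string sequences, which matches the seqeval API (an assumption, since the import is not shown). Note that seqeval recognizes BIO prefixes in upper case, for example:

from seqeval.metrics import precision_score, recall_score, f1_score

y_true = [['B-PER', 'I-PER', 'O', 'B-ORG']]
y_pred = [['B-PER', 'I-PER', 'O', 'O']]
print(precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred))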
Example No. 9
def my_train():
    os.makedirs(f"model_result", exist_ok=True)
    torch.manual_seed(1)
    device = torch.device('cuda')

    data_dir = f"data/{DATASET}/processed"

    # load the datasets
    train_data = NERDataset(os.path.join(data_dir, "train.pkl"))
    test_data = NERDataset(os.path.join(data_dir, "test.pkl"))
    dev_data = NERDataset(os.path.join(data_dir, "dev.pkl"))

    word_to_idx = load_obj(os.path.join(data_dir, "word_to_idx.pkl"))
    tag_to_idx = load_obj(os.path.join(data_dir, "tag_to_idx.pkl"))

    idx_to_tag = {n: m for m, n in tag_to_idx.items()}

    train_loader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    dev_loader = DataLoader(
        dev_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )

    # build the model
    model = BiLSTM_CRF(len(word_to_idx), len(tag_to_idx), EMBEDDING_DIM,
                       HIDDEN_DIM, DROPOUT).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)

    print("\n开始训练")
    f1_max = 0
    cur_patience = 0  # early-stopping patience counter, used to guard against overfitting
    for epoch in range(EPOCHS):
        model.train()
        for i, (seqs, tags, masks) in enumerate(train_loader, 1):
            optimizer.zero_grad()
            loss = model.loss(seqs.to(device), tags.to(device),
                              masks.to(device))
            loss.backward()
            optimizer.step()
            if i % LOG_INTERVAL == 0:
                print("epoch {}: {:.0f}%\t\tLoss: {:.6f}".format(
                    epoch, 100.0 * i / len(train_loader), loss.item()))
        dev_precision, dev_recall, dev_f1 = evaluate(model, dev_loader,
                                                     idx_to_tag)
        test_precision, test_recall, test_f1 = evaluate(
            model, test_loader, idx_to_tag)
        print(
            f"\ndev\tprecision: {dev_precision}, recall: {dev_recall}, f1: {dev_f1}"
        )
        print(
            f"test\tprecision: {test_precision}, recall: {test_recall}, f1: {test_f1}\n"
        )

        torch.save(model.state_dict(), f"model_result/{epoch}.pt")

        if dev_f1 > f1_max:  # track the best dev F1 to detect overfitting
            f1_max = dev_f1
            cur_patience = 0
            if dev_f1 > 0.9 and test_f1 > 0.9:
                break
        else:
            cur_patience += 1
            if cur_patience >= PATIENCE:  # dev F1 has not improved for PATIENCE epochs, stop
                break
    print("Best dev F1: ", f1_max)
Example No. 10
            loss = model.loss(seqs.to(device), tags.to(device), masks.to(device))
            loss.backward()
            optimizer.step()
            if i % args.log_interval == 0:
                print(
                    "Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch + 1,
                        i * seqs.size(1),
                        len(train_loader.dataset),
                        100.0 * i / len(train_loader),
                        loss.item(),
                    )
                )
        print("Evaluating...")
        dev_precision, dev_recall, dev_f1 = evaluate(model, dev_loader, ix_to_tag)
        test_precision, test_recall, test_f1 = evaluate(model, test_loader, ix_to_tag)
        print(f"\ndev\tprecision: {dev_precision}, recall: {dev_recall}, f1: {dev_f1}")
        print(f"test\tprecision: {test_precision}, recall: {test_recall}, f1: {test_f1}\n")

        torch.save(model.state_dict(), f"checkpoints/{args.name}/model-epoch{epoch}.pt")

        if dev_f1 > best_dev_f1:
            best_dev_f1 = dev_f1
            bad_count = 0
        else:
            bad_count += 1
            if bad_count >= args.patience:
                print("Early stopped!")
                break
    print("Best dev F1: ", best_dev_f1)