Example No. 1
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)

    model = SeqModel(data)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)

    best_dev_f1 = -1
    test_f1 = []
    best_epoch = 0

    # start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer.lower() == "sgd":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)

        instance_count = 0
        total_perplexity_1 = 0
        total_perplexity_2 = 0

        total_loss_1 = 0
        total_loss_2 = 0
        total_loss_3 = 0
        total_loss_4 = 0

        random.shuffle(data.source_train_idx)
        random.shuffle(data.target_train_idx)
        random.shuffle(data.source_lm_idx)
        random.shuffle(data.target_lm_idx)

        model.train()
        model.zero_grad()

        batch_size_1 = data.HP_batch_size
        train_num_1 = len(data.source_train_idx)
        train_num_2 = len(data.target_train_idx)
        train_num_3 = len(data.source_lm_idx)
        train_num_4 = len(data.target_lm_idx)

        batch_num = train_num_1 // batch_size_1 + 1
        batch_size_2 = train_num_2 // batch_num
        batch_size_3 = train_num_3 // batch_num
        batch_size_4 = train_num_4 // batch_num

        for batch_id in range(batch_num):

            end_1 = min((batch_id + 1) * batch_size_1, train_num_1)
            end_2 = min((batch_id + 1) * batch_size_2, train_num_2)
            end_3 = min((batch_id + 1) * batch_size_3, train_num_3)
            end_4 = min((batch_id + 1) * batch_size_4, train_num_4)
            instance_1 = data.source_train_idx[batch_id * batch_size_1: end_1]
            instance_2 = data.target_train_idx[batch_id * batch_size_2: end_2]
            instance_3 = data.source_lm_idx[batch_id * batch_size_3: end_3]
            instance_4 = data.target_lm_idx[batch_id * batch_size_4: end_4]

            if not instance_1 or not instance_2:
                continue

            # NER
            batch_word_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, \
            batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label(instance_1, data.HP_gpu)

            batch_word_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, \
            batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label(instance_2, data.HP_gpu)

            # LM
            batch_word_3, batch_wordlen_3, batch_wordrecover_3, batch_char_3, batch_charlen_3, \
            batch_charrecover_3, batch_label_3, lm_seq_tensor_3, mask_3 = batchify_with_label(instance_3 + instance_1, data.HP_gpu)
            batch_word_4, batch_wordlen_4, batch_wordrecover_4, batch_char_4, batch_charlen_4, \
            batch_charrecover_4, batch_label_4, lm_seq_tensor_4, mask_4 = batchify_with_label(instance_4 + instance_2, data.HP_gpu)

            batch_word = [batch_word_1, batch_word_2, batch_word_3, batch_word_4]
            batch_wordlen = [batch_wordlen_1, batch_wordlen_2, batch_wordlen_3, batch_wordlen_4]

            batch_char = [batch_char_1, batch_char_2, batch_char_3, batch_char_4]
            batch_charlen = [batch_charlen_1, batch_charlen_2, batch_charlen_3, batch_charlen_4]

            batch_charrecover = [batch_charrecover_1, batch_charrecover_2, batch_charrecover_3, batch_charrecover_4]
            batch_label = [batch_label_1, batch_label_2, batch_label_3, batch_label_4]

            lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2, lm_seq_tensor_3, lm_seq_tensor_4]
            mask = [mask_1, mask_2, mask_3, mask_4]

            instance_count += 1
            loss_ = []
            perplexity_ = []

            # source language model
            loss, perplexity, tag_seq = model.loss('model1', batch_word[2], batch_wordlen[2], batch_char[2],
                                                   batch_charlen[2], batch_charrecover[2], batch_label[2],
                                                   lm_seq_tensor[2], mask[2])
            loss_.append(loss)
            perplexity_.append(perplexity)
            # source NER
            loss, perplexity, tag_seq = model.loss('model2', batch_word[0], batch_wordlen[0], batch_char[0],
                                                   batch_charlen[0], batch_charrecover[0], batch_label[0],
                                                   lm_seq_tensor[0], mask[0])

            loss_.append(loss)
            # target language model
            loss, perplexity, tag_seq = model.loss('model3', batch_word[3], batch_wordlen[3], batch_char[3],
                                                   batch_charlen[3], batch_charrecover[3], batch_label[3],
                                                   lm_seq_tensor[3], mask[3])

            loss_.append(loss)
            perplexity_.append(perplexity)
            loss = 0
            model_num = len(loss_)
            for loss_id in range(model_num):
                loss += loss_[loss_id]
            loss.backward()
            optimizer.step()
            model.zero_grad()

            total_loss_1 += loss_[0].data[0]
            total_loss_2 += loss_[1].data[0]
            total_loss_3 += loss_[2].data[0]

            total_perplexity_1 += perplexity_[0].data[0]
            total_perplexity_2 += perplexity_[1].data[0]


        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        source_lm_perplexity = math.exp(total_perplexity_1 / batch_num)
        target_lm_perplexity = math.exp(total_perplexity_2 / batch_num)

        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s" % (
            idx, epoch_cost, train_num_1 / epoch_cost, total_loss_2))
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total perplexity: %.4f" % (
            idx, epoch_cost, train_num_3 / epoch_cost, source_lm_perplexity))

        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s" % (
            idx, epoch_cost, train_num_2 / epoch_cost, total_loss_4))
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total perplexity: %.4f" % (
            idx, epoch_cost, train_num_4 / epoch_cost, target_lm_perplexity))

        if any(l > 1e8 or math.isnan(l) for l in
               (total_loss_1, total_loss_2, total_loss_3, total_loss_4)):
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)

        # dev-test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev-test")
        test_f1.append(f[1])
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        current_score = f[0]
        print("Dev-Source: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (
            dev_cost, speed, acc[0], p[0], r[0], f[0]))
        print("Test-Target: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (
            dev_cost, speed, acc[1], p[1], r[1], f[1]))

        if current_score > best_dev_f1:
            best_epoch = idx
            print("Exceed previous best f score:", best_dev_f1)
            model_name = data.model_dir + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev_f1 = current_score

        if current_score > 0.72:
            print("change optim sgd:")
            optimizer = optim.SGD(model.parameters(), lr=0.015, momentum=data.HP_momentum,
                                  weight_decay=data.HP_l2)
        print("The best Source-domain dev f-score: %.4f, Target-domain f-score: %.4f" % (best_dev_f1, test_f1[best_epoch]))
Example No. 2
def train(train_data):
    print("Training model...")
    train_data.show_data_summary()
    save_data_name = train_data.init_dir + ".init"
    train_data.save(save_data_name)
    model = SeqModel(train_data)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    if train_data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=train_data.HP_lr,
                              momentum=train_data.HP_momentum,
                              weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=train_data.HP_lr,
                                  weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=train_data.HP_lr,
                                   weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=train_data.HP_lr,
                                  weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=train_data.HP_lr,
                               weight_decay=train_data.HP_l2)
    else:
        print("Optimizer illegal: %s" % train_data.optimizer)
        exit(1)

    best_dev = -10
    dev_f = []
    test_f = []
    best_epoch = 0

    for idx in range(train_data.HP_iteration):
        epoch_start = time.time()
        print("Epoch: %s/%s" % (idx, train_data.HP_iteration))
        if train_data.optimizer.lower() == "sgd":
            optimizer = lr_decay(optimizer, idx, train_data.HP_lr_decay,
                                 train_data.HP_lr)

        random.shuffle(train_data.ner_1_train_idx)
        random.shuffle(train_data.ner_2_train_idx)
        random.shuffle(train_data.lm_1_idx)
        random.shuffle(train_data.lm_2_idx)

        model.train()
        model.zero_grad()

        ner_1_loss = 0
        ner_2_loss = 0
        lm_1_perplexity = 0
        lm_2_perplexity = 0

        ner_1_batch_size = train_data.HP_batch_size
        batch_nums = len(train_data.ner_1_train_idx) // ner_1_batch_size + 1
        ner_2_batch_size = len(train_data.ner_2_train_idx) // batch_nums
        lm_1_batch_size = len(train_data.lm_1_idx) // batch_nums
        lm_2_batch_size = len(train_data.lm_2_idx) // batch_nums

        print("batch size: ", ner_1_batch_size, ner_2_batch_size,
              lm_1_batch_size, lm_2_batch_size)

        for batch_id in range(batch_nums):
            ner_1_end = min((batch_id + 1) * ner_1_batch_size, len(train_data.ner_1_train_idx))
            ner_2_end = min((batch_id + 1) * ner_2_batch_size, len(train_data.ner_2_train_idx))
            lm_1_end = min((batch_id + 1) * lm_1_batch_size, len(train_data.lm_1_idx))
            lm_2_end = min((batch_id + 1) * lm_2_batch_size, len(train_data.lm_2_idx))
            ner_1_data = train_data.ner_1_train_idx[batch_id * ner_1_batch_size: ner_1_end]
            ner_2_data = train_data.ner_2_train_idx[batch_id * ner_2_batch_size: ner_2_end]
            lm_1_data = train_data.lm_1_idx[batch_id * lm_1_batch_size: lm_1_end]
            lm_2_data = train_data.lm_2_idx[batch_id * lm_2_batch_size: lm_2_end]

            ner_1_batch_data = batchify_with_label(ner_1_data,
                                                   train_data.HP_gpu)
            if train_data.mode == 'supervised':
                ner_2_batch_data = batchify_with_label(ner_2_data,
                                                       train_data.HP_gpu)
            lm_1_batch_data = batchify_with_label(lm_1_data, train_data.HP_gpu)
            lm_2_batch_data = batchify_with_label(lm_2_data, train_data.HP_gpu)

            losses = []
            perplexities = []

            # word_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths,
            # char_seq_recover,  label_seq_tensor, lm_seq_tensor, mask
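            # The nine tensors above are the order in which batchify_with_label
            # returns its outputs; the model.loss calls below pass indices 0-1 and
            # 3-8 and skip index 2 (word_seq_recover), which is presumably only
            # needed to restore the original sentence order at evaluation time.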
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                model.loss('ner1', ner_1_batch_data[0], ner_1_batch_data[1], ner_1_batch_data[3], ner_1_batch_data[4],
                           ner_1_batch_data[5], ner_1_batch_data[6], ner_1_batch_data[7], ner_1_batch_data[8])
            losses.append(loss)

            if train_data.mode == 'supervised':
                loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                    model.loss('ner2', ner_2_batch_data[0], ner_2_batch_data[1], ner_2_batch_data[3],
                               ner_2_batch_data[4], ner_2_batch_data[5], ner_2_batch_data[6], ner_2_batch_data[7],
                               ner_2_batch_data[8])
                losses.append(loss)

            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                model.loss('lm1', lm_1_batch_data[0], lm_1_batch_data[1], lm_1_batch_data[3], lm_1_batch_data[4],
                           lm_1_batch_data[5], lm_1_batch_data[6], lm_1_batch_data[7], lm_1_batch_data[8])
            losses.append(loss)
            perplexities.append(perplexity)

            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                model.loss('lm2', lm_2_batch_data[0], lm_2_batch_data[1], lm_2_batch_data[3], lm_2_batch_data[4],
                           lm_2_batch_data[5], lm_2_batch_data[6], lm_2_batch_data[7], lm_2_batch_data[8])
            losses.append(loss)
            perplexities.append(perplexity)

            model_loss = 0
            loss_rate = [0.8, 1, 0.5, 0.5] if train_data.mode == 'supervised' else [1, 1, 1]
            for loss_id in range(len(losses)):
                model_loss += losses[loss_id] * loss_rate[loss_id]
            model_loss.backward()
            optimizer.step()
            model.zero_grad()

            ner_1_loss += losses[0].data[0]
            if train_data.mode == 'supervised':
                ner_2_loss += losses[1].data[0]
            lm_1_perplexity += perplexities[0].data[0]
            lm_2_perplexity += perplexities[1].data[0]

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start

        print("Epoch: %s training finished. Time: %.2fs." % (idx, epoch_cost))
        print("ner 1 total loss: %s" % ner_1_loss)
        if train_data.mode == 'supervised':
            print("ner 2 total loss: %s" % ner_2_loss)
        print("lm 1 perplexity: %.4f" % math.exp(lm_1_perplexity / batch_nums))
        print("lm 2 perplexity: %.4f" % math.exp(lm_2_perplexity / batch_nums))

        if any(l > 1e8 or math.isnan(l) for l in (ner_1_loss, ner_2_loss)):
            print(
                "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            )
            exit(1)

        evaluate('ner1', train_data.ner_1_dev_idx,
                 train_data.label_alphabet_ner_1, train_data, model)
        if train_data.mode == 'supervised':
            p, r, f = evaluate('ner2', train_data.ner_2_dev_idx,
                               train_data.label_alphabet_ner_2, train_data,
                               model)
        else:
            p, r, f = evaluate('ner2', train_data.ner_2_dev_idx,
                               train_data.label_alphabet_ner_1, train_data,
                               model)
        dev_f.append(f)

        if f > best_dev:
            best_epoch = idx
            print("Exceed previous best f score:", best_dev)
            model_name = train_data.model_dir + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = f

        evaluate('ner1', train_data.ner_1_test_idx,
                 train_data.label_alphabet_ner_1, train_data, model)
        if train_data.mode == 'supervised':
            p, r, f = evaluate('ner2', train_data.ner_2_test_idx,
                               train_data.label_alphabet_ner_2, train_data,
                               model)
        else:
            p, r, f = evaluate('ner2', train_data.ner_2_test_idx,
                               train_data.label_alphabet_ner_1, train_data,
                               model)
        test_f.append(f)
    print("the best dev score is in epoch %s, dev:%.4f, test:%.4f" %
          (best_epoch, dev_f[best_epoch], test_f[best_epoch]))
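
Both examples pair a primary NER dataset with one or more auxiliary datasets by deriving each auxiliary batch size from the primary one, so that every index list is consumed in the same number of batches per epoch. A self-contained illustration of that proportional slicing, using hypothetical toy lists in place of the real index lists:

primary = list(range(10))     # stands in for ner_1_train_idx / source_train_idx
auxiliary = list(range(23))   # stands in for an auxiliary list such as lm_1_idx

primary_batch_size = 4
batch_nums = len(primary) // primary_batch_size + 1    # 3 batches per epoch
auxiliary_batch_size = len(auxiliary) // batch_nums     # 7 auxiliary items per batch

for batch_id in range(batch_nums):
    p_end = min((batch_id + 1) * primary_batch_size, len(primary))
    a_end = min((batch_id + 1) * auxiliary_batch_size, len(auxiliary))
    p = primary[batch_id * primary_batch_size: p_end]
    a = auxiliary[batch_id * auxiliary_batch_size: a_end]
    print(batch_id, p, a)

Because the auxiliary batch size comes from floor division, the trailing len(auxiliary) % batch_nums auxiliary items (two in this toy run) are skipped for the epoch; the shuffle at the start of each epoch changes which items those are.
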
Example No. 3
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_dev = -10
    dev_f = []
    test_f = []
    perplexity_1 = []
    perplexity_2 = []
    best_epoch = 0
    # data.HP_iteration = 1
    LM_data = data.train_Ids_2
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer.lower() == "sgd":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        total_perplexity_1 = 0
        total_perplexity_2 = 0
        total_loss_1 = 0
        total_loss_2 = 0
        total_loss_3 = 0
        total_loss_4 = 0
        random.shuffle(data.train_Ids_1)
        random.shuffle(data.train_Ids_2)

        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0

        ### co-train for 4 models
        train_num_1 = len(data.train_Ids_1)
        train_num_2 = len(data.train_Ids_2)
        train_num_3 = len(LM_data)
        total_batch_1 = train_num_1 // batch_size + 1
        batch_size_2 = train_num_2 // total_batch_1
        l_batch_num_2 = train_num_2 - total_batch_1 * batch_size_2

        start_2 = end_2 = 0

        for batch_id in range(total_batch_1):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            start_2 = end_2
            if batch_id < l_batch_num_2:
                end_2 = start_2 + (batch_size_2 + 1)
            else:
                end_2 = start_2 + batch_size_2

            if end > train_num_1:
                end = train_num_1
            if end_2 > train_num_2:
                end_2 = train_num_2

            instance_1 = data.train_Ids_1[start:end]
            instance_2 = data.train_Ids_2[start_2:end_2]

            if not instance_1 or not instance_2:
                continue
            #seq label
            batch_word_1, batch_features_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label(
                instance_1, data.HP_gpu)
            batch_word_2, batch_features_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label(
                instance_2, data.HP_gpu)

            batch_word = [batch_word_1, batch_word_2]
            batch_features = [batch_features_1, batch_features_2]
            batch_wordlen = [batch_wordlen_1, batch_wordlen_2]
            batch_char = [batch_char_1, batch_char_2]
            batch_charlen = [batch_charlen_1, batch_charlen_2]
            batch_charrecover = [batch_charrecover_1, batch_charrecover_2]
            batch_label = [batch_label_1, batch_label_2]
            lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2]
            mask = [mask_1, mask_2]
            instance_count += 1
            loss_ = []
            perplexity_ = []

            # LM 1
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model1', batch_word[0], batch_features[0], batch_wordlen[0],
                batch_char[0], batch_charlen[0], batch_charrecover[0],
                batch_label[0], lm_seq_tensor[0], mask[0])

            loss_.append(loss)
            perplexity_.append(perplexity)

            #seq label 1
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model2', batch_word[0], batch_features[0], batch_wordlen[0],
                batch_char[0], batch_charlen[0], batch_charrecover[0],
                batch_label[0], lm_seq_tensor[0], mask[0])
            loss_.append(loss)

            # LM 2
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model3', batch_word[1], batch_features[1], batch_wordlen[1],
                batch_char[1], batch_charlen[1], batch_charrecover[1],
                batch_label[1], lm_seq_tensor[1], mask[1])

            loss_.append(loss)
            perplexity_.append(perplexity)

            #seq label 2
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model4', batch_word[1], batch_features[1], batch_wordlen[1],
                batch_char[1], batch_charlen[1], batch_charrecover[1],
                batch_label[1], lm_seq_tensor[1], mask[1])
            loss_.append(loss)

            loss_rate = [1.0, 1.0, 1.0, 2.0]
            loss = 0
            model_num = len(loss_)
            for loss_id in range(model_num):
                loss += loss_[loss_id] * loss_rate[loss_id]
            loss.backward()
            optimizer.step()
            model.zero_grad()

            total_loss_1 += loss_[0].data[0]
            total_loss_2 += loss_[1].data[0]
            total_loss_3 += loss_[2].data[0]
            total_loss_4 += loss_[3].data[0]
            total_perplexity_1 += perplexity_[0].data[0]
            total_perplexity_2 += perplexity_[1].data[0]

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        LM_perplex_1 = math.exp(total_perplexity_1 / total_batch_1)
        LM_perplex_2 = math.exp(total_perplexity_2 / total_batch_1)
        perplexity_1.append(LM_perplex_1)
        perplexity_2.append(LM_perplex_2)
        print("Epoch: %s training finished. Time: %.2fs" % (idx, epoch_cost))
        print("Epoch: %s training finished. Time: %.2fs,   total loss: %s" %
              (idx, epoch_cost, total_loss_2))
        print("totalloss:", total_loss_2)
        print(
            "Epoch: %s training finished. Time: %.2fs,  total perplexity: %.4f"
            % (idx, epoch_cost, LM_perplex_1))
        print("Epoch: %s training finished. Time: %.2fs,   total loss: %s" %
              (idx, epoch_cost, total_loss_4))
        print("totalloss:", total_loss_4)
        print(
            "Epoch: %s training finished. Time: %.2fs,   total perplexity: %.4f"
            % (idx, epoch_cost, LM_perplex_2))

        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_f.append(f[1])
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f[1]
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc[0], p[0], r[0], f[0]))
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc[1], p[1], r[1], f[1]))
        else:
            current_score = acc[1]
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc[0]))
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc[1]))

        if current_score > best_dev:
            best_epoch = idx
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            # model_name = data.model_dir +'.'+ str(idx) + ".model"
            model_name = data.model_dir + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "test")
        test_f.append(f[1])
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc[0], p[0], r[0], f[0]))
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc[1], p[1], r[1], f[1]))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc[0]))
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc[1]))
        gc.collect()

    print("the best dev score is in epoch %s, dev:%.4f, test:%.4f" %
          (best_epoch, dev_f[best_epoch], test_f[best_epoch]))
    with open('data/fscore_13PC.txt', 'w') as ft:
        ft.write('dev f scores:\n')
        for t in dev_f:
            ft.write(str(round(t, 6)))
            ft.write(' ')
        ft.write('\n')
        ft.write('test f scores:\n')
        for t in test_f:
            ft.write(str(round(t, 6)))
            ft.write(' ')
        ft.write('\n')
        ft.write('LM 1 perplexity:\n')
        for t in perplexity_1:
            ft.write(str(round(t, 6)))
            ft.write(' ')
        ft.write('\n')
        ft.write('LM 2 perplexity:\n')
        for t in perplexity_2:
            ft.write(str(round(t, 6)))
            ft.write(' ')

    if data.task_emb_save_dir is not None:
        with open('data/task_emb.txt', 'w') as ft:
            for task, i in data.task_alphabet.iteritems():
                ft.write(task)
                ft.write(' ')
                for t in model.word_hidden.LSTM_param_generator.task_emb_vocab.weight.data[
                        i]:
                    ft.write(str(round(t, 6)))
                    ft.write(' ')
                ft.write('\n')
    if data.domain_emb_save_dir is not None:
        with open('data/domain_emb.txt', 'w') as fd:
            for domain, i in data.domain_alphabet.iteritems():
                fd.write(domain)
                fd.write(' ')
                for t in model.word_hidden.LSTM_param_generator.domain_emb_vocab.weight.data[
                        i]:
                    fd.write(str(round(t, 6)))
                    fd.write(' ')
                fd.write('\n')
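
The per-epoch perplexity printed above assumes that the value returned by model.loss for the language-model heads is a per-batch mean negative log-likelihood, so exponentiating its mean over batches gives the epoch perplexity. A toy sketch of that bookkeeping, with made-up values:

import math

# Hypothetical per-batch mean negative log-likelihoods (nats per token).
batch_nll = [4.1, 3.8, 3.5, 3.3]

total_perplexity = sum(batch_nll)
perplexity = math.exp(total_perplexity / len(batch_nll))  # mirrors math.exp(total_perplexity_1 / total_batch_1)
print("epoch perplexity: %.4f" % perplexity)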