def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) for name, param in model.named_parameters(): if param.requires_grad: print(name) if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev_f1 = -1 test_f1 = [] best_epoch = 0 # start training for idx in range(data.HP_iteration): epoch_start = time.time() print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 total_perplexity_1 = 0 total_perplexity_2 = 0 total_loss_1 = 0 total_loss_2 = 0 total_loss_3 = 0 total_loss_4 = 0 random.shuffle(data.source_train_idx) random.shuffle(data.target_train_idx) random.shuffle(data.source_lm_idx) random.shuffle(data.target_lm_idx) model.train() model.zero_grad() batch_size_1 = data.HP_batch_size train_num_1 = len(data.source_train_idx) train_num_2 = len(data.target_train_idx) train_num_3 = len(data.source_lm_idx) train_num_4 = len(data.target_lm_idx) batch_num = train_num_1 // batch_size_1 + 1 batch_size_2 = train_num_2 // batch_num batch_size_3 = train_num_3 // batch_num batch_size_4 = train_num_4 // batch_num for batch_id in range(batch_num): instance_1 = data.source_train_idx[batch_id * batch_size_1: (batch_id + 1) * batch_size_1 if ((batch_id + 1) * batch_size_1) < train_num_1 else train_num_1] instance_2 = data.target_train_idx[batch_id * batch_size_2: (batch_id + 1) * batch_size_2 if ((batch_id + 1) * batch_size_2) < train_num_2 else train_num_2] instance_3 = data.source_lm_idx[batch_id * batch_size_3: (batch_id + 1) * batch_size_3 if ((batch_id + 1) * batch_size_3) < train_num_3 else train_num_3] instance_4 = data.target_lm_idx[batch_id * batch_size_4: (batch_id + 1) * batch_size_4 if ((batch_id + 1) * batch_size_4) < train_num_4 else train_num_4] if not instance_1 or not instance_2: continue # NER batch_word_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, \ batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label(instance_1, data.HP_gpu) batch_word_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, \ batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label(instance_2, data.HP_gpu) # LM batch_word_3, batch_wordlen_3, batch_wordrecover_3, batch_char_3, batch_charlen_3, \ batch_charrecover_3, batch_label_3, lm_seq_tensor_3, mask_3 = batchify_with_label(instance_3 + instance_1, data.HP_gpu) batch_word_4, batch_wordlen_4, batch_wordrecover_4, batch_char_4, batch_charlen_4, \ batch_charrecover_4, batch_label_4, lm_seq_tensor_4, mask_4 = batchify_with_label(instance_4 + instance_2, data.HP_gpu) batch_word = [batch_word_1, batch_word_2, batch_word_3, batch_word_4] batch_wordlen = [batch_wordlen_1, batch_wordlen_2, batch_wordlen_3, 
batch_wordlen_4] batch_char = [batch_char_1, batch_char_2, batch_char_3, batch_char_4] batch_charlen = [batch_charlen_1, batch_charlen_2, batch_charlen_3, batch_charlen_4] batch_charrecover = [batch_charrecover_1, batch_charrecover_2, batch_charrecover_3, batch_charrecover_4] batch_label = [batch_label_1, batch_label_2, batch_label_3, batch_label_4] lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2, lm_seq_tensor_3, lm_seq_tensor_4] mask = [mask_1, mask_2, mask_3, mask_4] instance_count += 1 loss_ = [] perplexity_ = [] # source language model loss, perplexity, tag_seq = model.loss('model1', batch_word[2], batch_wordlen[2], batch_char[2], batch_charlen[2], batch_charrecover[2], batch_label[2], lm_seq_tensor[2], mask[2]) loss_.append(loss) perplexity_.append(perplexity) # source NER loss, perplexity, tag_seq = model.loss('model2', batch_word[0], batch_wordlen[0], batch_char[0], batch_charlen[0], batch_charrecover[0], batch_label[0], lm_seq_tensor[0], mask[0]) loss_.append(loss) # target language model loss, perplexity, tag_seq = model.loss('model3', batch_word[3], batch_wordlen[3], batch_char[3], batch_charlen[3], batch_charrecover[3], batch_label[3], lm_seq_tensor[3], mask[3]) loss_.append(loss) perplexity_.append(perplexity) loss = 0 model_num = len(loss_) for loss_id in range(model_num): loss += loss_[loss_id] loss.backward() optimizer.step() model.zero_grad() total_loss_1 += loss_[0].data[0] total_loss_2 += loss_[1].data[0] total_loss_3 += loss_[2].data[0] total_perplexity_1 += perplexity_[0].data[0] total_perplexity_2 += perplexity_[1].data[0] epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start source_lm_perplexity = math.exp(total_perplexity_1 / batch_num) target_lm_perplexity = math.exp(total_perplexity_2 / batch_num) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % ( idx, epoch_cost, train_num_1 / epoch_cost, total_loss_2)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total perplexity: %.4f" % ( idx, epoch_cost, train_num_3 / epoch_cost, source_lm_perplexity)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % ( idx, epoch_cost, train_num_2 / epoch_cost, total_loss_4)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total perplexity: %.4f" % ( idx, epoch_cost, train_num_4 / epoch_cost, target_lm_perplexity)) if total_loss_1 > 1e8 or str(total_loss_1) == "nan" or total_loss_2 > 1e8 or str( total_loss_2) == "nan" or total_loss_3 > 1e8 or str(total_loss_3) == "nan" or total_loss_4 > 1e8 or str( total_loss_4) == "nan": print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! 
EXIT....") exit(1) # dev-test speed, acc, p, r, f, _, _ = evaluate(data, model, "dev-test") test_f1.append(f[1]) dev_finish = time.time() dev_cost = dev_finish - epoch_finish current_score = f[0] print("Dev-Source: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % ( dev_cost, speed, acc[0], p[0], r[0], f[0])) print("Test-Target: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % ( dev_cost, speed, acc[1], p[1], r[1], f[1])) if current_score > best_dev_f1: best_epoch = idx print("Exceed previous best f score:", best_dev_f1) model_name = data.model_dir + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev_f1 = current_score if current_score > 0.72: print("change optim sgd:") optimizer = optim.SGD(model.parameters(), lr=0.015, momentum=data.HP_momentum, weight_decay=data.HP_l2) print("The best Source-domain dev f-score: %.4f, Target-domain f-score: %.4f" % (best_dev_f1, test_f1[best_epoch]))
def train(train_data): print("Training model...") train_data.show_data_summary() save_data_name = train_data.init_dir + ".init" train_data.save(save_data_name) model = SeqModel(train_data) for name, param in model.named_parameters(): if param.requires_grad: print(name) if train_data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=train_data.HP_lr, momentum=train_data.HP_momentum, weight_decay=train_data.HP_l2) elif train_data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=train_data.HP_lr, weight_decay=train_data.HP_l2) elif train_data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=train_data.HP_lr, weight_decay=train_data.HP_l2) elif train_data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=train_data.HP_lr, weight_decay=train_data.HP_l2) elif train_data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=train_data.HP_lr, weight_decay=train_data.HP_l2) else: print("Optimizer illegal: %s" % train_data.optimizer) exit(1) best_dev = -10 dev_f = [] test_f = [] best_epoch = 0 for idx in range(train_data.HP_iteration): epoch_start = time.time() print("Epoch: %s/%s" % (idx, train_data.HP_iteration)) if train_data.optimizer.lower() == "sgd": optimizer = lr_decay(optimizer, idx, train_data.HP_lr_decay, train_data.HP_lr) random.shuffle(train_data.ner_1_train_idx) random.shuffle(train_data.ner_2_train_idx) random.shuffle(train_data.lm_1_idx) random.shuffle(train_data.lm_2_idx) model.train() model.zero_grad() ner_1_loss = 0 ner_2_loss = 0 lm_1_perplexity = 0 lm_2_perplexity = 0 ner_1_batch_size = train_data.HP_batch_size batch_nums = len(train_data.ner_1_train_idx) // ner_1_batch_size + 1 ner_2_batch_size = len(train_data.ner_2_train_idx) // batch_nums lm_1_batch_size = len(train_data.lm_1_idx) // batch_nums lm_2_batch_size = len(train_data.lm_2_idx) // batch_nums print("batch size: ", ner_1_batch_size, ner_2_batch_size, lm_1_batch_size, lm_2_batch_size) for batch_id in range(batch_nums): ner_1_data = train_data.ner_1_train_idx[batch_id * ner_1_batch_size: (batch_id + 1) * ner_1_batch_size if\ (batch_id + 1) * ner_1_batch_size < len(train_data.ner_1_train_idx) else len(train_data.ner_1_train_idx)] ner_2_data = train_data.ner_2_train_idx[batch_id * ner_2_batch_size: (batch_id + 1) * ner_2_batch_size if\ (batch_id + 1) * ner_2_batch_size < len(train_data.ner_2_train_idx) else len(train_data.ner_2_train_idx)] lm_1_data = train_data.lm_1_idx[batch_id * lm_1_batch_size: (batch_id + 1) * lm_1_batch_size if \ (batch_id + 1) * lm_1_batch_size < len(train_data.lm_1_idx) else len(train_data.lm_1_idx)] lm_2_data = train_data.lm_2_idx[batch_id * lm_2_batch_size: (batch_id + 1) * lm_2_batch_size if \ (batch_id + 1) * lm_2_batch_size < len(train_data.lm_2_idx) else len(train_data.lm_2_idx)] ner_1_batch_data = batchify_with_label(ner_1_data, train_data.HP_gpu) if train_data.mode == 'supervised': ner_2_batch_data = batchify_with_label(ner_2_data, train_data.HP_gpu) lm_1_batch_data = batchify_with_label(lm_1_data, train_data.HP_gpu) lm_2_batch_data = batchify_with_label(lm_2_data, train_data.HP_gpu) losses = [] perplexities = [] # word_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, # char_seq_recover, label_seq_tensor, lm_seq_tensor, mask loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \ model.loss('ner1', ner_1_batch_data[0], ner_1_batch_data[1], ner_1_batch_data[3], ner_1_batch_data[4], ner_1_batch_data[5], 
ner_1_batch_data[6], ner_1_batch_data[7], ner_1_batch_data[8]) losses.append(loss) if train_data.mode == 'supervised': loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \ model.loss('ner2', ner_2_batch_data[0], ner_2_batch_data[1], ner_2_batch_data[3], ner_2_batch_data[4], ner_2_batch_data[5], ner_2_batch_data[6], ner_2_batch_data[7], ner_2_batch_data[8]) losses.append(loss) loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \ model.loss('lm1', lm_1_batch_data[0], lm_1_batch_data[1], lm_1_batch_data[3], lm_1_batch_data[4], lm_1_batch_data[5], lm_1_batch_data[6], lm_1_batch_data[7], lm_1_batch_data[8]) losses.append(loss) perplexities.append(perplexity) loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \ model.loss('lm2', lm_2_batch_data[0], lm_2_batch_data[1], lm_2_batch_data[3], lm_2_batch_data[4], lm_2_batch_data[5], lm_2_batch_data[6], lm_2_batch_data[7], lm_2_batch_data[8]) losses.append(loss) perplexities.append(perplexity) model_loss = 0 loss_rate = [0.8, 1, 0.5, 0.5 ] if train_data.mode == 'supervised' else [1, 1, 1] for loss_id in range(len(losses)): model_loss += losses[loss_id] * loss_rate[loss_id] model_loss.backward() optimizer.step() model.zero_grad() ner_1_loss += losses[0].data[0] ner_2_loss += losses[1].data[0] lm_1_perplexity += perplexities[0].data[0] lm_2_perplexity += perplexities[1].data[0] epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print("Epoch: %s training finished. Time: %.2fs." % (idx, epoch_cost)) print("ner 1 total loss: %s" % ner_1_loss) if train_data.mode == 'supervised': print("ner 2 total loss: %s" % ner_2_loss) print("lm 1 perplexity: %.4f" % math.exp(lm_1_perplexity / batch_nums)) print("lm 2 perplexity: %.4f" % math.exp(lm_2_perplexity / batch_nums)) if ner_1_loss > 1e8 or str( ner_1_loss) == "nan" or ner_2_loss > 1e8 or str( ner_2_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." ) exit(1) evaluate('ner1', train_data.ner_1_dev_idx, train_data.label_alphabet_ner_1, train_data, model) if train_data.mode == 'supervised': p, r, f = evaluate('ner2', train_data.ner_2_dev_idx, train_data.label_alphabet_ner_2, train_data, model) else: p, r, f = evaluate('ner2', train_data.ner_2_dev_idx, train_data.label_alphabet_ner_1, train_data, model) dev_f.append(f) if f > best_dev: best_epoch = idx print("Exceed previous best f score:", best_dev) model_name = train_data.model_dir + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = f evaluate('ner1', train_data.ner_1_test_idx, train_data.label_alphabet_ner_1, train_data, model) if train_data.mode == 'supervised': p, r, f = evaluate('ner2', train_data.ner_2_test_idx, train_data.label_alphabet_ner_2, train_data, model) else: p, r, f = evaluate('ner2', train_data.ner_2_test_idx, train_data.label_alphabet_ner_1, train_data, model) test_f.append(f) print("the best dev score is in epoch %s, dev:%.4f, test:%.4f" % (best_epoch, dev_f[best_epoch], test_f[best_epoch]))
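# The trainers above all size their batches so that several datasets of
# different lengths are consumed in the same number of steps per epoch. A
# self-contained sketch of that slicing pattern (proportional_slices and the
# toy dataset sizes below are illustrative, not from the repo):
def proportional_slices(datasets, main_batch_size):
    # the first dataset fixes the step count; each other dataset contributes
    # len(d) // steps instances per step so all of them run out together
    steps = len(datasets[0]) // main_batch_size + 1
    sizes = [main_batch_size] + [len(d) // steps for d in datasets[1:]]
    for step in range(steps):
        yield [d[step * s: min((step + 1) * s, len(d))]
               for d, s in zip(datasets, sizes)]

# e.g. four toy datasets with a main batch size of 10:
for batch in proportional_slices([list(range(95)), list(range(40)),
                                  list(range(200)), list(range(31))], 10):
    pass  # batch[i] is this step's slice of dataset i
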
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) for name, param in model.named_parameters(): if param.requires_grad: print(name) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev = -10 dev_f = [] test_f = [] perplexity_1 = [] perplexity_2 = [] best_epoch = 0 # data.HP_iteration = 1 LM_data = data.train_Ids_2 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 total_perplexity_1 = 0 total_perplexity_2 = 0 total_loss_1 = 0 total_loss_2 = 0 total_loss_3 = 0 total_loss_4 = 0 random.shuffle(data.train_Ids_1) random.shuffle(data.train_Ids_2) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 ###co-train for 4 models train_num_1 = len(data.train_Ids_1) train_num_2 = len(data.train_Ids_2) train_num_3 = len(LM_data) total_batch_1 = train_num_1 // batch_size + 1 batch_size_2 = train_num_2 // total_batch_1 l_batch_num_2 = train_num_2 - total_batch_1 * batch_size_2 start_2 = end_2 = 0 for batch_id in range(total_batch_1): start = batch_id * batch_size end = (batch_id + 1) * batch_size start_2 = end_2 if batch_id < l_batch_num_2: end_2 = start_2 + (batch_size_2 + 1) else: end_2 = start_2 + batch_size_2 if end > train_num_1: end = train_num_1 if end_2 > train_num_2: end_2 = train_num_2 instance_1 = data.train_Ids_1[start:end] instance_2 = data.train_Ids_2[start_2:end_2] if not instance_1 or not instance_2: continue #seq label batch_word_1, batch_features_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label( instance_1, data.HP_gpu) batch_word_2, batch_features_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label( instance_2, data.HP_gpu) batch_word = [batch_word_1, batch_word_2] batch_features = [batch_features_1, batch_features_2] batch_wordlen = [batch_wordlen_1, batch_wordlen_2] batch_char = [batch_char_1, batch_char_2] batch_charlen = [batch_charlen_1, batch_charlen_2] batch_charrecover = [batch_charrecover_1, batch_charrecover_2] batch_label = [batch_label_1, batch_label_2] lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2] mask = [mask_1, mask_2] instance_count += 1 loss_ = [] perplexity_ = [] # LM 1 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model1', batch_word[0], batch_features[0], batch_wordlen[0], batch_char[0], batch_charlen[0], batch_charrecover[0], batch_label[0], lm_seq_tensor[0], mask[0]) 
loss_.append(loss) perplexity_.append(perplexity) #seq label 1 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model2', batch_word[0], batch_features[0], batch_wordlen[0], batch_char[0], batch_charlen[0], batch_charrecover[0], batch_label[0], lm_seq_tensor[0], mask[0]) loss_.append(loss) # LM 2 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model3', batch_word[1], batch_features[1], batch_wordlen[1], batch_char[1], batch_charlen[1], batch_charrecover[1], batch_label[1], lm_seq_tensor[1], mask[1]) loss_.append(loss) perplexity_.append(perplexity) #seq label 2 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model4', batch_word[1], batch_features[1], batch_wordlen[1], batch_char[1], batch_charlen[1], batch_charrecover[1], batch_label[1], lm_seq_tensor[1], mask[1]) loss_.append(loss) loss_rate = [1.0, 1.0, 1.0, 2.0] loss = 0 model_num = len(loss_) for loss_id in range(model_num): loss += loss_[loss_id] * loss_rate[loss_id] loss.backward() optimizer.step() model.zero_grad() total_loss_1 += loss_[0].data[0] total_loss_2 += loss_[1].data[0] total_loss_3 += loss_[2].data[0] total_loss_4 += loss_[3].data[0] total_perplexity_1 += perplexity_[0].data[0] total_perplexity_2 += perplexity_[1].data[0] epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start LM_perplex_1 = math.exp(total_perplexity_1 / total_batch_1) LM_perplex_2 = math.exp(total_perplexity_2 / total_batch_1) perplexity_1.append(LM_perplex_1) perplexity_2.append(LM_perplex_2) print("Epoch: %s training finished. Time: %.2fs" % (idx, epoch_cost)) print("Epoch: %s training finished. Time: %.2fs, total loss: %s" % (idx, epoch_cost, total_loss_2)) print("totalloss:", total_loss_2) print( "Epoch: %s training finished. Time: %.2fs, total perplexity: %.4f" % (idx, epoch_cost, LM_perplex_1)) print("Epoch: %s training finished. Time: %.2fs, total loss: %s" % (idx, epoch_cost, total_loss_4)) print("totalloss:", total_loss_4) print( "Epoch: %s training finished. 
Time: %.2fs, total perplexity: %.4f" % (idx, epoch_cost, LM_perplex_2)) speed, acc, p, r, f, _, _ = evaluate(data, model, "dev") dev_f.append(f[1]) dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f[1] print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc[0], p[0], r[0], f[0])) print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc[1], p[1], r[1], f[1])) else: current_score = acc[1] print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc[0])) print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc[1])) if current_score > best_dev: best_epoch = idx if data.seg: print("Exceed previous best f score:", best_dev) else: print("Exceed previous best acc score:", best_dev) # model_name = data.model_dir +'.'+ str(idx) + ".model" model_name = data.model_dir + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = current_score # ## decode test speed, acc, p, r, f, _, _ = evaluate(data, model, "test") test_f.append(f[1]) test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc[0], p[0], r[0], f[0])) print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc[1], p[1], r[1], f[1])) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc[0])) print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc[1])) gc.collect() print("the best dev score is in epoch %s, dev:%.4f, test:%.4f" % (best_epoch, dev_f[best_epoch], test_f[best_epoch])) with open('data/fscore_13PC.txt', 'w') as ft: ft.write('dev f scores:\n') for t in dev_f: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') ft.write('test f scores:\n') for t in test_f: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') ft.write('LM 1 perplexity:\n') for t in perplexity_1: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') ft.write('LM 2 perplexity:\n') for t in perplexity_2: ft.write(str(round(t, 6))) ft.write(' ') if data.task_emb_save_dir is not None: with open('data/task_emb.txt', 'w') as ft: for task, i in data.task_alphabet.iteritems(): ft.write(task) ft.write(' ') for t in model.word_hidden.LSTM_param_generator.task_emb_vocab.weight.data[ i]: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') if data.domain_emb_save_dir is not None: with open('data/domain_emb.txt', 'w') as fd: for domain, i in data.domain_alphabet.iteritems(): fd.write(domain) fd.write(' ') for t in model.word_hidden.LSTM_param_generator.domain_emb_vocab.weight.data[ i]: fd.write(str(round(t, 6))) fd.write(' ') fd.write('\n')
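
# Each trainer sums per-task losses (optionally weighted by loss_rate) and runs
# a single backward pass through the shared parameters. A minimal standalone
# sketch of that pattern; the shared encoder and the two linear "task heads"
# below are toy stand-ins, not the repo's SeqModel:
import torch
import torch.nn as nn
import torch.nn.functional as F

shared = nn.Linear(8, 8)                                   # shared encoder
heads = nn.ModuleList([nn.Linear(8, 3), nn.Linear(8, 5)])  # one head per task
loss_rate = [1.0, 2.0]                                     # per-task weights
opt = torch.optim.SGD(list(shared.parameters()) + list(heads.parameters()), lr=0.1)

x = torch.randn(4, 8)
targets = [torch.randint(0, 3, (4,)), torch.randint(0, 5, (4,))]
losses = [F.cross_entropy(head(shared(x)), t) for head, t in zip(heads, targets)]
total = sum(w * l for w, l in zip(loss_rate, losses))  # weighted joint loss
total.backward()                                       # one backward pass updates the shared encoder from all tasks
opt.step()
opt.zero_grad()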