Example #1
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)

    model = SeqModel(data)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)

    best_dev_f1 = -1
    test_f1 = []
    best_epoch = 0

    # start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)

        instance_count = 0
        total_perplexity_1 = 0
        total_perplexity_2 = 0

        total_loss_1 = 0
        total_loss_2 = 0
        total_loss_3 = 0
        total_loss_4 = 0

        random.shuffle(data.source_train_idx)
        random.shuffle(data.target_train_idx)
        random.shuffle(data.source_lm_idx)
        random.shuffle(data.target_lm_idx)

        model.train()
        model.zero_grad()

        batch_size_1 = data.HP_batch_size
        train_num_1 = len(data.source_train_idx)
        train_num_2 = len(data.target_train_idx)
        train_num_3 = len(data.source_lm_idx)
        train_num_4 = len(data.target_lm_idx)

        batch_num = train_num_1 // batch_size_1 + 1
        batch_size_2 = train_num_2 // batch_num
        batch_size_3 = train_num_3 // batch_num
        batch_size_4 = train_num_4 // batch_num

        for batch_id in range(batch_num):

            instance_1 = data.source_train_idx[
                batch_id * batch_size_1: min((batch_id + 1) * batch_size_1, train_num_1)]
            instance_2 = data.target_train_idx[
                batch_id * batch_size_2: min((batch_id + 1) * batch_size_2, train_num_2)]
            instance_3 = data.source_lm_idx[
                batch_id * batch_size_3: min((batch_id + 1) * batch_size_3, train_num_3)]
            instance_4 = data.target_lm_idx[
                batch_id * batch_size_4: min((batch_id + 1) * batch_size_4, train_num_4)]

            if not instance_1 or not instance_2:
                continue

            # NER
            batch_word_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, \
            batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label(instance_1, data.HP_gpu)

            batch_word_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, \
            batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label(instance_2, data.HP_gpu)

            # LM
            batch_word_3, batch_wordlen_3, batch_wordrecover_3, batch_char_3, batch_charlen_3, \
            batch_charrecover_3, batch_label_3, lm_seq_tensor_3, mask_3 = batchify_with_label(instance_3 + instance_1, data.HP_gpu)
            batch_word_4, batch_wordlen_4, batch_wordrecover_4, batch_char_4, batch_charlen_4, \
            batch_charrecover_4, batch_label_4, lm_seq_tensor_4, mask_4 = batchify_with_label(instance_4 + instance_2, data.HP_gpu)

            batch_word = [batch_word_1, batch_word_2, batch_word_3, batch_word_4]
            batch_wordlen = [batch_wordlen_1, batch_wordlen_2, batch_wordlen_3, batch_wordlen_4]

            batch_char = [batch_char_1, batch_char_2, batch_char_3, batch_char_4]
            batch_charlen = [batch_charlen_1, batch_charlen_2, batch_charlen_3, batch_charlen_4]

            batch_charrecover = [batch_charrecover_1, batch_charrecover_2, batch_charrecover_3, batch_charrecover_4]
            batch_label = [batch_label_1, batch_label_2, batch_label_3, batch_label_4]

            lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2, lm_seq_tensor_3, lm_seq_tensor_4]
            mask = [mask_1, mask_2, mask_3, mask_4]

            instance_count += 1
            loss_ = []
            perplexity_ = []

            # source language model
            loss, perplexity, tag_seq = model.loss('model1', batch_word[2], batch_wordlen[2], batch_char[2],
                                                   batch_charlen[2], batch_charrecover[2], batch_label[2],
                                                   lm_seq_tensor[2], mask[2])
            loss_.append(loss)
            perplexity_.append(perplexity)
            # source NER
            loss, perplexity, tag_seq = model.loss('model2', batch_word[0], batch_wordlen[0], batch_char[0],
                                                   batch_charlen[0], batch_charrecover[0], batch_label[0],
                                                   lm_seq_tensor[0], mask[0])

            loss_.append(loss)
            # target language model
            loss, perplexity, tag_seq = model.loss('model3', batch_word[3], batch_wordlen[3], batch_char[3],
                                                   batch_charlen[3], batch_charrecover[3], batch_label[3],
                                                   lm_seq_tensor[3], mask[3])

            loss_.append(loss)
            perplexity_.append(perplexity)
            loss = 0
            model_num = len(loss_)
            for loss_id in range(model_num):
                loss += loss_[loss_id]
            loss.backward()
            optimizer.step()
            model.zero_grad()

            total_loss_1 += loss_[0].data[0]
            total_loss_2 += loss_[1].data[0]
            total_loss_3 += loss_[2].data[0]

            total_perplexity_1 += perplexity_[0].data[0]
            total_perplexity_2 += perplexity_[1].data[0]


        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        source_lm_perplexity = math.exp(total_perplexity_1 / batch_num)
        target_lm_perplexity = math.exp(total_perplexity_2 / batch_num)

        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s" % (
            idx, epoch_cost, train_num_1 / epoch_cost, total_loss_2))
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total perplexity: %.4f" % (
            idx, epoch_cost, train_num_3 / epoch_cost, source_lm_perplexity))

        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s" % (
            idx, epoch_cost, train_num_2 / epoch_cost, total_loss_4))
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total perplexity: %.4f" % (
            idx, epoch_cost, train_num_4 / epoch_cost, target_lm_perplexity))

        if (total_loss_1 > 1e8 or str(total_loss_1) == "nan"
                or total_loss_2 > 1e8 or str(total_loss_2) == "nan"
                or total_loss_3 > 1e8 or str(total_loss_3) == "nan"
                or total_loss_4 > 1e8 or str(total_loss_4) == "nan"):
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)

        # dev-test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev-test")
        test_f1.append(f[1])
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        current_score = f[0]
        print("Dev-Source: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (
            dev_cost, speed, acc[0], p[0], r[0], f[0]))
        print("Test-Target: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (
            dev_cost, speed, acc[1], p[1], r[1], f[1]))

        if current_score > best_dev_f1:
            best_epoch = idx
            print("Exceed previous best f score:", best_dev_f1)
            model_name = data.model_dir + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev_f1 = current_score

        if current_score > 0.72:
            print("change optim sgd:")
            optimizer = optim.SGD(model.parameters(), lr=0.015, momentum=data.HP_momentum,
                                  weight_decay=data.HP_l2)
        print("The best Source-domain dev f-score: %.4f, Target-domain f-score: %.4f" % (best_dev_f1, test_f1[best_epoch]))
Example #2
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("--------pytorch total params--------")
    print(pytorch_total_params)

    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_dev = -10
    best_test = -10
    best_epoch = -1
    no_imprv_epoch = 0
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %
              (idx, data.HP_iteration))  # print (self.train_Ids)
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)

        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]

            if not instance:
                continue

            # label_instance = [[i for i in range(0, data.label_alphabet_size + 1)] for _ in range(len(instance))]

            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, input_label_seq_tensor = batchify_with_label(
                instance, data.HP_gpu, data.label_alphabet_size)

            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask,
                input_label_seq_tensor)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.data[0]
            total_loss += loss.data[0]
            if end % 500 == 0:
                # temp_time = time.time()
                # temp_cost = temp_time - temp_start
                # temp_start = temp_time
                # print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print(
                        "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                    )

                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            if data.whether_clip_grad:
                from torch.nn.utils import clip_grad_norm
                clip_grad_norm(model.parameters(), data.clip_grad)
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
              (end, temp_cost, sample_loss, right_token, whole_token,
               (right_token + 0.) / whole_token))

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        print("totalloss:", total_loss)
        if total_loss > 1e8 or str(total_loss) == "nan":
            print(
                "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            )
            # exit(1)
        # continue
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc))

        # ## decode test
        speed, acc_test, p, r, f_test, _, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc_test, p, r, f_test))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc_test))

        if current_score > best_dev:
            if data.seg:
                best_test = f_test
                print("Exceed previous best f score:", best_dev)
            else:
                best_test = acc_test
                print("Exceed previous best acc score:", best_dev)
            best_epoch = idx
            # model_name = data.model_dir +'.'+ str(idx) + ".model"
            # print("Save current best model in file:", model_name)
            # torch.save(model.state_dict(), model_name)
            best_dev = current_score
            no_imprv_epoch = 0

        else:
            # early stop
            no_imprv_epoch += 1
            if no_imprv_epoch >= 10:
                print("early stop")
                print("Current best f score in dev", best_dev)
                print("Current best f score in test", best_test)
                break

        if data.seg:
            print("Current best f score in dev", best_dev)
            print("Current best f score in test", best_test)
        else:
            print("Current best acc score in dev", best_dev)
            print("Current best acc score in test", best_test)
        gc.collect()
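
predict_check, which Examples #2 through #5 and #8 use to track a running token accuracy, is another helper that is not included here. A plausible sketch, assuming it counts agreements between predicted and gold tags over the non-padded positions of the mask:

import numpy as np

def predict_check(pred_variable, gold_variable, mask_variable):
    # Move everything to numpy and count correct, non-padded tokens.
    pred = pred_variable.cpu().data.numpy()
    gold = gold_variable.cpu().data.numpy()
    mask = mask_variable.cpu().data.numpy()
    right_token = np.sum((pred == gold) * mask)
    whole_token = mask.sum()
    return right_token, whole_token
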
Example #3
def train(data, meansentfeats = False):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir +".dset"
    data.save(save_data_name)
    model = SeqModel(data, meansentfeats = meansentfeats)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s"%(data.optimizer))
        exit(1)
    best_dev = -10
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %(idx,data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num//batch_size+1
        for batch_id in range(total_batch):
            start = batch_id*batch_size
            end = (batch_id+1)*batch_size
            if end >train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.data[0]
            total_loss += loss.data[0]
            if end%500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))
        print("totalloss:", total_loss)
        if total_loss > 1e8 or str(total_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        # continue
        speed, acc, p, r, f, _,_ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir +'.'+ str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _,_ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
        gc.collect()
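
The optimizer-selection chain at the top of each example is a plain dispatch on data.optimizer. The same logic can be expressed as a table lookup; the sketch below is not taken from any of the originals and only assumes the hyperparameter names used above:

import torch.optim as optim

def build_optimizer(data, model):
    # SGD is the only optimizer here that also takes the momentum hyperparameter.
    name = data.optimizer.lower()
    if name == "sgd":
        return optim.SGD(model.parameters(), lr=data.HP_lr,
                         momentum=data.HP_momentum, weight_decay=data.HP_l2)
    constructors = {
        "adagrad": optim.Adagrad,
        "adadelta": optim.Adadelta,
        "rmsprop": optim.RMSprop,
        "adam": optim.Adam,
    }
    if name not in constructors:
        raise ValueError("Optimizer illegal: %s" % data.optimizer)
    return constructors[name](model.parameters(), lr=data.HP_lr,
                              weight_decay=data.HP_l2)
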
Example #4
def train(data):
    print "Training model..."

    model = SeqModel(data)
    print "model:{}".format(model)

    if data.gpu:
        model.cuda()

    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.lr,
                              momentum=data.momentum,
                              weight_decay=data.l2)
        if data.use_mapping:
            optimizer_wc = optim.SGD(model.word_hidden.wordrep.w.parameters(),
                                     lr=data.lr,
                                     momentum=data.momentum,
                                     weight_decay=data.l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.lr,
                               weight_decay=data.l2)
        if data.use_mapping:
            optimizer_wc = optim.Adam(model.word_hidden.wordrep.w.parameters(),
                                      lr=data.lr,
                                      weight_decay=data.l2)
    else:
        print("Optimizer illegal: %s , use sgd or adam." % data.optimizer)
        exit(0)

    best_dev = -10
    best_dev_epoch = -1
    best_test = -10
    best_test_epoch = -1
    # start training
    for idx in range(data.iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx + 1, data.iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.lr_decay, data.lr)
            if data.use_mapping:
                optimizer_wc = lr_decay(optimizer_wc, idx, data.lr_decay,
                                        data.lr)

        instance_count = 0
        sample_id = 0
        #
        sample_loss = 0
        sample_mapping_loss = 0

        total_loss = 0
        total_mapping_loss = 0

        right_token = 0
        whole_token = 0

        random.shuffle(data.train_Ids)

        # set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1

        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]

            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, batch_trans, trans_seq_lengths, trans_seq_recover, mask = batchify_with_label(
                instance, data.gpu)
            instance_count += 1
            loss, tag_seq, wc_loss = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask,
                batch_trans, trans_seq_lengths, trans_seq_recover)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            #sample_loss += loss.data[0]
            sample_loss += loss.data.item()
            if data.use_mapping:
                sample_mapping_loss += wc_loss.data.item()

            #total_loss += loss.data[0]
            total_loss += loss.data.item()
            if data.use_mapping:
                total_mapping_loss += wc_loss.data.item()
            if batch_id % data.show_loss_per_batch == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                if data.use_mapping:
                    print(
                        "     Instance: %s; Time: %.2fs; loss: %.4f; mapping_loss: %.4f; acc: %s/%s=%.4f"
                        % (batch_id, temp_cost, sample_loss,
                           sample_mapping_loss, right_token, whole_token,
                           (right_token + 0.) / whole_token))
                else:
                    print(
                        "     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                        % (batch_id, temp_cost, sample_loss, right_token,
                           whole_token, (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
                sample_mapping_loss = 0

            if data.use_trans and data.use_mapping:
                for param in model.word_hidden.wordrep.w.parameters():
                    param.requires_grad = False
                loss.backward(retain_graph=True)
                if data.clip is not None:
                    torch.nn.utils.clip_grad_norm(model.parameters(),
                                                  data.clip)
                optimizer.step()
                model.zero_grad()
                for param in model.word_hidden.wordrep.w.parameters():
                    param.requires_grad = True
                wc_loss.backward()
                optimizer_wc.step()
                model.zero_grad()
            else:
                loss.backward()
                # torch.nn.utils.clip_grad_norm(model.parameters(), data.clip)
                optimizer.step()
                model.zero_grad()

        temp_time = time.time()
        temp_cost = temp_time - temp_start
        if data.use_mapping:
            print(
                "     Instance: %s; Time: %.2fs; loss: %.4f; mapping_loss: %.4f; acc: %s/%s=%.4f"
                % (batch_id, temp_cost, sample_loss, sample_mapping_loss,
                   right_token, whole_token, (right_token + 0.) / whole_token))
        else:
            print(
                "     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
                (batch_id, temp_cost, sample_loss, right_token, whole_token,
                 (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        if data.use_mapping:
            print(
                "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s,total mapping loss: %s"
                % (idx + 1, epoch_cost, train_num / epoch_cost, total_loss,
                   total_mapping_loss))
        else:
            print(
                "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
                % (idx + 1, epoch_cost, train_num / epoch_cost, total_loss))

        # continue
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev", data.nbest)
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print "Exceed previous best f score:", best_dev
            else:
                print "Exceed previous best acc score:", best_dev
            if data.save_model:
                model_name = data.model_dir + data.state_training_name + '.' + str(
                    current_score)[2:-1]
                print "Save current best model in file:", model_name
                torch.save(model.state_dict(), model_name)
            best_dev = current_score
            best_dev_epoch = idx
            # ## decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "test", data.nbest)
        if f > best_test:
            best_test = f
            best_test_epoch = idx

        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc))
        print('best_dev_score: %.4f, best_dev_epoch:%d' %
              (best_dev, best_dev_epoch))
        print('best_test_score: %.4f, best_test_epoch:%d' %
              (best_test, best_test_epoch))
        gc.collect()
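
Examples #2 and #4 clip gradients with torch.nn.utils.clip_grad_norm, which has since been deprecated in favour of the in-place clip_grad_norm_. A small sketch of the same backward/clip/step sequence under the current name (the helper and its arguments are an assumption, not part of any example):

from torch.nn.utils import clip_grad_norm_

def step_with_clipping(loss, model, optimizer, max_norm):
    # Backward pass, clip the global gradient norm, then update and reset.
    loss.backward()
    clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    model.zero_grad()
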
Example #5
def train(data):
    ''' initialize model and train
    '''
    print('Training model...')
    data.show_data_summary()
    # save dset
    save_data_name = data.model_dir + '.dset'
    data.save(save_data_name)
    # save exportable model architecture (for deployment)
    data.save_export(data.model_dir + '.xpt')
    model = SeqModel(data)
    if data.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print('Optimizer illegal: {}'.format(data.optimizer))
        exit(1)
    best_dev = -10
    ## start training
    for idx in range(data.iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch {}/{}".format(idx, data.iteration))
        if data.optimizer.lower() == 'sgd':
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.batch_size
        #batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            instance_texts = data.train_texts[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
                instance, data.HP_gpu, volatile_flag=False, label_flag=True)

            #print(batch_char.size())
            #print(batch_char.max())
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print(' Instance {}; Time {:.2}s; loss {:.4}; acc {}/{}={:.4}'.
                      format(end, temp_cost, sample_loss, right_token,
                             whole_token, (right_token + 0.) / whole_token))
                if sample_loss > 1e8 or str(sample_loss) == 'nan':
                    print(
                        'ERROR: LOSS EXPLOSION (>1e8) ! Please set adapted parameters and structure! EXIT ...'
                    )
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0

            loss.backward()
            optimizer.step()
            model.zero_grad()

        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print(' Instance {}; Time {:.2}s; loss {:.4}; acc {}/{}={:.4}'.format(
            end, temp_cost, sample_loss, right_token, whole_token,
            (right_token + 0.) / whole_token))

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            ' Epoch: {} training finished. Time: {:.2}s; speed: {:.2}st/s; total loss: {}'
            .format(idx, epoch_cost, train_num / epoch_cost, total_loss))
        if total_loss > 1e8 or str(total_loss) == 'nan':
            print(
                'ERROR: LOSS EXPLOSION (>1e8) ! Please set adapted parameters and structure! EXIT ...'
            )
            exit(1)
        # continue
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        # saving dev results json for model analysis
        dev_res = tuple((speed, acc, p, r, f))

        if data.seg:
            current_score = f
            print(
                "Dev: time: {:.2}s, speed {:.2}st/s; acc: {:.4}, p: {:.4}, r: {:.4}, f: {:.4}"
                .format(dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: {:.2}s, speed {:.2}st/s; acc: {:.4}".format(
                dev_cost, speed, acc))

        # decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, 'test')
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: {:.2}s, speed {:.2}st/s; acc: {:.4}, p: {:.4}, r: {:.4}, f: {:.4}"
                .format(test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: {:.2}s, speed {:.2}st/s; acc: {:.4}".format(
                test_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print('"Exceed previous best f score:', best_dev)
            else:
                print('"Exceed previous best acc score:', best_dev)

            model_name = data.model_dir + '.' + str(idx) + '.model'
            print('Save current best model in file:', model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
            path2info = data.model_dir + '.infos'
            save_infos(data, dev_res, path2info)

        gc.collect()
    print('Training done!')
    return best_dev
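
Every example persists its best checkpoint with torch.save(model.state_dict(), model_name), but none of them shows the corresponding restore step. It follows the standard PyTorch pattern; the sketch assumes the same SeqModel, data and model_name objects as above:

import torch

# Rebuild the model with the same configuration, then load the saved weights.
model = SeqModel(data)
model.load_state_dict(torch.load(model_name))
model.eval()  # switch to evaluation mode before decoding
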
Example #6
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_dev = -10
    dev_f = []
    test_f = []
    perplexity_1 = []
    perplexity_2 = []
    best_epoch = 0
    # data.HP_iteration = 1
    LM_data = data.train_Ids_2
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        total_perplexity_1 = 0
        total_perplexity_2 = 0
        total_loss_1 = 0
        total_loss_2 = 0
        total_loss_3 = 0
        total_loss_4 = 0
        random.shuffle(data.train_Ids_1)
        random.shuffle(data.train_Ids_2)

        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0

        ###co-train for 4 models
        train_num_1 = len(data.train_Ids_1)
        train_num_2 = len(data.train_Ids_2)
        train_num_3 = len(LM_data)
        total_batch_1 = train_num_1 // batch_size + 1
        batch_size_2 = train_num_2 // total_batch_1
        l_batch_num_2 = train_num_2 - total_batch_1 * batch_size_2

        start_2 = end_2 = 0

        for batch_id in range(total_batch_1):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            start_2 = end_2
            if batch_id < l_batch_num_2:
                end_2 = start_2 + (batch_size_2 + 1)
            else:
                end_2 = start_2 + batch_size_2

            if end > train_num_1:
                end = train_num_1
            if end_2 > train_num_2:
                end_2 = train_num_2

            instance_1 = data.train_Ids_1[start:end]
            instance_2 = data.train_Ids_2[start_2:end_2]

            if not instance_1 or not instance_2:
                continue
            #seq label
            batch_word_1, batch_features_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label(
                instance_1, data.HP_gpu)
            batch_word_2, batch_features_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label(
                instance_2, data.HP_gpu)

            batch_word = [batch_word_1, batch_word_2]
            batch_features = [batch_features_1, batch_features_2]
            batch_wordlen = [batch_wordlen_1, batch_wordlen_2]
            batch_char = [batch_char_1, batch_char_2]
            batch_charlen = [batch_charlen_1, batch_charlen_2]
            batch_charrecover = [batch_charrecover_1, batch_charrecover_2]
            batch_label = [batch_label_1, batch_label_2]
            lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2]
            mask = [mask_1, mask_2]
            instance_count += 1
            loss_ = []
            perplexity_ = []

            # LM 1
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model1', batch_word[0], batch_features[0], batch_wordlen[0],
                batch_char[0], batch_charlen[0], batch_charrecover[0],
                batch_label[0], lm_seq_tensor[0], mask[0])

            loss_.append(loss)
            perplexity_.append(perplexity)

            #seq label 1
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model2', batch_word[0], batch_features[0], batch_wordlen[0],
                batch_char[0], batch_charlen[0], batch_charrecover[0],
                batch_label[0], lm_seq_tensor[0], mask[0])
            loss_.append(loss)

            # LM 2
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model3', batch_word[1], batch_features[1], batch_wordlen[1],
                batch_char[1], batch_charlen[1], batch_charrecover[1],
                batch_label[1], lm_seq_tensor[1], mask[1])

            loss_.append(loss)
            perplexity_.append(perplexity)

            #seq label 2
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss(
                'model4', batch_word[1], batch_features[1], batch_wordlen[1],
                batch_char[1], batch_charlen[1], batch_charrecover[1],
                batch_label[1], lm_seq_tensor[1], mask[1])
            loss_.append(loss)

            loss_rate = [1.0, 1.0, 1.0, 2.0]
            loss = 0
            model_num = len(loss_)
            for loss_id in range(model_num):
                loss += loss_[loss_id] * loss_rate[loss_id]
            loss.backward()
            optimizer.step()
            model.zero_grad()

            total_loss_1 += loss_[0].data[0]
            total_loss_2 += loss_[1].data[0]
            total_loss_3 += loss_[2].data[0]
            total_loss_4 += loss_[3].data[0]
            total_perplexity_1 += perplexity_[0].data[0]
            total_perplexity_2 += perplexity_[1].data[0]

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        LM_perplex_1 = math.exp(total_perplexity_1 / total_batch_1)
        LM_perplex_2 = math.exp(total_perplexity_2 / total_batch_1)
        perplexity_1.append(LM_perplex_1)
        perplexity_2.append(LM_perplex_2)
        print("Epoch: %s training finished. Time: %.2fs" % (idx, epoch_cost))
        print("Epoch: %s training finished. Time: %.2fs,   total loss: %s" %
              (idx, epoch_cost, total_loss_2))
        print("totalloss:", total_loss_2)
        print(
            "Epoch: %s training finished. Time: %.2fs,  total perplexity: %.4f"
            % (idx, epoch_cost, LM_perplex_1))
        print("Epoch: %s training finished. Time: %.2fs,   total loss: %s" %
              (idx, epoch_cost, total_loss_4))
        print("totalloss:", total_loss_4)
        print(
            "Epoch: %s training finished. Time: %.2fs,   total perplexity: %.4f"
            % (idx, epoch_cost, LM_perplex_2))

        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_f.append(f[1])
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f[1]
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc[0], p[0], r[0], f[0]))
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc[1], p[1], r[1], f[1]))
        else:
            current_score = acc[1]
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc[0]))
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc[1]))

        if current_score > best_dev:
            best_epoch = idx
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            # model_name = data.model_dir +'.'+ str(idx) + ".model"
            model_name = data.model_dir + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "test")
        test_f.append(f[1])
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc[0], p[0], r[0], f[0]))
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc[1], p[1], r[1], f[1]))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc[0]))
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc[1]))
        gc.collect()

    print("the best dev score is in epoch %s, dev:%.4f, test:%.4f" %
          (best_epoch, dev_f[best_epoch], test_f[best_epoch]))
    with open('data/fscore_13PC.txt', 'w') as ft:
        ft.write('dev f scores:\n')
        for t in dev_f:
            ft.write(str(round(t, 6)))
            ft.write(' ')
        ft.write('\n')
        ft.write('test f scores:\n')
        for t in test_f:
            ft.write(str(round(t, 6)))
            ft.write(' ')
        ft.write('\n')
        ft.write('LM 1 perplexity:\n')
        for t in perplexity_1:
            ft.write(str(round(t, 6)))
            ft.write(' ')
        ft.write('\n')
        ft.write('LM 2 perplexity:\n')
        for t in perplexity_2:
            ft.write(str(round(t, 6)))
            ft.write(' ')

    if data.task_emb_save_dir is not None:
        with open('data/task_emb.txt', 'w') as ft:
            for task, i in data.task_alphabet.items():
                ft.write(task)
                ft.write(' ')
                for t in model.word_hidden.LSTM_param_generator.task_emb_vocab.weight.data[
                        i]:
                    ft.write(str(round(t, 6)))
                    ft.write(' ')
                ft.write('\n')
    if data.domain_emb_save_dir is not None:
        with open('data/domain_emb.txt', 'w') as fd:
            for domain, i in data.domain_alphabet.items():
                fd.write(domain)
                fd.write(' ')
                for t in model.word_hidden.LSTM_param_generator.domain_emb_vocab.weight.data[
                        i]:
                    fd.write(str(round(t, 6)))
                    fd.write(' ')
                fd.write('\n')
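
Several of the examples detect a diverged loss with the idiom str(total_loss) == "nan". For the float losses accumulated here, math.isnan expresses the same check more directly; a sketch (the helper name is an assumption):

import math

def loss_exploded(total_loss, threshold=1e8):
    # Same effect as `total_loss > 1e8 or str(total_loss) == "nan"` above.
    return total_loss > threshold or math.isnan(total_loss)
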
Example #7
def pipeline(data, ner_dir, re_dir):

    seq_model = SeqModel(data)
    seq_wordseq = WordSequence(data, False, True, True, data.use_char)

    classify_wordseq = WordSequence(data, True, False, True, False)
    classify_model = ClassifyModel(data)
    if torch.cuda.is_available():
        classify_model = classify_model.cuda(data.HP_gpu)

    iter_parameter = itertools.chain(
        *map(list, [seq_wordseq.parameters(),
                    seq_model.parameters()]))
    seq_optimizer = optim.Adam(iter_parameter,
                               lr=opt.ner_lr,
                               weight_decay=data.HP_l2)
    iter_parameter = itertools.chain(*map(
        list, [classify_wordseq.parameters(),
               classify_model.parameters()]))
    classify_optimizer = optim.Adam(iter_parameter,
                                    lr=opt.re_lr,
                                    weight_decay=data.HP_l2)

    if data.tune_wordemb == False:
        my_utils.freeze_net(seq_wordseq.wordrep.word_embedding)
        my_utils.freeze_net(classify_wordseq.wordrep.word_embedding)

    re_X_positive = []
    re_Y_positive = []
    re_X_negative = []
    re_Y_negative = []
    relation_vocab = data.re_feature_alphabets[
        data.re_feature_name2id['[RELATION]']]
    my_collate = my_utils.sorted_collate1
    for i in range(len(data.re_train_X)):
        x = data.re_train_X[i]
        y = data.re_train_Y[i]

        if y != relation_vocab.get_index("</unk>"):
            re_X_positive.append(x)
            re_Y_positive.append(y)
        else:
            re_X_negative.append(x)
            re_Y_negative.append(y)

    re_test_loader = DataLoader(my_utils.RelationDataset(
        data.re_test_X, data.re_test_Y),
                                data.HP_batch_size,
                                shuffle=False,
                                collate_fn=my_collate)

    best_ner_score = -1
    best_re_score = -1

    for idx in range(data.HP_iteration):
        epoch_start = time.time()

        seq_wordseq.train()
        seq_wordseq.zero_grad()
        seq_model.train()
        seq_model.zero_grad()

        classify_wordseq.train()
        classify_wordseq.zero_grad()
        classify_model.train()
        classify_model.zero_grad()

        batch_size = data.HP_batch_size

        random.shuffle(data.train_Ids)
        ner_train_num = len(data.train_Ids)
        ner_total_batch = ner_train_num // batch_size + 1

        re_train_loader, re_train_iter = makeRelationDataset(
            re_X_positive, re_Y_positive, re_X_negative, re_Y_negative,
            data.unk_ratio, True, my_collate, data.HP_batch_size)
        re_total_batch = len(re_train_loader)

        total_batch = max(ner_total_batch, re_total_batch)
        min_batch = min(ner_total_batch, re_total_batch)

        for batch_id in range(total_batch):

            if batch_id < ner_total_batch:
                start = batch_id * batch_size
                end = (batch_id + 1) * batch_size
                if end > ner_train_num:
                    end = ner_train_num
                instance = data.train_Ids[start:end]
                batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, \
                    batch_permute_label = batchify_with_label(instance, data.HP_gpu)

                hidden = seq_wordseq.forward(batch_word, batch_features,
                                             batch_wordlen, batch_char,
                                             batch_charlen, batch_charrecover,
                                             None, None)
                hidden_adv = None
                loss, tag_seq = seq_model.neg_log_likelihood_loss(
                    hidden, hidden_adv, batch_label, mask)
                loss.backward()
                seq_optimizer.step()
                seq_wordseq.zero_grad()
                seq_model.zero_grad()

            if batch_id < re_total_batch:
                [batch_word, batch_features, batch_wordlen, batch_wordrecover, \
                 batch_char, batch_charlen, batch_charrecover, \
                 position1_seq_tensor, position2_seq_tensor, e1_token, e1_length, e2_token, e2_length, e1_type, e2_type, \
                 tok_num_betw, et_num], [targets, targets_permute] = my_utils.endless_get_next_batch_without_rebatch1(
                    re_train_loader, re_train_iter)

                hidden = classify_wordseq.forward(batch_word, batch_features,
                                                  batch_wordlen, batch_char,
                                                  batch_charlen,
                                                  batch_charrecover,
                                                  position1_seq_tensor,
                                                  position2_seq_tensor)
                hidden_adv = None
                loss, pred = classify_model.neg_log_likelihood_loss(
                    hidden, hidden_adv, batch_wordlen, e1_token, e1_length,
                    e2_token, e2_length, e1_type, e2_type, tok_num_betw,
                    et_num, targets)
                loss.backward()
                classify_optimizer.step()
                classify_wordseq.zero_grad()
                classify_model.zero_grad()

        epoch_finish = time.time()
        print("epoch: %s training finished. Time: %.2fs" %
              (idx, epoch_finish - epoch_start))

        # _, _, _, _, f, _, _ = ner.evaluate(data, seq_wordseq, seq_model, "test")
        ner_score = ner.evaluate1(data, seq_wordseq, seq_model, "test")
        print("ner evaluate: f: %.4f" % (ner_score))

        re_score = relation_extraction.evaluate(classify_wordseq,
                                                classify_model, re_test_loader)
        print("re evaluate: f: %.4f" % (re_score))

        if ner_score + re_score > best_ner_score + best_re_score:
            print("new best score: ner: %.4f , re: %.4f" %
                  (ner_score, re_score))
            best_ner_score = ner_score
            best_re_score = re_score

            torch.save(seq_wordseq.state_dict(),
                       os.path.join(ner_dir, 'wordseq.pkl'))
            torch.save(seq_model.state_dict(),
                       os.path.join(ner_dir, 'model.pkl'))
            torch.save(classify_wordseq.state_dict(),
                       os.path.join(re_dir, 'wordseq.pkl'))
            torch.save(classify_model.state_dict(),
                       os.path.join(re_dir, 'model.pkl'))
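
Example #7 builds each optimizer over the joint parameters of two modules via itertools.chain(*map(list, [...])). itertools.chain accepts iterables directly, so the map/list indirection is unnecessary; an equivalent sketch using the names from that example:

import itertools
import torch.optim as optim

# Equivalent to the chain(*map(list, [...])) construction above.
seq_optimizer = optim.Adam(
    itertools.chain(seq_wordseq.parameters(), seq_model.parameters()),
    lr=opt.ner_lr, weight_decay=data.HP_l2)
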
Example #8
def train(data):
    print "Training model..."
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)

    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(0)
    best_dev = -10

    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0

        sample_loss = {idtask: 0 for idtask in range(data.HP_tasks)}
        right_token = {idtask: 0 for idtask in range(data.HP_tasks)}
        whole_token = {idtask: 0 for idtask in range(data.HP_tasks)}
        random.shuffle(data.train_Ids)
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue

            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
                instance, data.HP_gpu, inference=False)
            instance_count += 1

            loss, losses, tag_seq = model.neg_log_likelihood_loss(
                batch_word,
                batch_features,
                batch_wordlen,
                batch_char,
                batch_charlen,
                batch_charrecover,
                batch_label,
                mask,
                inference=False)
            for idtask in range(data.HP_tasks):
                right, whole = predict_check(tag_seq[idtask],
                                             batch_label[idtask], mask)
                sample_loss[idtask] += losses[idtask].item()
                right_token[idtask] += right
                whole_token[idtask] += whole
                if end % 500 == 0:
                    temp_time = time.time()
                    temp_cost = temp_time - temp_start
                    temp_start = temp_time
                    print(
                        "     Instance: %s; Task %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                        % (end, idtask, temp_cost, sample_loss[idtask],
                           right_token[idtask], whole_token[idtask],
                           (right_token[idtask] + 0.) / whole_token[idtask]))
                    if sample_loss[idtask] > 1e8 or str(sample_loss[idtask]) == "nan":
                        print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                        exit(0)
                    sys.stdout.flush()
                    sample_loss[idtask] = 0

            if end % 500 == 0:
                print "--------------------------------------------------------------------------"

            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            model.zero_grad()

        temp_time = time.time()
        temp_cost = temp_time - temp_start
        for idtask in range(data.HP_tasks):
            print(
                "     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
                (end, temp_cost, sample_loss[idtask], right_token[idtask],
                 whole_token[idtask],
                 (right_token[idtask] + 0.) / whole_token[idtask]))

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        print "totalloss:", total_loss
        if total_loss > 1e8 or str(total_loss) == "nan":
            print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            exit(0)

        summary = evaluate(data, model, "dev", False, False)

        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        current_scores = []
        for idtask in range(data.HP_tasks):
            speed, acc, p, r, f, pred_labels, _ = summary[idtask]
            if data.seg:
                current_score = f
                current_scores.append(f)
                print(
                    "Task %d Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                    % (idtask, dev_cost, speed, acc, p, r, f))
            else:
                current_score = acc
                current_scores.append(acc)
                print("Task %d Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                      (idtask, dev_cost, speed, acc))

        pred_results_tasks = []
        pred_scores_tasks = []
        for idtask in range(data.HP_tasks):
            speed, acc, p, r, f, pred_results, pred_scores = summary[idtask]
            pred_results_tasks.append(pred_results)
            pred_scores_tasks.append(pred_scores)

        # EVALUATING ON DEV SET FOR CHOOSING THE BEST MODEL

        # MULTITASK LEARNING OF BOTH CONSTITUENCY AND DEPENDENCY PARSING
        if data.dependency_parsing and data.constituency_parsing:

            # CONSTITUENCY PARSING
            with tempfile.NamedTemporaryFile() as f_decode_mt:
                with tempfile.NamedTemporaryFile() as f_decode_st:

                    if len(data.index_of_main_tasks) > 1:
                        data.decode_dir = f_decode_mt.name
                        decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')
                        # transform between @ and {}
                        rebuild.rebuild_tree(data.decode_dir, decoded_st_dir)
                    else:
                        if data.decode_dir is None:
                            data.decode_dir = f_decode_st.name
                            decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')

                    # evaluate the output comparing to the gold
                    command = [
                        "PYTHONPATH=" + data.cons2label, "python",
                        data.evaluate, " --input ", decoded_st_dir, " --gold ",
                        data.gold_dev_cons, " --evalb ", data.evalb, ">",
                        f_decode_mt.name
                    ]

                    os.system(" ".join(command))
                    current_score_cons = float([
                        l for l in f_decode_mt.read().split("\n")
                        if l.startswith("Bracketing FMeasure")
                    ][0].split("=")[1])
                    print(current_score_cons)

            # DEPENDENCY PARSING
            with tempfile.NamedTemporaryFile() as f_decode_mt:
                with tempfile.NamedTemporaryFile() as f_decode_st:

                    if len(data.index_of_main_tasks) > 1:
                        data.decode_dir = f_decode_mt.name
                        decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')

                    else:

                        print("else")
                        if data.decode_dir is None:
                            data.decode_dir = f_decode_st.name
                            decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')

                    output_nn = open(data.decode_dir)

                    tmp = tempfile.NamedTemporaryFile().name

                    decodeDependencies.decode(output_nn, tmp, data.language)
                    current_score_depen = float(
                        decodeDependencies.evaluateDependencies(
                            data.gold_dev_dep, tmp))
                    print(current_score_depen)

        # SINGLE OR MULTITASK CONSTITUENCY PARSING
        elif data.constituency_parsing:

            with tempfile.NamedTemporaryFile() as f_decode_mt:
                with tempfile.NamedTemporaryFile() as f_decode_st:

                    if len(data.index_of_main_tasks) > 1:
                        data.decode_dir = f_decode_mt.name
                        decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')
                        # transform between @ and {}
                        rebuild.rebuild_tree(data.decode_dir, decoded_st_dir)
                    else:

                        if data.decode_dir is None:
                            data.decode_dir = f_decode_st.name
                            decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')

                    command = [
                        "PYTHONPATH=" + data.cons2label, "python",
                        data.evaluate, " --input ", decoded_st_dir, " --gold ",
                        data.gold_dev_cons, " --evalb ", data.evalb, ">",
                        f_decode_mt.name
                    ]

                    os.system(" ".join(command))
                    current_score = float([
                        l for l in f_decode_mt.read().split("\n")
                        if l.startswith("Bracketing FMeasure")
                    ][0].split("=")[1])
                    print "Current Score (from EVALB)", current_score, "Previous best dev (from EVALB)", best_dev

        # SINGLE OR MULTITASK DEPENDENCY PARSING
        elif data.dependency_parsing:

            with tempfile.NamedTemporaryFile() as f_decode_mt:
                with tempfile.NamedTemporaryFile() as f_decode_st:

                    # If we are learning multiple tasks, we handle the decoded
                    # output as sequence labeling
                    if len(data.index_of_main_tasks) > 1:
                        data.decode_dir = f_decode_mt.name
                        decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')

                    else:

                        if data.decode_dir is None:
                            data.decode_dir = f_decode_st.name
                            decoded_st_dir = f_decode_st.name
                        data.write_decoded_results(pred_results_tasks, 'dev')

                    output_nn = open(data.decode_dir)
                    tmp = tempfile.NamedTemporaryFile().name

                    decodeDependencies.decode(output_nn, tmp, data.language)
                    current_score = decodeDependencies.evaluateDependencies(
                        data.gold_dev_dep, tmp)
                    print "Current Score (from LAS)", current_score, "Previous best dev (from LAS)", best_dev

        else:

            current_score = sum(current_scores) / len(current_scores)
            print "Current Score", current_score, "Previous best dev", best_dev

        # SAVE THE BEST MODEL

        # by default save model with highest harmonic mean when parsing both
        # dependency and constituency trees
        if data.dependency_parsing and data.constituency_parsing:
            harmonic_mean = (2 * current_score_cons * current_score_depen) / \
                (current_score_cons + current_score_depen)
            if harmonic_mean > best_dev:
                print("New harmonic mean " + repr(harmonic_mean))
                print("Exceed previous best harmonic mean score: " +
                      repr(best_dev) + " LAS " + repr(current_score_depen) +
                      " F1 " + repr(current_score_cons))
                model_name = data.model_dir + ".model"
                print "Overwritting model to", model_name
                torch.save(model.state_dict(), model_name)
                best_dev = harmonic_mean
            else:
                print("sofar the best " + repr(best_dev))
        else:

            if current_score > best_dev:
                if data.seg:
                    print "Exceed previous best f score:", best_dev
                else:
                    print "Exceed previous best acc score:", best_dev

                model_name = data.model_dir + ".model"
                print "Overwritting model to", model_name
                torch.save(model.state_dict(), model_name)

                best_dev = current_score
            else:
                print("sofar the best " + repr(best_dev))

        summary = evaluate(data, model, "test", False)
        test_finish = time.time()
        test_cost = test_finish - dev_finish

        for idtask in range(data.HP_tasks):
            speed, acc, p, r, f, _, _ = summary[idtask]
            if data.seg:
                current_score = f
                print(
                    "Task %d Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                    % (idtask, test_cost, speed, acc, p, r, f))
            else:
                current_score = acc
                print("Task %d Test: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                      (idtask, test_cost, speed, acc))

        gc.collect()
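
Note: every loop in this listing calls an lr_decay helper when the optimizer is SGD, but the helper itself never appears. Below is a minimal sketch, assuming the usual NCRF++-style inverse-time decay; the exact formula in the original repositories may differ.

# Hypothetical sketch of the lr_decay helper used above (not shown in the listing).
# Assumption: inverse-time decay, lr = init_lr / (1 + decay_rate * epoch).
def lr_decay(optimizer, epoch, decay_rate, init_lr):
    lr = init_lr / (1 + decay_rate * epoch)
    print(" Learning rate is set as: %s" % lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer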
Beispiel #9
0
def train(data, model_file):
    print "Training model..."

    model = SeqModel(data)
    wordseq = WordSequence(data, False, True, data.use_char)
    if opt.self_adv == 'grad':
        wordseq_adv = WordSequence(data, False, True, data.use_char)
    elif opt.self_adv == 'label':
        wordseq_adv = WordSequence(data, False, True, data.use_char)
        model_adv = SeqModel(data)
    else:
        wordseq_adv = None

    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        if opt.self_adv == 'grad':
            iter_parameter = itertools.chain(*map(list, [wordseq.parameters(), wordseq_adv.parameters(), model.parameters()]))
            optimizer = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)
        elif opt.self_adv == 'label':
            iter_parameter = itertools.chain(*map(list, [wordseq.parameters(), model.parameters()]))
            optimizer = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)
            iter_parameter = itertools.chain(*map(list, [wordseq_adv.parameters(), model_adv.parameters()]))
            optimizer_adv = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)

        else:
            iter_parameter = itertools.chain(*map(list, [wordseq.parameters(), model.parameters()]))
            optimizer = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)

    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(0)
    best_dev = -10

    if data.tune_wordemb == False:
        my_utils.freeze_net(wordseq.wordrep.word_embedding)
        if opt.self_adv != 'no':
            my_utils.freeze_net(wordseq_adv.wordrep.word_embedding)


    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        wordseq.train()
        wordseq.zero_grad()
        if opt.self_adv == 'grad':
            wordseq_adv.train()
            wordseq_adv.zero_grad()
        elif opt.self_adv == 'label':
            wordseq_adv.train()
            wordseq_adv.zero_grad()
            model_adv.train()
            model_adv.zero_grad()
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask,\
                batch_permute_label = batchify_with_label(instance, data.HP_gpu)
            instance_count += 1

            if opt.self_adv == 'grad':
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,batch_charrecover, None, None)
                hidden_adv = wordseq_adv.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, None, None)
                loss, tag_seq = model.neg_log_likelihood_loss(hidden, hidden_adv, batch_label, mask)
                loss.backward()
                my_utils.reverse_grad(wordseq_adv)
                optimizer.step()
                wordseq.zero_grad()
                wordseq_adv.zero_grad()
                model.zero_grad()

            elif opt.self_adv == 'label' :
                wordseq.unfreeze_net()
                wordseq_adv.freeze_net()
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,batch_charrecover, None, None)
                hidden_adv = wordseq_adv.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, None, None)
                loss, tag_seq = model.neg_log_likelihood_loss(hidden, hidden_adv, batch_label, mask)
                loss.backward()
                optimizer.step()
                wordseq.zero_grad()
                wordseq_adv.zero_grad()
                model.zero_grad()

                wordseq.freeze_net()
                wordseq_adv.unfreeze_net()
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,batch_charrecover, None, None)
                hidden_adv = wordseq_adv.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, None, None)
                loss_adv, _ = model_adv.neg_log_likelihood_loss(hidden, hidden_adv, batch_permute_label, mask)
                loss_adv.backward()
                optimizer_adv.step()
                wordseq.zero_grad()
                wordseq_adv.zero_grad()
                model_adv.zero_grad()

            else:
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, None, None)
                hidden_adv = None
                loss, tag_seq = model.neg_log_likelihood_loss(hidden, hidden_adv, batch_label, mask)
                loss.backward()
                optimizer.step()
                wordseq.zero_grad()
                model.zero_grad()


            # right, whole = predict_check(tag_seq, batch_label, mask)
            # right_token += right
            # whole_token += whole
            sample_loss += loss.data.item()
            total_loss += loss.data.item()
            if end % 500 == 0:
                # temp_time = time.time()
                # temp_cost = temp_time - temp_start
                # temp_start = temp_time
                # print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (
                # end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token))
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                    exit(0)
                sys.stdout.flush()
                sample_loss = 0

        # temp_time = time.time()
        # temp_cost = temp_time - temp_start
        # print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (
        # end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token))

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s" % (
        idx, epoch_cost, train_num / epoch_cost, total_loss))
        print "totalloss:", total_loss
        if total_loss > 1e8 or str(total_loss) == "nan":
            print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            exit(0)
        # continue
        speed, acc, p, r, f, _, _ = evaluate(data, wordseq, model, "test")

        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (
            dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print "Exceed previous best f score:", best_dev
            else:
                print "Exceed previous best acc score:", best_dev

            torch.save(wordseq.state_dict(), os.path.join(model_file, 'wordseq.pkl'))
            if opt.self_adv == 'grad':
                torch.save(wordseq_adv.state_dict(), os.path.join(model_file, 'wordseq_adv.pkl'))
            elif opt.self_adv == 'label':
                torch.save(wordseq_adv.state_dict(), os.path.join(model_file, 'wordseq_adv.pkl'))
                torch.save(model_adv.state_dict(), os.path.join(model_file, 'model_adv.pkl'))
            model_name = os.path.join(model_file, 'model.pkl')
            torch.save(model.state_dict(), model_name)
            best_dev = current_score

        gc.collect()
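
Note: the 'grad' branch above relies on my_utils.reverse_grad(wordseq_adv) to flip the adversarial encoder's gradients after the backward pass, but that helper is not part of the listing. A minimal sketch of one plausible implementation, assuming it simply negates the accumulated parameter gradients in place:

# Hypothetical sketch of a gradient-reversal helper compatible with the
# my_utils.reverse_grad(...) call above; the original implementation may differ.
def reverse_grad(module):
    # Negate the gradient stored on every parameter of the module, so the
    # following optimizer.step() moves these parameters in the opposite direction.
    for param in module.parameters():
        if param.grad is not None:
            param.grad.data.neg_()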
Beispiel #10
0
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir +".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s"%(data.optimizer))
        exit(0)
    best_dev = -10
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %(idx,data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))       
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))
        # continue
        speed, acc, p, r, f, _,_ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir +'.'+ str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score 
        # ## decode test
        speed, acc, p, r, f, _,_ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
        gc.collect() 
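
Note: several loops in this listing report token accuracy via predict_check(tag_seq, batch_label, mask), which is also not shown. A minimal sketch, assuming it counts matching tags inside the mask (similar to the NCRF++ helper of the same name):

import numpy as np

# Hypothetical sketch of predict_check: return the number of correctly
# predicted tokens (right) and the number of unmasked tokens (whole).
def predict_check(pred_variable, gold_variable, mask_variable):
    pred = pred_variable.cpu().data.numpy()
    gold = gold_variable.cpu().data.numpy()
    mask = mask_variable.cpu().data.numpy()
    right_token = np.sum((pred == gold) * mask)
    whole_token = mask.sum()
    return right_token, whole_token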
Beispiel #11
0
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + "/data.dset"
    if data.save_model:
        data.save(save_data_name)

    batch_size = data.HP_batch_size
    train_num = len(data.train_Ids)
    total_batch = train_num // batch_size + 1

    model = SeqModel(data)
    pytorch_total_params = sum(p.numel() for p in model.parameters())

    print(model)
    print("pytorch total params: %d" % pytorch_total_params)

    ## model 1 optimizer
    lr_detail1 = [
        {
            "params": filter(lambda p: p.requires_grad,
                             model.mcmodel.parameters()),
            "lr": data.HP_lr
        },
    ]
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(lr_detail1,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(lr_detail1, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(lr_detail1, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(lr_detail1, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(lr_detail1, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)

    ## model 2 optimizer
    optimizer2 = AdamW(model.get_m2_params(),
                       lr=data.HP_lr2,
                       weight_decay=data.HP_l2)
    t_total = total_batch * data.HP_iteration
    warmup_step = int(data.warmup_step * t_total)
    scheduler2 = WarmupLinearSchedule(optimizer2, warmup_step, t_total)

    best_dev = -10
    best_test = -10
    max_test = -10
    max_test_epoch = -1
    max_dev_epoch = -1

    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        print("\n ###### Epoch: %s/%s ######" %
              (idx, data.HP_iteration))  # print (self.train_Ids)
        if data.optimizer.lower() == "sgd":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)

        sample_loss = 0
        total_loss = 0
        random.shuffle(data.train_Ids)

        model.train()
        model.zero_grad()
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]

            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
                instance, data.HP_gpu, True)
            loss, tag_seqs = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)

            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print(
                        "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                    )
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0

            loss.backward()
            clip_grad_norm_(model.parameters(), data.clip_grad)

            optimizer.step()
            optimizer2.step()
            scheduler2.step()
            model.zero_grad()

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2f s, speed: %.2f doc/s,  total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))

        if total_loss > 1e8 or str(total_loss) == "nan":
            print(
                "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            )
            exit(1)

        # dev
        dev_score, _ = evaluate(data, model, "dev")

        # test
        test_score, _ = evaluate(data, model, "test")

        if max_test < test_score:
            max_test_epoch = idx
        max_test = max(test_score, max_test)
        if dev_score > best_dev:
            print("Exceed previous best dev score")
            best_test = test_score
            best_dev = dev_score
            max_dev_epoch = idx
            if data.save_model:
                model_name = data.model_dir + "/best_model.ckpt"
                print("Save current best model in file:", model_name)
                torch.save(model.state_dict(), model_name)

        print(
            "Score summary: max dev (%d): %.4f, test: %.4f; max test (%d): %.4f"
            % (max_dev_epoch, best_dev, best_test, max_test_epoch, max_test))

        gc.collect()
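
Note: the example above drives the second optimizer with WarmupLinearSchedule from the older pytorch-transformers package. If that class is unavailable, an equivalent schedule (linear warmup to the base LR, then linear decay to zero) can be sketched with torch's LambdaLR; warmup_step and t_total are the values computed above. This is an assumed drop-in, not the original class:

from torch.optim.lr_scheduler import LambdaLR

# Sketch of a warmup-then-linear-decay schedule: ramp the LR factor from 0 to 1
# over warmup_step steps, then decay it linearly to 0 at t_total steps.
def warmup_linear_schedule(optimizer, warmup_step, t_total):
    def lr_lambda(step):
        if step < warmup_step:
            return float(step) / float(max(1, warmup_step))
        return max(0.0, float(t_total - step) / float(max(1, t_total - warmup_step)))
    return LambdaLR(optimizer, lr_lambda)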
Beispiel #12
0
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    # save the data settings
    data.save(save_data_name)
    model = SeqModel(data)
    # check to load pretrained model
    if data.use_crf:
        pretrain_model_path = os.path.join('model_snapshot', 'lan_crf.model')
    else:
        pretrain_model_path = os.path.join('model_snapshot', 'lan.model')
    if data.use_pre_trained_model and os.path.exists(pretrain_model_path):
        model.load_state_dict(torch.load(pretrain_model_path))
        print("load pretrained model success:%s" % pretrain_model_path)
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("--------pytorch total params--------")
    print(pytorch_total_params)
    optimizer = None
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=data.HP_lr,
                              momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_dev = -10
    best_test = -10
    no_imprv_epoch = 0
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))  # print (self.train_Ids)
        # every 5 epoch decay learning rate
        if idx % 5 == 0:
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        total_loss = 0
        ## set model in train mode
        model.train()
        model.zero_grad()
        start = 0
        end = start + data.HP_batch_size
        train_epochs = []
        while end <= len(data.train_Ids):
            train_epochs.append((start, end))
            start = end
            end = end + data.HP_batch_size
        if end > len(data.train_Ids) > start:
            train_epochs.append((start, len(data.train_Ids)))
        for sample_id, (start, end) in enumerate(train_epochs):
            instance = data.train_Ids[start: end]
            sample_loss = 0
            batch_word, batch_word_len, _, batch_word_recover, batch_label, mask, input_label_seq_tensor = batchify_with_label(
                instance, data.HP_gpu, data)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_word_len, batch_label, mask, input_label_seq_tensor)
            sample_loss += loss.item()
            total_loss += loss.item()
            print("Epoch:%s,no_imprv_epoch:%s,Instance: %s" % (
                idx, no_imprv_epoch, sample_id))
            right, whole = predict_check(tag_seq, batch_label, mask, data.use_crf)
            print("               loss: %.4f, acc: %s/%s=%.4f" % (
                loss.item(), right, whole, (right + 0.) / whole * 100))

            if sample_loss > 1e8 or str(sample_loss) == "nan":
                print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                exit(1)
            sys.stdout.flush()
            loss.backward()
            if data.whether_clip_grad:
                nn.utils.clip_grad_norm_(model.parameters(), data.clip_grad)
            optimizer.step()
            model.zero_grad()
            # break
        epoch_finish = time.time()
        if total_loss > 1e8 or str(total_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        speed, acc, report, f_value, \
        ner_acc, ner_p, ner_r, ner_f = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f_value
            # current_score = sent_f1
            print("Dev: time: %.2fs, speed: %.2fst/s;\n"
                  "acc: %.4f, f_value: %.4f\n"
                  "ner_acc: %.4f, ner_p: %.4f, ner_r: %.4f, ner_f: %.4f\n"
                  "current f1:%.4f" % (
                      dev_cost, speed, acc, f_value,
                      ner_acc, ner_p, ner_r, ner_f, current_score
                  ))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (
                dev_cost, speed, acc))

        # ## decode test
        speed, acc, report, f_value, \
        ner_acc, ner_p, ner_r, ner_f = evaluate(data, model, "test")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            print("Test: time: %.2fs, speed: %.2fst/s;\n"
                  "acc: %.4f, f_value: %.4f\n"
                  "ner_acc: %.4f, ner_p: %.4f, ner_r: %.4f, ner_f: %.4f\n"
                  "current f1:%.4f" % (
                      dev_cost, speed, acc, f_value,
                      ner_acc, ner_p, ner_r, ner_f, current_score
                  ))
        else:
            print("Test: time: %.2fs speed: %.2fst/s; acc: %.4f" % (
                dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                best_test = f_value
                # best_test = sent_f1
                print("Exceed previous best avg f score:", best_dev)
            else:
                best_test = acc
                print("Exceed previous best acc score:", best_dev)
            if data.use_crf:
                result_file = "result_crf.txt"
                model_name = data.model_dir + "_crf.model"
            else:
                result_file = "result.txt"
                model_name = data.model_dir + ".model"
            with open(result_file, 'w', encoding='utf-8') as w:
                w.write(
                    "Save current best model in file:%s, iteration:%s/%s, best_test_f_score:%.5f\n"
                    "ner:\n"
                    "   precision:%.5f, recall:%.5f, f1_score:%.5f\n"
                    "%s\n\n" % (
                        model_name, idx, data.HP_iteration, best_test,
                        ner_p, ner_r, ner_f,
                        report))
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
            no_imprv_epoch = 0
        else:
            # early stop
            no_imprv_epoch += 1
            if no_imprv_epoch >= 10:
                print("early stop")
                print("Current best f score in dev", best_dev)
                print("Current best f score in test", best_test)
                break

        if data.seg:
            print("Current best f score in dev", best_dev)
            print("Current best f score in test", best_test)
        else:
            print("Current best acc score in dev", best_dev)
            print("Current best acc score in test", best_test)
        gc.collect()
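
Note: each example persists two artifacts per run, the data settings (data.save(...) into a .dset file) and the model weights (torch.save(model.state_dict(), ...)). A minimal reload sketch for decoding, assuming the Data class exposes a load counterpart to save and that SeqModel can be rebuilt from the restored settings; the exact Data API depends on the repository:

import torch

# Hypothetical reload sketch; names mirror the training code above.
def load_for_decoding(dset_path, model_path, data, gpu=False):
    data.load(dset_path)              # restore alphabets / hyperparameters (assumed API)
    data.HP_gpu = gpu
    model = SeqModel(data)            # rebuild the network with the same settings
    model.load_state_dict(torch.load(model_path, map_location=None if gpu else 'cpu'))
    model.eval()
    return model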