else:
                losses = loss_function(pos, neg)
            ent_embeddings = model.ent_embeddings(
                torch.cat([pos_h_batch, pos_t_batch, neg_h_batch,
                           neg_t_batch]))
            rel_embeddings = model.rel_embeddings(
                torch.cat([pos_r_batch, neg_r_batch]))
            losses = losses + loss.normLoss(ent_embeddings) + loss.normLoss(
                rel_embeddings) + loss.normLoss(pos_h_e) + loss.normLoss(
                    pos_t_e) + loss.normLoss(neg_h_e) + loss.normLoss(neg_t_e)

            losses.backward()
            optimizer.step()
            total_loss += losses.data

        agent.append(trainCurve, epoch, total_loss[0])

        if epoch % 10 == 0:
            now_time = time.time()
            print(now_time - start_time)
            print("Train total loss: %d %f" % (epoch, total_loss[0]))

        if epoch % 10 == 0:
            if config.filter:
                pos_h_batch, pos_t_batch, pos_r_batch, neg_h_batch, neg_t_batch, neg_r_batch = getBatch_filter_random_v2(
                    validList, config.batch_size, config.entity_total,
                    tripleDict, tail_per_head, head_per_tail)
            else:
                pos_h_batch, pos_t_batch, pos_r_batch, neg_h_batch, neg_t_batch, neg_r_batch = getBatch_raw_random_v2(
                    validList, config.batch_size, config.entity_total,
                    tail_per_head, head_per_tail)
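
All of the snippets on this page share the same hyperboard pattern: create an Agent, register a curve under a dict of hyperparameters, then append (step, value) points as training progresses. A minimal sketch of that pattern, assuming a hyperboard server is reachable at the given address and port:

import math
from hyperboard import Agent

# Connect to a locally running hyperboard server.
agent = Agent(username='', password='', address='127.0.0.1', port=5000)

# A curve is identified by its hyperparameter dict plus a metric name.
hyperparameters = {'learning rate': 1e-3, 'batch size': 32, 'criteria': 'train loss'}
train_loss = agent.register(hyperparameters, 'loss', overwrite=True)

for step in range(100):
    value = math.exp(-step / 20.0)  # stand-in for a real training loss
    agent.append(train_loss, step, value)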
Example #2
def train(model_path, train_batch_size, validate_batch_size, validate_batch_num, resize, train_gpu, validate_gpu=-1):

    # train_gpu = 0
    # validate_gpu = 1
    # model_path = '../amazon2/alexnet'
    # train_batch_size = 256
    # validate_batch_size = 128
    # validate_batch_num = 8

    # parameters

    k = 5
    epochs = 1
    lr = 1e-4
    weight_decay = 0
    momentum = 0.9


    criteria2metric = {
        'train loss': 'loss',
        'valid loss': 'loss'
    }
    hyperparameters_train = {
        'name': 'train',
        'learning rate': lr,
        'batch size': train_batch_size,
        'optimizer': 'Adam',
        'momentum': 0,
        'net': model_path.split('/')[-1],
        'epoch': 'No.1',
    }
    hyperparameters_validate = {
        'name': 'validate',
        'learning rate': lr,
        'batch size': validate_batch_size,
        'optimizer': 'Adam',
        'momentum': 0,
        'net': model_path.split('/')[-1],
        'epoch': 'No.1',
    }


    agent = Agent(username='******', password='******')
    train_loss_show = agent.register(hyperparameters_train, criteria2metric['train loss'])
    validate_loss_show = agent.register(hyperparameters_validate, criteria2metric['valid loss'])
    global_step = 0

    with open('kdf.pkl', 'rb') as f:
        kfold = pickle.load(f, encoding='latin1')

    loss_info = []    # entry i records fold i's minimum (train_loss, validate_loss)
    for fold in range(k):
        train_index = kfold[fold][0]
        validate_index = kfold[fold][1]


        model = AM_alex()
        if model.getname() != model_path.split('/')[-1]:
            print('Wrong Model!')
            return
        model.cuda(device_id=train_gpu)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        dset_train = AmazonDateset_train(train_index, IMG_TRAIN_PATH, IMG_EXT, LABEL_PATH, resize=resize)
        train_loader = DataLoader(dset_train, batch_size=train_batch_size, shuffle=True, num_workers=6)
        min_loss = [0.9, 0.9]
        for epoch in range(epochs):
            print('--------------Epoch %d: train-----------' % epoch)
            model.train()
            for step, (data, target) in enumerate(train_loader):
                data, target = Variable(data), Variable(target)
                data = data.cuda(device_id=train_gpu)
                target = target.cuda(device_id=train_gpu)

                optimizer.zero_grad()
                output = model(data)
                # print(output.size())
                loss = F.binary_cross_entropy(output, target)
                loss.backward()
                optimizer.step()
                agent.append(train_loss_show, global_step, loss.data[0])
                global_step += 1
                if step % 10 == 0:
                    model.eval()
                    if validate_gpu != -1:
                        model.cuda(validate_gpu)
                    dset_validate = AmazonDateset_validate(validate_index, IMG_TRAIN_PATH, IMG_EXT, LABEL_PATH, random_transform=True, resize=resize)
                    validate_loader = DataLoader(dset_validate, batch_size=validate_batch_size, shuffle=True, num_workers=6)
                    total_vloss = 0
                    for vstep, (vdata, vtarget) in enumerate(validate_loader):
                        vdata, vtarget = Variable(vdata), Variable(vtarget)
                        if validate_gpu != -1:
                            vdata = vdata.cuda(validate_gpu)
                            vtarget = vtarget.cuda(validate_gpu)
                        else:
                            vdata = vdata.cuda(train_gpu)
                            vtarget = vtarget.cuda(train_gpu)

                        voutput = model(vdata)
                        vloss = F.binary_cross_entropy(voutput, vtarget)
                        total_vloss += vloss.data[0]
                        if vstep == (validate_batch_num-1):
                            break
                    vloss = total_vloss / validate_batch_num
                    model.train()
                    if validate_gpu != -1:
                        model.cuda(train_gpu)

                    agent.append(validate_loss_show, global_step, vloss)

                    print('{} Fold{} Epoch{} Step{}: [{}/{} ({:.0f}%)]\tTrain Loss: {:.6f}\tValidate Loss: {:.6f}'.format(
                        model_path.split('/')[-1], fold, epoch, global_step,
                        step * train_batch_size, len(train_loader.dataset),
                        100. * step / len(train_loader), loss.data[0], vloss))
                    if vloss < min_loss[1]:
                        min_loss[1] = vloss
                        min_loss[0] = loss.data[0]
        model_save = copy.deepcopy(model)
        torch.save(model_save.cpu(), os.path.join(model_path, 'fold%d.mod' % fold))
        loss_info.append(min_loss)


    print('-----------------------------------------')
    print(model_path.split('/')[-1] + ':')
    for i, l in enumerate(loss_info):
        print('Fold%d: Train loss:%f\tValidate loss:%f' % (i, l[0], l[1]))

    with open(os.path.join(model_path, 'train_loss_info.pkl'), 'wb') as f:
        pickle.dump(loss_info, f)
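
Example #2 loads its fold indices from kdf.pkl. A sketch of how such a file could be produced with scikit-learn's KFold; the dataset size is hypothetical, and all the function above requires is that kfold[fold] be a (train_index, validate_index) pair:

import pickle
import numpy as np
from sklearn.model_selection import KFold

n_samples = 40000  # hypothetical dataset size
kfold = list(KFold(n_splits=5, shuffle=True, random_state=0).split(np.arange(n_samples)))

# kfold[fold][0] is the train index array, kfold[fold][1] the validate index array
with open('kdf.pkl', 'wb') as f:
    pickle.dump(kfold, f)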
Example #3
class AIFL_Digits(object):
    def __init__(self, E, D, M, data_A, data_B, exp, cuda=True, port=5000):
        self.E = E
        self.D = D
        self.M = M
        self.data_A = data_A
        self.data_B = data_B
        self.exp = exp
        self.cuda = cuda
        self.port = port

        assert self.data_A.channel == self.data_B.channel
        assert self.data_A.size == self.data_B.size
        assert self.data_A.n_class == self.data_B.n_class
        self.channel = self.data_A.channel
        self.size = self.data_A.size

        self.register_curves()

        if self.cuda:
            self.E.cuda()
            self.D.cuda()
            self.M.cuda()

    def register_curves(self):
        self.agent = Agent(username='',
                           password='',
                           address='127.0.0.1',
                           port=self.port)
        loss_D_exp = {self.exp: "D loss: D predicts samples' attributes"}
        loss_E_exp = {self.exp: 'E loss: E encodes samples'}
        loss_M_exp = {self.exp: 'M loss: M classifies samples'}
        acc_A_exp = {self.exp: 'Categorization accuracy on data A'}
        acc_B_exp = {self.exp: 'Categorization accuracy on data B'}
        pre_loss_E_exp = {self.exp: 'Pretrain E loss: E encodes samples'}
        pre_loss_M_exp = {self.exp: 'Pretrain M loss: M classifies samples'}
        pre_acc_A_exp = {
            self.exp: 'Pretrain categorization accuracy on data A'
        }
        pre_acc_B_exp = {
            self.exp: 'Pretrain categorization accuracy on data B'
        }
        lr_exp = {self.exp: 'Learning rate at training phase (log scale)'}
        pre_lr_exp = {
            self.exp: 'Learning rate at pretraining phase (log scale)'
        }
        self.d_loss = self.agent.register(loss_D_exp, 'D loss', overwrite=True)
        self.e_loss = self.agent.register(loss_E_exp, 'E loss', overwrite=True)
        self.m_loss = self.agent.register(loss_M_exp, 'M loss', overwrite=True)
        self.acc_A = self.agent.register(acc_A_exp, 'acc', overwrite=True)
        self.acc_B = self.agent.register(acc_B_exp, 'acc', overwrite=True)
        self.pre_e_loss = self.agent.register(pre_loss_E_exp,
                                              'E loss',
                                              overwrite=True)
        self.pre_m_loss = self.agent.register(pre_loss_M_exp,
                                              'M loss',
                                              overwrite=True)
        self.pre_acc_A = self.agent.register(pre_acc_A_exp,
                                             'acc',
                                             overwrite=True)
        self.pre_acc_B = self.agent.register(pre_acc_B_exp,
                                             'acc',
                                             overwrite=True)
        self.tlr = self.agent.register(lr_exp, 'lr', overwrite=True)
        self.plr = self.agent.register(pre_lr_exp, 'lr', overwrite=True)

    def train(self, ckpt_dir, test_A, test_B, init_lr_E=1e-3, init_lr_D=1e-3, init_lr_M=1e-3, \
       batch_size=64, training_epochs=50000):
        x = Variable(
            torch.FloatTensor(batch_size, self.channel, self.size, self.size))
        y = Variable(torch.LongTensor(batch_size))
        s = Variable(torch.FloatTensor(batch_size))

        att_pred_criterion = nn.BCELoss()
        cat_criterion = nn.CrossEntropyLoss()

        if self.cuda:
            x = x.cuda()
            y = y.cuda()
            s = s.cuda()
            att_pred_criterion = att_pred_criterion.cuda()
            cat_criterion = cat_criterion.cuda()

        optimizer_D = optim.Adam(self.D.parameters(),
                                 lr=init_lr_D,
                                 betas=(0.5, 0.999))
        optimizer_E = optim.Adam(self.E.parameters(),
                                 lr=init_lr_E,
                                 betas=(0.5, 0.999))
        optimizer_M = optim.Adam(self.M.parameters(),
                                 lr=init_lr_M,
                                 betas=(0.5, 0.999))

        # scheduler_D = lr_scheduler.StepLR(optimizer_D, step_size=1000, gamma=0.9)
        # scheduler_E = lr_scheduler.StepLR(optimizer_E, step_size=1000, gamma=0.9)
        # scheduler_M = lr_scheduler.StepLR(optimizer_M, step_size=1000, gamma=0.9)
        scheduler_D = lr_scheduler.ReduceLROnPlateau(optimizer_D,
                                                     mode='max',
                                                     min_lr=1e-7,
                                                     patience=5,
                                                     factor=0.65,
                                                     verbose=True)
        scheduler_E = lr_scheduler.ReduceLROnPlateau(optimizer_E,
                                                     mode='max',
                                                     min_lr=1e-7,
                                                     patience=5,
                                                     factor=0.65,
                                                     verbose=True)
        scheduler_M = lr_scheduler.ReduceLROnPlateau(optimizer_M,
                                                     mode='max',
                                                     min_lr=1e-7,
                                                     patience=5,
                                                     factor=0.65,
                                                     verbose=True)

        for epoch in range(training_epochs):
            # scheduler_D.step()
            # scheduler_E.step()
            # scheduler_M.step()

            begin_time = time.time()

            # fetch data
            batch_x_A, batch_y_A = self.data_A(batch_size // 2)
            batch_x_B, batch_y_B = self.data_B(batch_size - batch_x_A.shape[0])
            x.data.copy_(
                torch.from_numpy(np.concatenate([batch_x_A, batch_x_B])))
            y.data.copy_(
                torch.from_numpy(np.concatenate([batch_y_A, batch_y_B])))
            s.data.copy_(
                torch.from_numpy(
                    np.array([0] * batch_x_A.shape[0] +
                             [1] * batch_x_B.shape[0])))

            # update D
            self.D.zero_grad()
            h = self.E(x)
            pred_s = self.D(h.detach())
            D_loss = att_pred_criterion(pred_s, s)
            D_loss.backward()
            optimizer_D.step()

            # update E and M
            self.E.zero_grad()
            self.M.zero_grad()
            pred_s = self.D(h)
            pred_y = self.M(h)
            M_loss = cat_criterion(pred_y, y)
            E_loss = -att_pred_criterion(pred_s, s) + M_loss
            E_loss.backward()
            optimizer_E.step()
            optimizer_M.step()

            # register data on curves
            self.agent.append(self.d_loss, epoch, float(D_loss.data[0]))
            self.agent.append(self.e_loss, epoch, float(E_loss.data[0]))
            self.agent.append(self.m_loss, epoch, float(M_loss.data[0]))

            elapsed_time = time.time() - begin_time
            print('Epoch[%06d], D_loss: %.4f, E_loss: %.4f, M_loss: %.4f, elapsed_time: %.4ssecs.' % \
              (epoch+1, D_loss.data[0], E_loss.data[0], M_loss.data[0], elapsed_time))

            if epoch % 500 == 0:
                acc = {'A': 0, 'B': 0}
                val_data = {'A': test_A, 'B': test_B}
                for domain in val_data:
                    while val_data[domain].has_next():
                        batch_x, batch_y = val_data[domain](batch_size)
                        x.data.copy_(torch.from_numpy(batch_x))
                        n = int(np.sum(batch_y != -1))
                        acc[domain] += np.sum(
                            np.argmax(self.M(self.E(x)).cpu().data.numpy(), 1)
                            [:n] == batch_y[:n])
                    acc[domain] /= float(val_data[domain].N)

                    val_data[domain].reset()  # reset so the next evaluation starts from index 0

                print('Epoch[%06d], acc_A: %.4f, acc_B: %.4f' %
                      (epoch + 1, acc['A'], acc['B']))

                self.agent.append(self.acc_A, epoch, acc['A'])
                self.agent.append(self.acc_B, epoch, acc['B'])

                scheduler_D.step((acc['A'] + acc['B']) / 2)
                scheduler_E.step((acc['A'] + acc['B']) / 2)
                scheduler_M.step((acc['A'] + acc['B']) / 2)

                self.agent.append(
                    self.tlr, epoch,
                    float(np.log(optimizer_E.param_groups[0]['lr'])))

            if epoch % 10000 == 9999 or epoch == training_epochs - 1:
                torch.save(
                    self.E.state_dict(),
                    os.path.join(ckpt_dir,
                                 'E_epoch-%s.pth' % str(epoch + 1).zfill(6)))
                torch.save(
                    self.M.state_dict(),
                    os.path.join(ckpt_dir,
                                 'M_epoch-%s.pth' % str(epoch + 1).zfill(6)))
                torch.save(
                    self.D.state_dict(),
                    os.path.join(ckpt_dir,
                                 'D_epoch-%s.pth' % str(epoch + 1).zfill(6)))

    def pretrain(self,
                 ckpt_dir,
                 test_A,
                 test_B,
                 init_lr_E=1e-3,
                 init_lr_M=1e-3,
                 batch_size=64,
                 pretrain_epochs=5000):
        x = Variable(
            torch.FloatTensor(batch_size, self.channel, self.size, self.size))
        y = Variable(torch.LongTensor(batch_size))

        cat_criterion = nn.CrossEntropyLoss()

        if self.cuda:
            x = x.cuda()
            y = y.cuda()
            cat_criterion = cat_criterion.cuda()

        optimizer_E = optim.Adam(self.E.parameters(),
                                 lr=init_lr_E,
                                 betas=(0.5, 0.999))
        optimizer_M = optim.Adam(self.M.parameters(),
                                 lr=init_lr_M,
                                 betas=(0.5, 0.999))

        # scheduler_E = lr_scheduler.StepLR(optimizer_E, step_size=1000, gamma=0.3)
        # scheduler_M = lr_scheduler.StepLR(optimizer_M, step_size=1000, gamma=0.3)
        scheduler_E = lr_scheduler.ReduceLROnPlateau(optimizer_E,
                                                     mode='max',
                                                     min_lr=1e-7,
                                                     patience=5,
                                                     factor=0.65,
                                                     verbose=True)
        scheduler_M = lr_scheduler.ReduceLROnPlateau(optimizer_M,
                                                     mode='max',
                                                     min_lr=1e-7,
                                                     patience=5,
                                                     factor=0.65,
                                                     verbose=True)

        for epoch in range(pretrain_epochs):
            # scheduler_E.step()
            # scheduler_M.step()

            begin_time = time.time()

            # fetch data
            batch_x_A, batch_y_A = self.data_A(batch_size // 2)
            batch_x_B, batch_y_B = self.data_B(batch_size - batch_x_A.shape[0])
            x.data.copy_(
                torch.from_numpy(np.concatenate([batch_x_A, batch_x_B])))
            y.data.copy_(
                torch.from_numpy(np.concatenate([batch_y_A, batch_y_B])))

            # update E and M
            self.E.zero_grad()
            self.M.zero_grad()

            h = self.E(x)
            pred_y = self.M(h)
            M_loss = cat_criterion(pred_y, y)
            E_loss = M_loss
            E_loss.backward()
            optimizer_E.step()
            optimizer_M.step()

            # register data on curves
            self.agent.append(self.pre_e_loss, epoch, float(E_loss.data[0]))
            self.agent.append(self.pre_m_loss, epoch, float(M_loss.data[0]))

            elapsed_time = time.time() - begin_time
            print('Pretrain epoch[%06d], E_loss(= M_loss): %.4f, elapsed_time: %.4ssecs.' % \
              (epoch+1, E_loss.data[0], elapsed_time))

            if epoch % 500 == 0:
                acc = {'A': 0, 'B': 0}
                val_data = {'A': test_A, 'B': test_B}
                for domain in val_data:
                    while val_data[domain].has_next():
                        batch_x, batch_y = val_data[domain](batch_size)
                        x.data.copy_(torch.from_numpy(batch_x))
                        n = int(np.sum(batch_y != -1))
                        acc[domain] += np.sum(
                            np.argmax(self.M(self.E(x)).cpu().data.numpy(), 1)
                            [:n] == batch_y[:n])
                    acc[domain] /= float(val_data[domain].N)

                    val_data[domain].reset()  # reset so the next evaluation starts from index 0

                print('Pretrain epoch[%06d], acc_A: %.4f, acc_B: %.4f' %
                      (epoch + 1, acc['A'], acc['B']))

                self.agent.append(self.pre_acc_A, epoch, acc['A'])
                self.agent.append(self.pre_acc_B, epoch, acc['B'])

                scheduler_E.step((acc['A'] + acc['B']) / 2)
                scheduler_M.step((acc['A'] + acc['B']) / 2)

                self.agent.append(
                    self.plr, epoch,
                    float(np.log(optimizer_E.param_groups[0]['lr'])))

            if epoch % 10000 == 9999 or epoch == pretrain_epochs - 1:
                torch.save(
                    self.E.state_dict(),
                    os.path.join(
                        ckpt_dir,
                        'pretrain_E_epoch-%s.pth' % str(epoch + 1).zfill(6)))
                torch.save(
                    self.M.state_dict(),
                    os.path.join(
                        ckpt_dir,
                        'pretrain_M_epoch-%s.pth' % str(epoch + 1).zfill(6)))
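
The class above steps its ReduceLROnPlateau schedulers on the mean validation accuracy, which is why mode='max' is used: the learning rate decays when the monitored accuracy stops improving. A self-contained sketch of that behavior (the model and the constant metric are stand-ins):

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

model = nn.Linear(10, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.5, 0.999))
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', min_lr=1e-7,
                                           patience=5, factor=0.65)

for epoch in range(30):
    val_acc = 0.9  # stand-in for (acc['A'] + acc['B']) / 2; a flat metric triggers decay
    scheduler.step(val_acc)
    print(epoch, optimizer.param_groups[0]['lr'])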
Example #4
        if done:
            qs = []
            q = np.zeros(config.MYSELF_NUM)
            total_reward = 0
            for r in rs[::-1]:
                q = r + config.GAMMA * q
                total_reward += r.sum() / config.MYSELF_NUM
                qs.append(q)
            qs = np.asarray(qs)
            q_mean = np.mean(qs)
            ddpg_agent.train_record.append((episode, total_reward))
            print('memory: {}/{}'.format(
                ddpg_agent.commander_memory.current_index,
                ddpg_agent.commander_memory.max_len))
            print('q_mean: ', q_mean)
            print('train_reward', total_reward)
            HBagent.append(train_r, episode, total_reward)

            break

        state = next_state

    if episode % config.TEST_ITERVAL == 0:
        print('\ntest (no noise)\n')
        test_reward, _1, _2 = ddpg_agent.test(episode, config.TEST_NUM)
        HBagent.append(test_record, episode, test_reward)

    if episode % config.SAVE_ITERVAL == 0:
        print('\nsave model\n')
        ddpg_agent.save(episode)
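
The done branch above accumulates discounted returns by walking the reward list in reverse: q_t = r_t + GAMMA * q_{t+1}. A standalone sketch of that computation; note that the snippet appends the returns in reverse order without flipping qs back, which is harmless there because only the mean is used:

import numpy as np

def discounted_returns(rs, gamma):
    # Walk rewards back-to-front: q_t = r_t + gamma * q_{t+1}
    q = np.zeros_like(rs[0], dtype=float)
    qs = []
    for r in rs[::-1]:
        q = r + gamma * q
        qs.append(q)
    return np.asarray(qs[::-1])  # flip back to chronological order

rs = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])]
print(discounted_returns(rs, gamma=0.9))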
Example #5
def train(lr, net, epoch, train_loader, valid_loader, transform,
          hyperparameters, batch_size):

    # register hypercurve
    agent = Agent(port=5001)
    hyperparameters['criteria'] = 'train loss'
    train_loss = agent.register(hyperparameters, 'loss')

    hyperparameters['criteria'] = 'valid loss'
    valid_loss = agent.register(hyperparameters, 'loss')

    hyperparameters['criteria'] = 'valid bleu'
    valid_bleu = agent.register(hyperparameters, 'bleu')

    hyperparameters['criteria'] = 'train bleu'
    train_bleu = agent.register(hyperparameters, 'bleu')

    hyperparameters['criteria'] = 'scheduled sampling probability'
    hyper_ssprob = agent.register(hyperparameters, 'probability')

    if torch.cuda.is_available():
        net.cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        net.parameters()),
                                 lr=lr)
    net.train()

    best_score = -1
    global_steps = 0
    best_valid_loss = 10000
    for iepoch in range(epoch):

        new_epoch = False
        batchid = 0
        for (_, data) in enumerate(train_loader, 0):

            entext = data['entext']
            enlen = data['enlen']
            zhlabel = data['zhlabel']
            zhgtruth = data['zhgtruth']
            zhlen = data['zhlen']

            ssprob = max(math.exp(-(global_steps - 100000) / 500000), 0.8)
            print('scheduled sampling prob: ', ssprob)
            logits, predic = net(entext, zhgtruth, enlen, ssprob, True)
            loss = net.get_loss(logits, zhlabel)
            optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm(net.parameters(), 5)
            optimizer.step()

            batchid += 1
            global_steps += 1

            print(global_steps, iepoch, batchid, sum(loss.data.cpu().numpy()))
            agent.append(train_loss, global_steps,
                         sum(loss.data.cpu().numpy()))
            agent.append(hyper_ssprob, global_steps, ssprob)

            if batchid % 50 == 0:
                net.eval()
                logits, predic = net(entext, zhgtruth, enlen, ssprob, True)

                tmppre = [0 for i in range(len(entext))]
                tmplabel = [0 for i in range(len(entext))]
                for i in range(len(entext)):
                    tmppre[i] = transform.clip(predic[i], language='zh')
                    tmplabel[i] = zhlabel[i][:zhlen[i]]

                tmpscore = bleuscore.score(tmppre, tmplabel)
                for i in range(25):
                    ans_ = transform.i2t(tmplabel[i], language='zh')
                    pre_ = transform.i2t(tmppre[i], language='zh')
                    print(ans_)
                    print(pre_)
                    print('-------------------\n')
                agent.append(train_bleu, global_steps, tmpscore)
                del logits, predic

            if batchid % 400 == 0:
                print('\n------------------------\n')
                net.eval()
                all_pre = []
                all_label = []
                all_len = []
                all_loss = 0
                bats = 0
                for (_, data) in enumerate(valid_loader, 0):

                    entext = data['entext']
                    enlen = data['enlen']
                    zhlabel = data['zhlabel']
                    zhgtruth = data['zhgtruth']
                    zhlen = data['zhlen']

                    logits, predic = net(entext, zhgtruth, enlen, 0, False)
                    loss = net.get_loss(logits, zhlabel)

                    all_pre.extend(predic)
                    all_label.extend(zhlabel)
                    all_len.extend(zhlen)
                    all_loss += sum(loss.data.cpu().numpy())

                    del loss, logits, predic
                    bats += 1

                for i in range(len(all_pre)):
                    all_pre[i] = transform.clip(all_pre[i], language='zh')
                    all_label[i] = all_label[i][:all_len[i]]

                score = bleuscore.score(all_pre, all_label)

                for i in range(0, 600, 6):

                    ans_ = transform.i2t(all_label[i], language='zh')
                    pre_ = transform.i2t(all_pre[i], language='zh')
                    print(ans_)
                    print(pre_)
                    print('-------------------\n')

                all_loss /= bats
                print(global_steps, iepoch, batchid, all_loss, score,
                      '\n********************\n')
                agent.append(valid_loss, global_steps, all_loss)
                agent.append(valid_bleu, global_steps, score)

                if best_valid_loss > all_loss or best_score < score:

                    best_valid_loss = all_loss
                    best_score = score
                    torch.save(
                        net.state_dict(), model_dir +
                        "ssprob-{:3f}-loss-{:3f}-steps-{:d}-model.pkl".format(
                            ssprob, all_loss, global_steps))
                del all_label, all_len, all_loss, all_pre
            net.train()
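
The scheduled-sampling schedule in Example #5 deserves a note: because it uses max(), the probability is floored at 0.8 and actually exceeds 1.0 while global_steps < 100000, so a decaying schedule was more likely meant to use min(). A sketch that just prints the schedule, using the same constants:

import math

def ssprob(step):
    # schedule from the snippet above: floored at 0.8, and above 1.0 before step 100000
    return max(math.exp(-(step - 100000) / 500000), 0.8)

for step in [0, 50000, 100000, 200000, 500000, 2000000]:
    print(step, round(ssprob(step), 4))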
Example #6
def train(lr, net, epoch, train_loader, valid_loader, transform,
          hyperparameters, batch_size):

    # register hypercurve
    agent = Agent(port=5000)
    hyperparameters['criteria'] = 'train loss'
    train_loss = agent.register(hyperparameters, 'loss')

    hyperparameters['criteria'] = 'valid loss'
    valid_loss = agent.register(hyperparameters, 'loss')

    hyperparameters['criteria'] = 'valid bleu'
    valid_bleu = agent.register(hyperparameters, 'bleu')

    #hyperparameters['criteria'] = 'train bleu'
    #train_bleu = agent.register(hyperparameters, 'bleu')

    hyperparameters['criteria'] = 'teacher_forcing_ratio'
    hyper_tfr = agent.register(hyperparameters, 'ratio')

    hyperparameters['criteria'] = 'teacher_forcing_loss'
    valid_tf_loss = agent.register(hyperparameters, 'loss')

    if torch.cuda.is_available():
        net.cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        net.parameters()),
                                 lr=lr)
    net.train()

    best_score = -1
    global_steps = 0
    best_valid_loss = 10000
    for iepoch in range(epoch):

        new_epoch = False
        batchid = 0
        for (_, data) in enumerate(train_loader, 0):

            entext = data['entext']
            enlen = data['enlen']
            zhlabel = data['zhlabel']
            zhgtruth = data['zhgtruth']
            zhlen = data['zhlen']
            enstr = data['enstr']
            zhstr = data['zhstr']

            teacher_forcing_ratio = math.exp(-global_steps / 10000000)
            print('teacher_forcing_ratio: ', teacher_forcing_ratio)

            decoder_outputs, ret_dict = net(entext, zhgtruth, enlen, True,
                                            teacher_forcing_ratio)

            loss = net.get_loss(decoder_outputs, zhlabel)
            optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm(net.parameters(), 5)
            optimizer.step()

            batchid += 1
            global_steps += 1

            print(global_steps, iepoch, batchid, max(enlen),
                  sum(loss.data.cpu().numpy()))
            agent.append(train_loss, global_steps,
                         sum(loss.data.cpu().numpy()))
            agent.append(hyper_tfr, global_steps, teacher_forcing_ratio)

            if global_steps % 50 == 0:
                net.eval()
                decoder_outputs, ret_dict = net(entext, zhgtruth, enlen, True,
                                                teacher_forcing_ratio)

                length = ret_dict['length']
                prediction = [0 for i in range(len(length))]
                tmppre = [
                    _.squeeze().cpu().data.tolist()
                    for _ in ret_dict['sequence']
                ]
                tmppre = np.array(tmppre).transpose(1, 0)

                for i in range(len(tmppre)):

                    prediction[i] = tmppre[i][:length[i]]
                    prediction[i] = transform.i2t(prediction[i], language='zh')
                    prediction[i] = re.sub(r'nuk#', '', prediction[i])
                    prediction[i] = re.sub(r'eos#', '', prediction[i])

                tmpscore = bleuscore.score(prediction, zhstr)

                for i in range(5):
                    print(prediction[i])
                    print(zhstr[i])
                    print('-------------------\n')

                del decoder_outputs, ret_dict
                #agent.append(train_bleu, global_steps, tmpscore)
                net.train()

            if global_steps % 200 == 0:
                print('\n------------------------\n')
                net.eval()
                all_pre = []
                all_label = []
                all_loss = 0
                all_en = []
                bats = 0
                teacher_forcing_loss = 0
                for (_, data) in enumerate(valid_loader, 0):

                    entext = data['entext']
                    enlen = data['enlen']
                    zhlabel = data['zhlabel']
                    zhgtruth = data['zhgtruth']
                    zhlen = data['zhlen']
                    enstr = data['enstr']
                    zhstr = data['zhstr']

                    decoder_outputs, ret_dict = net(entext, None, enlen, True,
                                                    0)
                    length = ret_dict['length']
                    prediction = [0 for i in range(len(length))]
                    tmppre = [
                        _.squeeze().cpu().data.tolist()
                        for _ in ret_dict['sequence']
                    ]
                    tmppre = np.array(tmppre).transpose(1, 0)

                    for i in range(len(tmppre)):
                        prediction[i] = tmppre[i][:length[i]]
                        prediction[i] = transform.i2t(prediction[i],
                                                      language='zh')
                        prediction[i] = re.sub(r'nuk#', '', prediction[i])
                        prediction[i] = re.sub(r'eos#', '', prediction[i])

                    loss = net.get_loss(decoder_outputs, zhlabel)

                    all_pre.extend(prediction)
                    all_label.extend(zhstr)
                    all_en.extend(enstr)
                    all_loss += sum(loss.data.cpu().numpy())

                    del loss, decoder_outputs, ret_dict

                    # teacher forcing loss, to judge if overfit
                    decoder_outputs, _ = net(entext, zhgtruth, enlen, True, 1)
                    loss = net.get_loss(decoder_outputs, zhlabel)
                    teacher_forcing_loss += sum(loss.data.cpu().numpy())
                    bats += 1
                score = bleuscore.score(all_pre, all_label)

                for i in range(0, 400):
                    print(all_en[i])
                    print(all_pre[i])
                    print(all_label[i])
                    print('-------------------\n')

                all_loss /= bats
                teacher_forcing_loss /= bats
                print(global_steps, iepoch, batchid, all_loss,
                      teacher_forcing_loss, score, '\n********************\n')
                agent.append(valid_loss, global_steps, all_loss)
                agent.append(valid_bleu, global_steps, score)
                agent.append(valid_tf_loss, global_steps, teacher_forcing_loss)
                if best_valid_loss > all_loss:

                    best_valid_loss = all_loss
                    #bestscore = score
                    _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(
                        teacher_forcing_ratio, all_loss, score, global_steps)
                    torch.save(net.state_dict(), _)

                elif global_steps % 1000 == 0:
                    _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(
                        teacher_forcing_ratio, all_loss, score, global_steps)
                    torch.save(net.state_dict(), _)

                del all_label, all_loss, all_pre
                net.train()
Example #7
                vd = Variable(vd, volatile=True).float()
                vl = Variable(vl, volatile=True).float()
                if config.gpu >= 0:
                    vd = vd.cuda(config.gpu)
                    vl = vl.cuda(config.gpu)
                vp = model(vd)
                vloss += criticer(vp, vl).data[0]
                if vs == config.validate_batch_num - 1:
                    break
            vloss = vloss / config.validate_batch_num
            model.train()
            print('Epoch{} Step{}: [{}/{} ({:.0f}%)]\tValidate Loss: {:.6f}'.
                  format(epoch, step, step * config.batch_size, train_num,
                         100. * step * config.batch_size / train_num, vloss))
            if config.use_hyperboard:
                agent.append(validate_loss_record, global_step, vloss)

            # early stop
            if config.early_stop_num >= 0:
                if vloss > max_loss:
                    max_loss = vloss
                    no_gain = 0
                else:
                    no_gain += 1
                if no_gain >= config.early_stop_num:
                    print('Early Stop!')
                    # save model
                    model_save = copy.deepcopy(model)
                    torch.save(
                        model_save.cpu(),
                        './checkpoint/early_model_{}epoch_{}step.mod'.format(
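
Example #7 (truncated above) implements patience-based early stopping, though its counter resets when the validation loss increases past max_loss, which is the opposite of the usual convention. A conventional patience sketch, for comparison, resets the counter on improvement:

patience = 5
best_vloss = float('inf')
no_gain = 0

for vloss in [0.9, 0.8, 0.82, 0.81, 0.85, 0.86, 0.84, 0.83, 0.88]:  # stand-in losses
    if vloss < best_vloss:
        best_vloss = vloss
        no_gain = 0
    else:
        no_gain += 1
    if no_gain >= patience:
        print('Early Stop!')
        break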
Example #8
        phase='train',
        transform_type=0,
        preprocessing=utils.get_preprocessing(preprocessing_fn),
        mean=MEAN,
        std=STD,
    )
    dataloader = DataLoader(dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=NUM_WORKERS)
    for i, (img_ids, imgs, masks, _) in enumerate(dataloader):
        imgs = imgs.float().cuda(GPU)
        masks = masks.float().cuda(GPU)
        outputs = model(imgs)
        loss = criterion(outputs, masks)
        if epoch > 0:  # skip unstable phase
            agent.append(train_loss_record, global_step, loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        global_step += 1

        if global_step % VALIDATE_STEP == 0:
            train_loss = loss.item()
            model.eval()
            with torch.no_grad():
                _, imgs, masks, _ = validate_dataset.sample(BATCH_SIZE * 2)
                imgs = imgs.float().cuda(GPU)
                masks = masks.float().cuda(GPU)
                outputs = model(imgs)
Example #9
def trainer(data='coco',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',
            max_epochs=15,
            dispFreq=10,
            decay_c=0.0,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/coco',
            validFreq=100,
            lrate=0.0002,
            concat=True,
            reload_=False):

    hyper_params = {
        'data': data,
        'encoder': encoder,
        'batch_size': batch_size,
        'time': cur_time,
        'lrate': lrate,
        'concat': concat,
    }

    i2t_r1 = dict([('i2t_recall', 'r1')] + hyper_params.items())
    i2t_r5 = dict([('i2t_recall', 'r5')] + hyper_params.items())
    i2t_r10 = dict([('i2t_recall', 'r10')] + hyper_params.items())
    t2i_r1 = dict([('t2i_recall', 'r1')] + hyper_params.items())
    t2i_r5 = dict([('t2i_recall', 'r5')] + hyper_params.items())
    t2i_r10 = dict([('t2i_recall', 'r10')] + hyper_params.items())

    i2t_med = dict([('i2t_med', 'i2t_med')] + hyper_params.items())
    t2i_med = dict([('t2i_med', 't2i_med')] + hyper_params.items())

    agent = Agent(port=5020)
    i2t_r1_agent = agent.register(i2t_r1, 'recall', overwrite=True)
    i2t_r5_agent = agent.register(i2t_r5, 'recall', overwrite=True)
    i2t_r10_agent = agent.register(i2t_r10, 'recall', overwrite=True)
    t2i_r1_agent = agent.register(t2i_r1, 'recall', overwrite=True)
    t2i_r5_agent = agent.register(t2i_r5, 'recall', overwrite=True)
    t2i_r10_agent = agent.register(t2i_r10, 'recall', overwrite=True)

    i2t_med_agent = agent.register(i2t_med, 'median', overwrite=True)
    t2i_med_agent = agent.register(t2i_med, 'median', overwrite=True)

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_
    model_options['concat'] = concat

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'loading dataset'
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print 'Create dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    img_sen_model = ImgSenRanking(model_options)
    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin)
    loss_fn = loss_fn.cuda()

    params = filter(lambda p: p.requires_grad, img_sen_model.parameters())
    optimizer = torch.optim.Adam(params, lrate)

    uidx = 0
    curr = 0.0
    n_samples = 0

    for eidx in xrange(max_epochs):

        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x_id, im = homogeneous_data.prepare_data(x,
                                                     im,
                                                     worddict,
                                                     maxlen=maxlen_w,
                                                     n_words=n_words)

            if x_id is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            x_id = Variable(torch.from_numpy(x_id).cuda())
            im = Variable(torch.from_numpy(im).cuda())
            # Update
            ud_start = time.time()
            x, im = img_sen_model(x_id, im, x)
            cost = loss_fn(im, x)
            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm(params, grad_clip)
            optimizer.step()
            ud = time.time() - ud_start

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost.data.cpu(
                ).numpy()[0], 'UD ', ud

            if numpy.mod(uidx, validFreq) == 0:

                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['img_sen_model'] = img_sen_model

                ls, lim = encode_sentences(curr_model, dev[0]), encode_images(
                    curr_model, dev[1])

                r1, r5, r10, medr = 0.0, 0.0, 0.0, 0
                r1i, r5i, r10i, medri = 0.0, 0.0, 0.0, 0
                r_time = time.time()
                if data == 'arch' or data == 'arch_small':
                    (r1, r5, r10, medr) = i2t_arch(lim, ls)
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5,
                                                                     r10, medr)
                    (r1i, r5i, r10i, medri) = t2i_arch(lim, ls)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (
                        r1i, r5i, r10i, medri)
                else:
                    (r1, r5, r10, medr) = i2t(lim, ls)
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5,
                                                                     r10, medr)
                    (r1i, r5i, r10i, medri) = t2i(lim, ls)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (
                        r1i, r5i, r10i, medri)

                print "Cal Recall@K using %ss" % (time.time() - r_time)

                record_num = uidx / validFreq
                agent.append(i2t_r1_agent, record_num, r1)
                agent.append(i2t_r5_agent, record_num, r5)
                agent.append(i2t_r10_agent, record_num, r10)
                agent.append(t2i_r1_agent, record_num, r1i)
                agent.append(t2i_r5_agent, record_num, r5i)
                agent.append(t2i_r10_agent, record_num, r10i)

                agent.append(i2t_med_agent, record_num, medr)
                agent.append(t2i_med_agent, record_num, medri)

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print 'Saving model...',
                    pkl.dump(
                        model_options,
                        open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                    torch.save(img_sen_model.state_dict(),
                               '%s_model_%s.pkl' % (saveto, encoder))
                    print 'Done'

        print 'Seen %d samples' % n_samples
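
Example #9 reports Recall@K and median rank through external i2t/t2i helpers. A sketch of the standard computation given a score matrix whose (i, j) entry is the similarity between image i and caption j, assuming the matching caption sits on the diagonal:

import numpy as np

def recall_at_k(scores, ks=(1, 5, 10)):
    # 0-based rank of the ground-truth (diagonal) item in each row
    order = np.argsort(-scores, axis=1)
    ranks = np.array([np.where(order[i] == i)[0][0] for i in range(len(scores))])
    recalls = [100.0 * np.mean(ranks < k) for k in ks]
    return recalls, np.median(ranks) + 1  # median rank is 1-based

scores = np.random.rand(100, 100)
(r1, r5, r10), medr = recall_at_k(scores)
print(r1, r5, r10, medr)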
Example #10
        for optimizer in ['SGD', 'Adam']:
            for criteria in criteria2metric.keys():
                for corpus in ['wikipedia', 'PennTreeBank']:
                    hyperparameters = {
                        'learning rate': learning_rate,
                        'batch size': batch_size,
                        'criteria': criteria,
                        'corpus': corpus,
                        'optimizer': optimizer,
                        'momentum': 0.9,
                        'num hidden': 300,
                    }
                    metric = criteria2metric[criteria]
                    print('register criteria <%s> as metric <%s>' %
                          (criteria, metric))
                    name = agent.register(hyperparameters, metric)
                    name_list.append(name)
                    criteria_list.append(criteria)
                    offset_list.append(abs(random.random() * 0.5))

for i in range(10000):
    print('append %d' % i)
    for name, criteria, offset in zip(name_list, criteria_list, offset_list):
        value = math.exp(-i / 10.0) + offset + random.random() * 0.05
        metric = criteria2metric[criteria]
        if metric == 'accuracy': value = 1 - value
        scale = metric2scale[metric]
        value *= scale
        agent.append(name, i, value)
    time.sleep(0.1)
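
The excerpt in Example #10 references names defined above its truncation point (agent, criteria2metric, metric2scale, name_list, and friends). A hypothetical reconstruction of that setup, consistent with how the loop uses them:

import math
import random
import time
from hyperboard import Agent

agent = Agent(port=5000)
criteria2metric = {'train loss': 'loss', 'valid accuracy': 'accuracy'}
metric2scale = {'loss': 1.0, 'accuracy': 100.0}
learning_rate, batch_size = 1e-3, 128
name_list, criteria_list, offset_list = [], [], []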
Example #11
    for i, (original_img, original_bbox, img, bbox, label, scale, flip) in enumerate(train_dataloader):
        losses = model.train_step(img, bbox, label, scale)
        print('Epoch{} [{}/{}] \tTotal Loss: {:.6f}'.format(epoch, i, train_num, losses.total_loss.item()))

        # this bookkeeping block can be deleted
        global_step += 1
        ls[0] += losses.rpn_loc_loss.item()
        ls[1] += losses.rpn_cls_loss.item()
        ls[2] += losses.roi_loc_loss.item()
        ls[3] += losses.roi_cls_loss.item()
        ls[4] += losses.total_loss.item()

        if global_step % record_step == 0:
            ls_record[global_step] = ls / record_step
            ls = np.zeros(5)
            with open('losses_record.pkl', 'wb') as f:
                pickle.dump(ls_record, f)

            if opt.use_hyperboard:
                agent.append(rpn_loc_loss, global_step, ls_record[global_step][0])
                agent.append(rpn_cls_loss, global_step, ls_record[global_step][1])
                agent.append(roi_loc_loss, global_step, ls_record[global_step][2])
                agent.append(roi_cls_loss, global_step, ls_record[global_step][3])
                agent.append(loss_record, global_step, ls_record[global_step][4])


    if epoch == 2:
        model.decay_lr(opt.lr_decay)

model.save(save_optimizer=True)
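
Example #11 averages its five Faster R-CNN loss terms over a window of record_step iterations before logging them. A standalone sketch of that windowed-average bookkeeping, with random stand-ins for the real loss components:

import pickle
import numpy as np

record_step = 20
ls = np.zeros(5)  # running sums of the five loss terms
ls_record = {}    # global_step -> mean losses over the last window

for global_step in range(1, 101):
    ls += np.random.rand(5)  # stand-in for the five Faster R-CNN losses
    if global_step % record_step == 0:
        ls_record[global_step] = ls / record_step
        ls = np.zeros(5)

with open('losses_record.pkl', 'wb') as f:
    pickle.dump(ls_record, f)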
Example #12
        loss = criterion(predicts, train_set_labels)
        #
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #valid set
    if (epoch % 2) == 0:
        model.eval()
        valid_predicts = model(valid_set_inputs)
        if use_cuda:
            pred = valid_predicts.data.cpu().numpy()
            pred = pred[:, 1]
        else:
            pred = valid_predicts.data.numpy()[:, 1]

        pred = pred.tolist()
        targets = valid_set_labels.tolist()
        auc_score = roc_auc_score(targets, pred)
        print('saving model: train_set_loss-{:.4f} valid_set_auc-{:.4f}.model'.
              format(loss.data[0], auc_score))
        #
        agent.append(valid_auc, epoch, auc_score)
        agent.append(train_loss, epoch, loss.data[0])
        #save model
        torch.save(
            model.state_dict(),
            fo + '/train_set_loss-{:.4f} valid_set_auc-{:.4f}.model'.format(
                loss.data[0], auc_score))
        #for training
        model.train()
print('training finished')
Example #13
            ddpg_agent.train_commander()

            if done:
                qs = []
                q = np.zeros((config.MYSELF_NUM))
                total_reward = 0
                for r in rs[::-1]:
                    q = r + config.GAMMA * q
                    total_reward += r.sum() / config.MYSELF_NUM
                    qs.append(q)
                qs = np.asarray(qs)
                q_mean = np.mean(qs)
                ddpg_agent.train_record[episode] = total_reward
                print('memory: {}/{}'.format(
                    ddpg_agent.commander_memory.current_index,
                    ddpg_agent.commander_memory.max_len))
                print('q_mean: ', q_mean)
                print('train_reward', total_reward)
                HBagent.append(train_r, episode, total_reward)

                break

            state = next_state

        if episode % config.SAVE_ITERVAL == 0:
            print('\nsave model\n')
            ddpg_agent.save(episode)

        if episode == 800:
            break
Example #14
def train(lr, net, epoch, train_loader, valid_loader, transform, hyperparameters, batch_size):

    # register hypercurve
    agent = Agent(port=5005)
    hyperparameters['criteria'] = 'train loss'
    train_loss = agent.register(hyperparameters, 'loss')
    
    hyperparameters['criteria'] = 'valid loss'
    valid_loss = agent.register(hyperparameters, 'loss')
    
    hyperparameters['criteria'] = 'valid bleu'
    valid_bleu = agent.register(hyperparameters, 'bleu')
    
    hyperparameters['criteria'] = 'train bleu'
    train_bleu = agent.register(hyperparameters, 'bleu')
    
    hyperparameters['criteria'] = 'teacher_forcing_ratio'
    hyper_tfr = agent.register(hyperparameters, 'ratio')
    
    hyperparameters['criteria'] = 'teacher_forcing_loss'
    valid_tf_loss = agent.register(hyperparameters, 'loss')

    if torch.cuda.is_available():
        net.cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
    net.train()
    
    best_score = -1
    global_steps = 578800
    best_valid_loss = 10000
    for iepoch in range(epoch):
        
        batchid = 0
        for (_, tdata) in enumerate(train_loader, 0):

            entext = tdata['entext']
            enlen = tdata['enlen']
            zhlabel = tdata['zhlabel']
            zhgtruth = tdata['zhgtruth']
            zhlen = tdata['zhlen']
            enstr = tdata['enstr']
            zhstr = tdata['zhstr']
            
            teacher_forcing_ratio = 1
            print('teacher_forcing_ratio: ', teacher_forcing_ratio)

            decoder_outputs, ret_dict = net(entext, zhgtruth, True, teacher_forcing_ratio)

            loss = net.get_loss(decoder_outputs, zhlabel)
            optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm(net.parameters(), 5)
            optimizer.step()
            
            batchid += 1
            global_steps += 1
            
            print(global_steps, iepoch, batchid, max(enlen), sum(loss.data.cpu().numpy()))
            agent.append(train_loss, global_steps, sum(loss.data.cpu().numpy()))
            agent.append(hyper_tfr, global_steps, teacher_forcing_ratio)
            
            if global_steps % 50 == 0:
                net.eval()
                decoder_outputs, ret_dict = net(entext, zhgtruth, True, teacher_forcing_ratio)
                
                length = ret_dict['length']
                prediction = [0 for i in range(len(length))]
                tmppre = [_.squeeze().cpu().data.tolist() for _ in ret_dict['sequence']]
                tmppre = np.array(tmppre).transpose(1, 0)
                
                for i in range(len(tmppre)):
                    prediction[i] = tmppre[i][:length[i]]
                    prediction[i] = transform.i2t(prediction[i], language='zh')
                    prediction[i] = re.sub(r'nuk#', '', prediction[i])
                    prediction[i] = re.sub(r'eos#', '', prediction[i])

                tmpscore = bleuscore.score(prediction, zhstr)

                for i in range(5):
                    print(prediction[i])
                    print(zhstr[i])
                    print('-------------------\n')

                del decoder_outputs, ret_dict
                agent.append(train_bleu, global_steps, tmpscore)
                net.train()

            if global_steps % 200 == 0:
                print('\n------------------------\n')
                net.eval()
                all_pre = []
                all_label = []
                all_loss = 0
                all_en = []
                bats = 0
                teacher_forcing_loss = 0
                for (_, vdata) in enumerate(valid_loader, 0):
                    
                    entext = vdata['entext']
                    enlen = vdata['enlen']
                    zhlabel = vdata['zhlabel']
                    zhgtruth = vdata['zhgtruth']
                    zhlen = vdata['zhlen']
                    enstr = vdata['enstr']
                    zhstr = vdata['zhstr']
                    
                    decoder_outputs, ret_dict = net(entext, None, True, 0)
                    length = ret_dict['length']
                    prediction = [0 for i in range(len(length))]
                    tmppre = [_.squeeze().cpu().data.tolist() for _ in ret_dict['sequence']]
                    tmppre = np.array(tmppre).transpose(1, 0)
                    
                    for i in range(len(tmppre)):
                        prediction[i] = tmppre[i][:length[i]]
                        prediction[i] = transform.i2t(prediction[i], language='zh')
                        prediction[i] = re.sub(r'nuk#', '', prediction[i])
                        prediction[i] = re.sub(r'eos#', '', prediction[i])
                    
                    loss = net.get_loss(decoder_outputs, zhlabel)
                    
                    all_pre.extend(prediction)
                    all_label.extend(zhstr)
                    all_en.extend(enstr)
                    all_loss += sum(loss.data.cpu().numpy())
                    
                    del loss, decoder_outputs, ret_dict

                    # teacher forcing loss, to judge if overfit
                    decoder_outputs, _ = net(entext, zhgtruth, True, 1)
                    loss = net.get_loss(decoder_outputs, zhlabel)
                    teacher_forcing_loss += sum(loss.data.cpu().numpy()) 
                    bats += 1
                score = bleuscore.score(all_pre, all_label)
            
                for i in range(0, 400):
                    print(all_en[i])
                    print(all_pre[i])
                    print(all_label[i])
                    print('-------------------\n')
        
                all_loss /= bats
                teacher_forcing_loss /= bats
                print(global_steps, iepoch, batchid, all_loss, teacher_forcing_loss, score, '\n********************\n')
                agent.append(valid_loss, global_steps, all_loss)
                agent.append(valid_bleu, global_steps, score)
                agent.append(valid_tf_loss, global_steps, teacher_forcing_loss)
                if best_valid_loss > all_loss:
                    
                    best_valid_loss = all_loss
                    #bestscore = score
                    _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(teacher_forcing_ratio, all_loss, score, global_steps)
                    torch.save(net.state_dict(), _)
                
                elif global_steps % 400 == 0:
                    _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(teacher_forcing_ratio, all_loss, score, global_steps)
                    torch.save(net.state_dict(), _)

                del all_label, all_loss, all_pre
                net.train()