Example 1
def evaluate(encoder,
             decoder,
             valid_set,
             lang,
             embedding_size,
             encoder_style=ENCODER_STYLE,
             iter_time=10,
             beam_size=1,
             verbose=True):
    """The evaluate procedure."""
    # Get evaluation data
    valid_iter = data_iter(valid_set, batch_size=1, shuffle=False)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    for iteration in range(iter_time):

        # Get data
        data, idx_data = get_batch(next(valid_iter))
        rt, re, rm, summary = idx_data

        # For Encoding
        rt = Variable(torch.LongTensor(rt))
        re = Variable(torch.LongTensor(re))
        rm = Variable(torch.LongTensor(rm))

        # For Decoding
        summary = Variable(torch.LongTensor(summary))

        if use_cuda:
            rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
            summary = summary.cuda()

        # Get decoding words and attention matrix
        decoded_words, decoder_attentions = predictwords(
            rt, re, rm, encoder, decoder, lang, embedding_size, encoder_style,
            beam_size)

        res = ' '.join(decoded_words[:-1])
        if verbose:
            print(res)
        yield res

        # Compare with the original data
        triplets, gold_summary = data[0]
        print(triplets)
        for word in gold_summary:
            print(word, end=' ')
        print(' ')

        showAttention(triplets, decoded_words, decoder_attentions)
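Note that evaluate is a generator: each decoded summary is yielded as it is produced, and the gold triplets, reference summary, and attention plot are only printed once the caller resumes it. A minimal consumption sketch, assuming the encoder, decoder, valid_set, lang, and embedding_size objects are supplied by the surrounding training script:

# Minimal sketch of consuming the generator above; encoder, decoder,
# valid_set, lang and embedding_size are assumed to come from the
# surrounding training/evaluation code and are not defined here.
predictions = []
for res in evaluate(encoder, decoder, valid_set, lang,
                    embedding_size, iter_time=10, verbose=False):
    predictions.append(res)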
Example 2
def evaluate(encoder,
             decoder,
             valid_set,
             lang,
             embedding_size,
             encoder_style=ENCODER_STYLE,
             epoch_time=EPOCH_TIME,
             beam_size=1,
             verbose=True):
    """The evaluate procedure."""
    # Get evaluation data
    valid_iter = data_iter(valid_set, batch_size=1, shuffle=True)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    for iteration in range(epoch_time):

        # Get data
        data, idx_data = get_batch(next(valid_iter))
        rt, re, rm, summary = idx_data

        # For Encoding
        rt = Variable(torch.LongTensor(rt))
        re = Variable(torch.LongTensor(re))
        rm = Variable(torch.LongTensor(rm))

        # For Decoding
        summary = Variable(torch.LongTensor(summary))

        if use_cuda:
            rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
            summary = summary.cuda()

        # Get decoding words and attention matrix
        decoded_words, decoder_attentions = predictwords(
            rt, re, rm, summary, encoder, decoder, lang, embedding_size,
            encoder_style, beam_size)

        res = ' '.join(decoded_words[:-1])
        if verbose:
            print(res)
        yield res
Example 3
def evaluate(encoder,
             decoder,
             valid_set,
             lang,
             embedding_size,
             encoder_style=ENCODER_STYLE,
             epoch_time=EPOCH_TIME,
             beam_size=1,
             verbose=True):
    valid_iter = data_iter(valid_set, batch_size=1, shuffle=True)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    for iteration in range(epoch_time):
        data, idx_data = get_batch(next(valid_iter))
        rt, re, rm, summary = idx_data

        rt = Variable(torch.LongTensor(rt))
        re = Variable(torch.LongTensor(re))
        rm = Variable(torch.LongTensor(rm))

        summary = Variable(torch.LongTensor(summary))

        if use_cuda:
            rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
            summary = summary.cuda()

        decoded_words, decoder_attentions = hierarchical_predictwords(
            rt, re, rm, summary, encoder, decoder, lang, embedding_size,
            encoder_style, beam_size)

        res = ' '.join(decoded_words[:-1])
        if verbose:
            print(res)
        yield res
Example 4
def train(train_set,
          langs,
          embedding_size=600,
          learning_rate=0.01,
          iter_time=10,
          batch_size=32,
          get_loss=GET_LOSS,
          save_model=SAVE_MODEL,
          encoder_style=ENCODER_STYLE,
          use_model=USE_MODEL):
    """The training procedure."""
    # Set the timer
    start = time.time()

    # Initialize the model
    emb = docEmbedding(langs['rt'].n_words, langs['re'].n_words,
                       langs['rm'].n_words, embedding_size)
    emb.init_weights()

    if encoder_style == 'LIN':
        encoder = EncoderLIN(embedding_size, emb)
    elif encoder_style == 'BiLSTM':
        encoder = EncoderBiLSTM(embedding_size, emb)
    else:
        encoder = EncoderRNN(embedding_size, emb)

    decoder = AttnDecoderRNN(embedding_size, langs['summary'].n_words)

    if use_cuda:
        emb.cuda()
        encoder.cuda()
        decoder.cuda()

    if use_model is not None:
        encoder = load_model(encoder, use_model[0])
        decoder = load_model(decoder, use_model[1])

    # Choose optimizer
    loss_optimizer = optim.Adagrad(list(encoder.parameters()) +
                                   list(decoder.parameters()),
                                   lr=learning_rate,
                                   lr_decay=0,
                                   weight_decay=0)
    # decoder_optimizer = optim.Adagrad(decoder.parameters(), lr=learning_rate, lr_decay=0, weight_decay=0)

    criterion = nn.NLLLoss()

    total_loss = 0
    iteration = 0
    for epo in range(1, iter_time + 1):
        print("Epoch #%d" % (epo))
        # Get data

        train_iter = data_iter(train_set, batch_size=batch_size)
        for dt in train_iter:
            iteration += 1
            data, idx_data = get_batch(dt)
            rt, re, rm, summary = idx_data

            # Add paddings
            rt = addpaddings(rt)
            re = addpaddings(re)
            rm = addpaddings(rm)
            summary = addpaddings(summary)

            rt = Variable(torch.LongTensor(rt), requires_grad=False)
            re = Variable(torch.LongTensor(re), requires_grad=False)
            rm = Variable(torch.LongTensor(rm), requires_grad=False)

            # For Decoding
            summary = Variable(torch.LongTensor(summary), requires_grad=False)

            if use_cuda:
                rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
                summary = summary.cuda()

            # Get the average loss on the sentences
            loss = sentenceloss(rt, re, rm, summary, encoder, decoder,
                                loss_optimizer, criterion, embedding_size,
                                encoder_style)
            total_loss += loss

            # Print the information and save model
            if iteration % get_loss == 0:
                print("Time {}, iter {}, avg loss = {:.4f}".format(
                    gettime(start), iteration, total_loss / get_loss))
                total_loss = 0
        if epo % save_model == 0:
            torch.save(encoder.state_dict(),
                       "{}_encoder_{}".format(OUTPUT_FILE, iteration))
            torch.save(decoder.state_dict(),
                       "{}_decoder_{}".format(OUTPUT_FILE, iteration))
            print("Save the model at iter {}".format(iteration))

    return encoder, decoder
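Example 4 restores checkpoints through a load_model helper that is not shown in the listing. A hypothetical minimal version, consistent with how it is called above and with the torch.save(model.state_dict(), ...) calls at the end of the loop:

# Hypothetical sketch of the load_model helper used above; the actual
# implementation lives elsewhere in the repository. It loads a state_dict
# saved with torch.save(model.state_dict(), path) back into the model.
def load_model(model, model_path):
    model.load_state_dict(torch.load(model_path))
    return model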
Example 5
def train(train_set,
          langs,
          embedding_size=EMBEDDING_SIZE,
          learning_rate=LR,
          batch_size=BATCH_SIZE,
          get_loss=GET_LOSS,
          grad_clip=GRAD_CLIP,
          encoder_style=ENCODER_STYLE,
          decoder_style=DECODER_STYLE,
          to_copy=TOCOPY,
          epoch_time=EPOCH_TIME,
          layer_depth=LAYER_DEPTH,
          max_length=MAX_LENGTH,
          max_sentence=MAX_SENTENCES,
          save_model=SAVE_MODEL,
          output_file=OUTPUT_FILE,
          iter_num=iterNum,
          pretrain=PRETRAIN):
    """The training procedure."""
    # # Test arg parser (For Debugging)
    # print("embedding_size={}, learning_rate={}, batch_size={}, get_loss={}, grad_clip={},\
    #         encoder_style={}, decoder_style={}, max_length={},\
    #         max_sentence={}, save_model={}, output_file={}, to_copy={},\
    #         epoch={}, layer_depth={}, iter num={}, pretrain={}".format(
    #         embedding_size, learning_rate, batch_size, get_loss, grad_clip,
    #         encoder_style, decoder_style, max_length, max_sentence, save_model, output_file,
    #         to_copy, epoch_time, layer_depth, iter_num, pretrain))
    # Set the timer
    start = time.time()

    # Initialize the model
    emb = docEmbedding(langs['rt'].n_words, langs['re'].n_words,
                       langs['rm'].n_words, embedding_size)
    emb.init_weights()

    # Choose encoder style
    if encoder_style == 'LIN':
        encoder = EncoderLIN(embedding_size, emb)

    elif encoder_style == 'BiLSTM':
        encoder = EncoderBiLSTM(embedding_size, emb, n_layers=layer_depth)

    elif encoder_style == 'BiLSTMMax':
        encoder = EncoderBiLSTMMaxPool(embedding_size,
                                       emb,
                                       n_layers=layer_depth)

    elif encoder_style == 'HierarchicalBiLSTM':
        encoder_args = {
            "hidden_size": embedding_size,
            "local_embed": emb,
            "n_layers": layer_depth
        }
        encoder = HierarchicalBiLSTM(**encoder_args)

    elif encoder_style == 'HierarchicalLIN':
        encoder_args = {"hidden_size": embedding_size, "local_embed": emb}
        encoder = HierarchicalLIN(**encoder_args)

    else:
        # Initialize the hierarchical encoder RNN (both global and local)
        encoder_args = {
            "hidden_size": embedding_size,
            "local_embed": emb,
            "n_layers": layer_depth
        }
        encoder = HierarchicalRNN(**encoder_args)

    # Choose decoder style and training function
    if decoder_style == 'HierarchicalRNN':
        decoder = HierarchicalDecoder(embedding_size,
                                      langs['summary'].n_words,
                                      n_layers=layer_depth,
                                      copy=to_copy)
        train_func = Hierarchical_seq_train
    else:
        decoder = AttnDecoderRNN(embedding_size,
                                 langs['summary'].n_words,
                                 n_layers=layer_depth,
                                 copy=to_copy)
        train_func = Plain_seq_train

    if use_cuda:
        emb.cuda()
        encoder.cuda()
        decoder.cuda()

    # Choose optimizer
    loss_optimizer = optim.Adagrad(list(encoder.parameters()) +
                                   list(decoder.parameters()),
                                   lr=learning_rate,
                                   lr_decay=0,
                                   weight_decay=0)

    # loss_optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
    #                             lr=learning_rate)

    # Load pre-trained model
    use_model = None
    if pretrain is not None and iter_num is not None:
        use_model = [
            './models/' + pretrain + '_' + s + '_' + str(iter_num)
            for s in ['encoder', 'decoder', 'optim']
        ]

    if use_model is not None:
        encoder = load_model(encoder, use_model[0])
        decoder = load_model(decoder, use_model[1])
        loss_optimizer.load_state_dict(torch.load(use_model[2]))
        print("Load Pretrain Model {}".format(use_model))
    else:
        print("Not use Pretrain Model")

    criterion = nn.NLLLoss()

    # Build up the model
    model = Seq2Seq(encoder, decoder, train_func, criterion, embedding_size,
                    langs)

    # print(encoder)
    # print(decoder)
    # print(loss_optimizer)

    total_loss = 0
    iteration = 0
    for epo in range(1, epoch_time + 1):
        # Start of an epoch
        print("Epoch #%d" % (epo))

        # Get data
        train_iter = data_iter(train_set, batch_size=batch_size)
        for dt in train_iter:
            iteration += 1
            data, idx_data = get_batch(dt)
            rt, re, rm, summary = idx_data

            # Debugging: check the input triplets
            # show_triplets(data[0][0])

            # Add paddings
            rt = addpaddings(rt)
            re = addpaddings(re)
            rm = addpaddings(rm)

            # For summary paddings: if the model is hierarchical, pad between sentences.
            # If the batch_size is 1, sentence padding is not needed.
            if decoder_style == 'HierarchicalRNN' and batch_size != 1:
                summary = add_sentence_paddings(summary)
            else:
                summary = addpaddings(summary)

            rt = Variable(torch.LongTensor(rt), requires_grad=False)
            re = Variable(torch.LongTensor(re), requires_grad=False)
            rm = Variable(torch.LongTensor(rm), requires_grad=False)

            # For Decoding
            summary = Variable(torch.LongTensor(summary), requires_grad=False)

            if use_cuda:
                rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
                summary = summary.cuda()

            # Zero the gradient
            loss_optimizer.zero_grad()
            model.train()
            # Calculate the loss for a batch of input sequences
            loss = sequenceloss(rt, re, rm, summary, model)

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm(
                list(model.encoder.parameters()) +
                list(model.decoder.parameters()), grad_clip)
            loss_optimizer.step()

            # Get the average loss on the sentences
            target_length = summary.size()[1]
            if float(torch.__version__[:3]) > 0.3:
                total_loss += loss.item()
            else:
                total_loss += loss.data[0]

            # Print the information and save model
            if iteration % get_loss == 0:
                print("Time {}, iter {}, Seq_len:{}, avg loss = {:.4f}".format(
                    gettime(start), iteration, target_length,
                    total_loss / get_loss))
                total_loss = 0

        if epo % save_model == 0:
            torch.save(encoder.state_dict(),
                       "models/{}_encoder_{}".format(output_file, iteration))
            torch.save(decoder.state_dict(),
                       "models/{}_decoder_{}".format(output_file, iteration))
            torch.save(loss_optimizer.state_dict(),
                       "models/{}_optim_{}".format(output_file, iteration))
            print("Save the model at iter {}".format(iteration))

    return model.encoder, model.decoder
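Example 5 still wraps its index tensors in Variable and branches on torch.__version__ when reading the loss. On PyTorch 0.4 and later, Variable is merged into Tensor, so the same preparation can be written directly on tensors; a rough sketch (not part of the original code, with rt standing in for any of the padded index lists):

# Sketch for PyTorch >= 0.4 only: build the input tensor directly,
# without Variable, and place it on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
rt = torch.tensor(rt, dtype=torch.long, device=device)
# ... likewise for re, rm and summary; the loss is then read with loss.item().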
Example 6
def train(train_set,
          langs,
          embedding_size=600,
          learning_rate=0.01,
          iter_time=10,
          batch_size=32,
          get_loss=GET_LOSS,
          save_model=SAVE_MODEL,
          encoder_style=ENCODER_STYLE,
          decoder_style=DECODER_STYLE,
          use_model=USE_MODEL):
    """The training procedure."""
    # Set the timer
    start = time.time()

    encoder, decoder, loss_optimizer, train_func = model_initialization(
        encoder_style, decoder_style, langs, embedding_size, learning_rate,
        use_model)

    criterion = nn.NLLLoss()

    # Build up the model
    model = Seq2Seq(encoder, decoder, train_func, None, criterion,
                    embedding_size, langs)

    # print(encoder)
    # print(decoder)
    # print(loss_optimizer)

    total_loss = 0
    iteration = 0
    for epo in range(1, iter_time + 1):
        # Start of an epoch
        print("Epoch #%d" % (epo))

        # Get data
        train_iter = data_iter(train_set, batch_size=batch_size)
        for dt in train_iter:
            iteration += 1
            data, idx_data = get_batch(dt)
            print(idx_data)
            rt, re, rm, summary = idx_data

            # Debugging: check the input triplets
            # show_triplets(data[0][0])

            # Add paddings
            rt = addpaddings(rt)
            re = addpaddings(re)
            rm = addpaddings(rm)

            # For summary paddings: if the model is hierarchical, pad between sentences
            if decoder_style == 'HierarchicalRNN':
                summary = add_sentence_paddings(summary)
            else:
                summary = addpaddings(summary)

            rt = Variable(torch.LongTensor(rt), requires_grad=False)
            re = Variable(torch.LongTensor(re), requires_grad=False)
            rm = Variable(torch.LongTensor(rm), requires_grad=False)

            # DEBUG:
            if torch.sum(rm == 3).item() == 0:
                print('skip')
                continue

            # For Decoding
            summary = Variable(torch.LongTensor(summary), requires_grad=False)

            if use_cuda:
                rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
                summary = summary.cuda()

            # Zero the gradient
            loss_optimizer.zero_grad()
            model.train()
            # Calculate the loss for a batch of input sequences
            loss = sequenceloss(rt, re, rm, summary, model)

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm(
                list(model.encoder.parameters()) +
                list(model.decoder.parameters()), GRAD_CLIP)
            loss_optimizer.step()

            # Get the average loss on the sentences
            target_length = summary.size()[1]
            if float(torch.__version__[:3]) > 0.3:
                total_loss += loss.item()
            else:
                total_loss += loss.data[0]

            # Print the information and save model
            if iteration % get_loss == 0:
                print("Time {}, iter {}, Seq_len:{}, avg loss = {:.4f}".format(
                    gettime(start), iteration, target_length,
                    total_loss / get_loss))
                total_loss = 0

        if epo % save_model == 0:
            torch.save(encoder.state_dict(),
                       "models/{}_encoder_{}".format(OUTPUT_FILE, iteration))
            torch.save(decoder.state_dict(),
                       "models/{}_decoder_{}".format(OUTPUT_FILE, iteration))
            torch.save(loss_optimizer.state_dict(),
                       "models/{}_optim_{}".format(OUTPUT_FILE, iteration))
            print("Save the model at iter {}".format(iteration))

    return model.encoder, model.decoder
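Example 6 delegates model construction to a model_initialization helper that is not shown. Judging from the inline setup in Examples 5 and 7, a hypothetical condensed version consistent with the call above could look like this (only the default, non-hierarchical-encoder branch is sketched):

# Hypothetical sketch of model_initialization, condensed from the inline
# setup in Examples 5 and 7; the encoder_style branches are omitted and
# only the default hierarchical encoder RNN is built.
def model_initialization(encoder_style, decoder_style, langs,
                         embedding_size, learning_rate, use_model):
    emb = docEmbedding(langs['rt'].n_words, langs['re'].n_words,
                       langs['rm'].n_words, embedding_size)
    emb.init_weights()

    encoder = HierarchicalEncoderRNN(hidden_size=embedding_size,
                                     local_embed=emb)

    if decoder_style == 'HierarchicalRNN':
        decoder = HierarchicalDecoder(embedding_size, langs['summary'].n_words)
        train_func = Hierarchical_seq_train
    else:
        decoder = AttnDecoderRNN(embedding_size, langs['summary'].n_words)
        train_func = Plain_seq_train

    if use_cuda:
        emb.cuda()
        encoder.cuda()
        decoder.cuda()

    loss_optimizer = optim.Adagrad(list(encoder.parameters()) +
                                   list(decoder.parameters()),
                                   lr=learning_rate)

    if use_model is not None:
        encoder = load_model(encoder, use_model[0])
        decoder = load_model(decoder, use_model[1])
        loss_optimizer.load_state_dict(torch.load(use_model[2]))

    return encoder, decoder, loss_optimizer, train_func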
Example 7
def train(train_set,
          langs,
          embedding_size=600,
          learning_rate=0.01,
          iter_time=10,
          batch_size=32,
          get_loss=GET_LOSS,
          save_model=SAVE_MODEL,
          encoder_style=ENCODER_STYLE,
          decoder_style=DECODER_STYLE,
          use_model=USE_MODEL):
    """The training procedure."""
    # Set the timer
    start = time.time()

    # Initialize the model
    emb = docEmbedding(langs['rt'].n_words, langs['re'].n_words,
                       langs['rm'].n_words, embedding_size)
    emb.init_weights()

    # Choose encoder style
    # TODO: Set up a choice for hierarchical or not
    if encoder_style == 'LIN':
        encoder = EncoderLIN(embedding_size, emb)
    elif encoder_style == 'BiLSTM':
        encoder = EncoderBiLSTM(embedding_size, emb)
    elif encoder_style == 'BiLSTMMax':
        encoder = EncoderBiLSTMMaxPooling(embedding_size, emb)
    elif encoder_style == 'HierarchicalBiLSTM':
        encoder_args = {"hidden_size": embedding_size, "local_embed": emb}
        encoder = HierarchicalBiLSTM(**encoder_args)
    elif encoder_style == 'HierarchicalLIN':
        encoder_args = {"hidden_size": embedding_size, "local_embed": emb}
        encoder = HierarchicalLIN(**encoder_args)
    else:
        # Initialize the hierarchical encoder RNN (both global and local)
        encoder_args = {"hidden_size": embedding_size, "local_embed": emb}
        encoder = HierarchicalEncoderRNN(**encoder_args)

    # Choose decoder style and training function
    if decoder_style == 'HierarchicalRNN':
        decoder = HierarchicalDecoder(embedding_size, langs['summary'].n_words)
        train_func = Hierarchical_seq_train
    else:
        decoder = AttnDecoderRNN(embedding_size, langs['summary'].n_words)
        train_func = Plain_seq_train

    if use_cuda:
        emb.cuda()
        encoder.cuda()
        decoder.cuda()

    # Choose optimizer
    loss_optimizer = optim.Adagrad(list(encoder.parameters()) +
                                   list(decoder.parameters()),
                                   lr=learning_rate,
                                   lr_decay=0,
                                   weight_decay=0)

    # loss_optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
    #                             lr=learning_rate)

    if use_model is not None:
        encoder = load_model(encoder, use_model[0])
        decoder = load_model(decoder, use_model[1])
        loss_optimizer.load_state_dict(torch.load(use_model[2]))

    criterion = nn.NLLLoss()

    # Build up the model
    model = Seq2Seq(encoder, decoder, train_func, criterion, embedding_size,
                    langs)

    # print(encoder)
    # print(decoder)
    # print(loss_optimizer)

    total_loss = 0
    iteration = 0
    for epo in range(1, iter_time + 1):
        # Start of an epoch
        print("Epoch #%d" % (epo))

        # Get data
        train_iter = data_iter(train_set, batch_size=batch_size)
        for dt in train_iter:
            iteration += 1
            data, idx_data = get_batch(dt)
            rt, re, rm, summary = idx_data

            # Debugging: check the input triplets
            # show_triplets(data[0][0])

            # Add paddings
            rt = addpaddings(rt)
            re = addpaddings(re)
            rm = addpaddings(rm)

            # For summary paddings: if the model is hierarchical, pad between sentences
            if decoder_style == 'HierarchicalRNN':
                summary = add_sentence_paddings(summary)
            else:
                summary = addpaddings(summary)

            rt = Variable(torch.LongTensor(rt), requires_grad=False)
            re = Variable(torch.LongTensor(re), requires_grad=False)
            rm = Variable(torch.LongTensor(rm), requires_grad=False)

            # For Decoding
            summary = Variable(torch.LongTensor(summary), requires_grad=False)

            if use_cuda:
                rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
                summary = summary.cuda()

            # Zero the gradient
            loss_optimizer.zero_grad()
            model.train()
            # Calculate the loss for a batch of input sequences
            loss = sequenceloss(rt, re, rm, summary, model)

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm(
                list(model.encoder.parameters()) +
                list(model.decoder.parameters()), GRAD_CLIP)
            loss_optimizer.step()

            # Get the average loss on the sentences
            target_length = summary.size()[1]
            if float(torch.__version__[:3]) > 0.3:
                total_loss += loss.item()
            else:
                total_loss += loss.data[0]

            # Print the information and save model
            if iteration % get_loss == 0:
                print("Time {}, iter {}, Seq_len:{}, avg loss = {:.4f}".format(
                    gettime(start), iteration, target_length,
                    total_loss / get_loss))
                total_loss = 0

        if epo % save_model == 0:
            torch.save(encoder.state_dict(),
                       "models/{}_encoder_{}".format(OUTPUT_FILE, iteration))
            torch.save(decoder.state_dict(),
                       "models/{}_decoder_{}".format(OUTPUT_FILE, iteration))
            torch.save(loss_optimizer.state_dict(),
                       "models/{}_optim_{}".format(OUTPUT_FILE, iteration))
            print("Save the model at iter {}".format(iteration))

    return model.encoder, model.decoder
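One portability note on the gradient clipping used in Examples 5 through 7 and 9: torch.nn.utils.clip_grad_norm was deprecated in PyTorch 0.4 in favour of the in-place clip_grad_norm_. On newer versions the call above would read:

# On PyTorch >= 0.4, use the in-place variant of gradient clipping.
torch.nn.utils.clip_grad_norm_(
    list(model.encoder.parameters()) +
    list(model.decoder.parameters()), GRAD_CLIP)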
Example 8
        train_input = nd.array(hf.get('input'))
        train_label = nd.array(hf.get('label'))
    net = SrCnn()
    net.initialize(ctx=try_gpu())
    if os.path.exists("srcnn.params"):
        net.load_parameters("srcnn.params")
    ctx = try_gpu()
    trainer = gluon.Trainer(net.collect_params(),
                            'sgd', {'learning_rate': lr})
    print('training on', ctx)
    loss = gloss.L2Loss()
    for ep in range(epoch):
        train_l_sum, n, start = 0.0, 0, time.time()
        # batch_idxs = len(train_input) // batch_size

        for X, y in data_iter(batch_size, train_input, train_label):
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            X = nd.transpose(X, (0, 3, 1, 2))
            y = nd.transpose(y, (0, 3, 1, 2))
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            print(y.size)
            n += y.size
        print('epoch %d, loss %f' % (ep + 1, train_l_sum / n))
        net.save_parameters("srcnn.params")
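Example 8 is a fragment: the listing starts inside a with-block that has already opened an HDF5 file as hf, and it relies on a data_iter helper that yields (X, y) minibatches from the loaded NDArrays. A hypothetical minimal data_iter consistent with that call signature:

# Hypothetical sketch of the data_iter helper assumed in Example 8:
# shuffle the example indices and yield (features, labels) minibatches.
import random
from mxnet import nd

def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        j = nd.array(indices[i: min(i + batch_size, num_examples)])
        yield features.take(j), labels.take(j)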
Example 9
def train(train_set,
          langs,
          embedding_size=EMBEDDING_SIZE,
          learning_rate=LR,
          batch_size=BATCH_SIZE,
          get_loss=GET_LOSS,
          grad_clip=GRAD_CLIP,
          encoder_style=ENCODER_STYLE,
          decoder_style=DECODER_STYLE,
          to_copy=TOCOPY,
          epoch_time=EPOCH_TIME,
          layer_depth=LAYER_DEPTH,
          max_length=MAX_LENGTH,
          max_sentence=MAX_SENTENCES,
          save_model=SAVE_MODEL,
          output_file=OUTPUT_FILE,
          iter_num=iterNum,
          pretrain=PRETRAIN):

    start = time.time()

    emb = docEmbedding(langs['rt'].n_words, langs['re'].n_words,
                       langs['rm'].n_words, embedding_size)
    emb.init_weights()

    encoder_args = {
        "hidden_size": embedding_size,
        "local_embed": emb,
        "n_layers": layer_depth
    }
    encoder = HierarchicalRNN(**encoder_args)

    if decoder_style == 'HierarchicalRNN':
        decoder = HierarchicalDecoder(embedding_size,
                                      langs['summary'].n_words,
                                      n_layers=layer_depth,
                                      copy=to_copy)
        train_func = Hierarchical_seq_train
    else:
        decoder = AttnDecoderRNN(embedding_size,
                                 langs['summary'].n_words,
                                 n_layers=layer_depth,
                                 copy=to_copy)
        train_func = Plain_seq_train

    if use_cuda:
        emb.cuda()
        encoder.cuda()
        decoder.cuda()

    loss_optimizer = optim.Adagrad(list(encoder.parameters()) +
                                   list(decoder.parameters()),
                                   lr=learning_rate,
                                   lr_decay=0,
                                   weight_decay=0)

    use_model = None
    if pretrain is not None and iter_num is not None:
        use_model = [
            './models/' + pretrain + '_' + s + '_' + str(iter_num)
            for s in ['encoder', 'decoder', 'optim']
        ]

    if use_model is not None:
        encoder = load_model(encoder, use_model[0])
        decoder = load_model(decoder, use_model[1])
        loss_optimizer.load_state_dict(torch.load(use_model[2]))
        print("Load Pretrain Model {}".format(use_model))
    else:
        print("Not use Pretrain Model")

    criterion = nn.NLLLoss()

    model = Seq2Seq(encoder, decoder, train_func, criterion, embedding_size,
                    langs)

    total_loss = 0
    iteration = 0
    for epo in range(1, epoch_time + 1):
        print("Epoch #%d" % (epo))

        train_iter = data_iter(train_set, batch_size=batch_size)
        for dt in train_iter:
            iteration += 1
            data, idx_data = get_batch(dt)
            rt, re, rm, summary = idx_data

            rt = addpaddings(rt)
            re = addpaddings(re)
            rm = addpaddings(rm)

            if decoder_style == 'HierarchicalRNN' and batch_size != 1:
                summary = add_sentence_paddings(summary)
            else:
                summary = addpaddings(summary)

            rt = Variable(torch.LongTensor(rt), requires_grad=False)
            re = Variable(torch.LongTensor(re), requires_grad=False)
            rm = Variable(torch.LongTensor(rm), requires_grad=False)

            summary = Variable(torch.LongTensor(summary), requires_grad=False)

            if use_cuda:
                rt, re, rm = rt.cuda(), re.cuda(), rm.cuda()
                summary = summary.cuda()

            loss_optimizer.zero_grad()
            model.train()

            loss = sequenceloss(rt, re, rm, summary, model)

            loss.backward()
            torch.nn.utils.clip_grad_norm(
                list(model.encoder.parameters()) +
                list(model.decoder.parameters()), grad_clip)
            loss_optimizer.step()

            target_length = summary.size()[1]
            if float(torch.__version__[:3]) > 0.3:
                total_loss += loss.item() / target_length
            else:
                total_loss += loss.data[0] / target_length

            if iteration % get_loss == 0:
                print("Time {}, iter {}, Seq_len:{}, avg loss = {:.4f}".format(
                    gettime(start), iteration, target_length,
                    total_loss / get_loss))
                total_loss = 0

        if epo % save_model == 0:
            torch.save(encoder.state_dict(),
                       "models/{}_encoder_{}".format(output_file, iteration))
            torch.save(decoder.state_dict(),
                       "models/{}_decoder_{}".format(output_file, iteration))
            torch.save(loss_optimizer.state_dict(),
                       "models/{}_optim_{}".format(output_file, iteration))
            print("Save the model at iter {}".format(iteration))

    return model.encoder, model.decoder
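The pretrain and iter_num arguments in Example 9 reconstruct exactly the paths written by the saving block at the end of each epoch (models/<output_file>_encoder_<iteration>, _decoder_, and _optim_), so a later run can resume from them. A hedged usage sketch with made-up prefix and iteration values:

# Usage sketch with hypothetical values: resume from checkpoints written
# by an earlier run whose output_file prefix was 'run1' and which last
# saved at iteration 500 (i.e. models/run1_encoder_500 and friends).
encoder, decoder = train(train_set, langs,
                         output_file='run2',
                         pretrain='run1',
                         iter_num=500)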