Example No. 1
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
  start = time.time()
  plot_losses = []
  print_loss_total = 0 # Reset every print_every
  plot_loss_total = 0 # Reset every plot_every

  encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
  training_pairs = [util.tensorsFromPair(random.choice(pairs), input_lang, output_lang) \
    for i in range(n_iters)]
  criterion = nn.NLLLoss()

  for iter in range(1, n_iters + 1):
    training_pair = training_pairs[iter - 1]
    input_tensor = training_pair[0]
    target_tensor = training_pair[1]

    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
    print_loss_total += loss
    plot_loss_total += loss

    if iter % print_every == 0:
      print_loss_avg = print_loss_total / print_every
      print_loss_total = 0
      print('%s (%d %d%%) %.4f' % (util.timeSince(start, iter / n_iters),
        iter, iter / n_iters * 100, print_loss_avg))

    if iter % plot_every == 0:
      plot_loss_avg = plot_loss_total / plot_every
      plot_losses.append(plot_loss_avg)
      plot_loss_total = 0

  showPlot(plot_losses)
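
Every example on this page reports progress with a timeSince helper that is not shown. A minimal sketch of such a helper, following the convention of the PyTorch seq2seq tutorial these loops are modeled on, is given below; the actual util.timeSince used above may differ in its details.

import math
import time

def asMinutes(s):
    # Format a duration in seconds as "Xm Ys".
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    # Elapsed time so far, plus an estimate of the time remaining,
    # assuming `percent` is the fraction of work already completed.
    now = time.time()
    s = now - since
    es = s / percent
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
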
Example No. 2
def train(dataset : SequenceSequenceDataset, hidden_size : int,
          learning_rate : float, num_encoder_layers : int,
          num_decoder_layers : int, max_length : int, num_epochs : int, batch_size : int,
          print_every : int, context_vocab_size : int, tactic_vocab_size : int) -> Iterable[Checkpoint]:
    print("Initializing PyTorch...")
    in_stream = [inputFromSentence(datum[0], max_length) for datum in dataset]
    out_stream = [inputFromSentence(datum[1], max_length) for datum in dataset]
    data_loader = data.DataLoader(data.TensorDataset(torch.LongTensor(out_stream),
                                                     torch.LongTensor(in_stream)),
                                  batch_size=batch_size, num_workers=0,
                                  shuffle=True, pin_memory=True,
                                  drop_last=True)

    encoder = EncoderRNN(context_vocab_size, hidden_size, num_encoder_layers,
                         batch_size=batch_size)
    decoder = DecoderRNN(hidden_size, tactic_vocab_size, num_decoder_layers,
                         batch_size=batch_size)
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    optimizers = [encoder_optimizer, decoder_optimizer]
    criterion = maybe_cuda(nn.NLLLoss())

    start = time.time()
    num_items = len(dataset) * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        adjustLearningRates(learning_rate, optimizers, epoch)
        for batch_num, (output_batch, input_batch) in enumerate(data_loader):
            target_length = output_batch.size()[1]

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            predictor_output = decoder.run_teach(encoder
                                                 .run(cast(SomeLongTensor, input_batch)),
                                                 cast(SomeLongTensor, output_batch))
            loss = maybe_cuda(Variable(torch.zeros(1)))  # float accumulator; an integer LongTensor cannot accumulate the float NLLLoss values
            output_var = maybe_cuda(Variable(output_batch))
            for i in range(target_length):
                loss += criterion(predictor_output[i], output_var[:,i])
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            total_loss += (loss.item() / target_length) * batch_size

            if (batch_num + 1) % print_every == 0:
                items_processed = (batch_num + 1) * batch_size + epoch * len(dataset)
                progress = items_processed / num_items
                print("{} ({} {:.2f}%) {:.4f}".
                      format(timeSince(start, progress),
                             items_processed, progress * 100,
                             total_loss / items_processed))

        yield encoder.state_dict(), decoder.state_dict()
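
The function above is a generator: it yields one (encoder_state, decoder_state) pair per epoch, so the caller is expected to drain it and persist each checkpoint. A hypothetical driver is sketched below; the hyperparameter values, the dataset and vocabulary-size variables, and the checkpoint file name are placeholders, not part of the original code.

# Hypothetical consumer of the checkpoint generator above.
for epoch, (encoder_state, decoder_state) in enumerate(
        train(dataset, hidden_size=256, learning_rate=0.4,
              num_encoder_layers=3, num_decoder_layers=3,
              max_length=100, num_epochs=20, batch_size=256,
              print_every=10,
              context_vocab_size=context_vocab_size,
              tactic_vocab_size=tactic_vocab_size)):
    # Save both halves of the model after every epoch.
    torch.save({"encoder": encoder_state, "decoder": decoder_state},
               "checkpoint-epoch{}.pt".format(epoch))
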
Example No. 3
def trainIters(epoch,
               pairs,
               lang,
               encoder,
               decoder,
               print_every=100,
               plot_every=200,
               learning_rate=0.001):
    start = time.time()
    print("Starting training")
    plot_losses = []
    num = len(pairs)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epo in range(epoch):
        print("epoch: ", epo + 1)
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        for iter in range(1, len(pairs) + 1):
            training_pair = variablesFromPair(pairs[iter - 1], lang)
            input_variable = training_pair[0]
            target_variable = training_pair[1]
            loss = train(input_variable, target_variable, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion)
            print("loss: ", loss.numpy())
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (timeSince(start, iter / num), iter, iter / num * 100,
                       print_loss_avg))
                evaluateRandomly(lang, encoder, decoder, 1)

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    showPlot(plot_losses)
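
Examples No. 1, 3, and 6 all delegate the forward/backward pass for a single sentence pair to a train function that is not shown here. A minimal sketch of such a step in the style of the PyTorch seq2seq tutorial (teacher forcing, no attention) follows; the SOS_token constant and the encoder/decoder call signatures are assumptions, and the real train used by these examples may differ, for instance by using an attention decoder.

import torch

SOS_token = 0  # assumed start-of-sequence index

def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    loss = 0

    # Encode the input sequence one token at a time.
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                 encoder_hidden)

    # Decode with teacher forcing: feed the ground-truth token at each step.
    decoder_input = torch.tensor([[SOS_token]])
    decoder_hidden = encoder_hidden
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input,
                                                 decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length
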
Example No. 4
    def checkpoints(self, inputs : List[List[float]], outputs : List[int]) \
        -> Iterable[NeuralPredictorState]:
        print("Building tensors")
        dataloader = data.DataLoader(data.TensorDataset(
            torch.FloatTensor(inputs), torch.LongTensor(outputs)),
                                     batch_size=self.batch_size,
                                     num_workers=0,
                                     shuffle=True,
                                     pin_memory=True,
                                     drop_last=True)
        num_batches = int(len(inputs) / self.batch_size)
        dataset_size = num_batches * self.batch_size

        print("Initializing model...")
        training_start = time.time()
        for epoch in range(1, self.num_epochs + 1):
            self.adjuster.step()
            print("Epoch {} (learning rate {:.6f})".format(
                epoch, self._optimizer.param_groups[0]['lr']))
            epoch_loss = 0.
            for batch_num, data_batch in enumerate(dataloader, start=1):
                self._optimizer.zero_grad()
                input_batch, output_batch = data_batch
                # with autograd.detect_anomaly():
                predictionDistribution = self._model(input_batch)
                output_var = maybe_cuda(Variable(output_batch))
                loss = self._criterion(predictionDistribution, output_var)
                loss.backward()
                self._optimizer.step()

                epoch_loss += loss.item()
                if batch_num % self.print_every == 0:
                    items_processed = batch_num * self.batch_size + \
                        (epoch - 1) * dataset_size
                    progress = items_processed / (dataset_size *
                                                  self.num_epochs)
                    print("{} ({:7} {:5.2f}%) {:.4f}".format(
                        timeSince(training_start, progress), items_processed,
                        progress * 100, epoch_loss / batch_num))
            state = self._model.state_dict()
            loss = epoch_loss / num_batches
            checkpoint = NeuralPredictorState(epoch, loss, state)
            yield checkpoint
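
This checkpoints method depends on attributes initialized elsewhere on the object: self._model, self._criterion, self._optimizer, self.adjuster, and the batching and logging hyperparameters. A minimal constructor that would make the method runnable might look like the sketch below; the class name, the feed-forward architecture, and the argument names are assumptions for illustration only, with torch.nn as nn and torch.optim as optim assumed imported as in the other examples and maybe_cuda being the same helper used above.

class NeuralClassifier:
    def __init__(self, num_features, hidden_size, num_classes,
                 learning_rate, epoch_step, gamma,
                 batch_size, num_epochs, print_every):
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.print_every = print_every
        # A simple feed-forward classifier emitting log-probabilities,
        # to match the NLLLoss criterion.
        self._model = maybe_cuda(nn.Sequential(
            nn.Linear(num_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
            nn.LogSoftmax(dim=1)))
        self._criterion = maybe_cuda(nn.NLLLoss())
        self._optimizer = optim.SGD(self._model.parameters(),
                                    lr=learning_rate)
        # Decay the learning rate by gamma every epoch_step epochs.
        self.adjuster = optim.lr_scheduler.StepLR(self._optimizer,
                                                  epoch_step, gamma)
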
Example No. 5
def train_vectors(data_path, model_file):
    """
    Trains a doc2vec model from character sequences split into 3-letter
    words, then saves it to a file.
    """
    start = time.time()
    print("getting data")
    ids, sentences_ls = dp.get_paragraphs(data_path)
    print("got processed data")
    tagged_data = [
        TaggedDocument(words=dp.preprocess_str_hp(_d).split(), tags=[str(i)])
        for i, _d in sentences_ls
    ]
    print("made ", len(sentences_ls), " sentences")
    embedding = build_model(tagged_data)
    print("made model in ", timeSince(start))
    embedding.save(model_file)
    return ids, embedding
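
build_model is not defined in this example. A plausible sketch using gensim's Doc2Vec API is given below; the vector size, minimum count, and epoch count are placeholder values.

from gensim.models.doc2vec import Doc2Vec

def build_model(tagged_data, vector_size=100, epochs=20):
    # Train a Doc2Vec embedding over the tagged documents.
    model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    return model
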
Example No. 6
def trainIters(encoder, decoder, pairs, input_lang, output_lang, config):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(),
                                  lr=config.learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(),
                                  lr=config.learning_rate)
    training_pairs = [
        variablesFromPair(random.choice(pairs), input_lang, output_lang,
                          config) for i in range(config.n_iters)
    ]
    criterion = nn.NLLLoss()

    for iter in range(1, config.n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        loss = train(input_variable, target_variable, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion, config)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % config.print_every == 0:
            print_loss_avg = print_loss_total / config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (timeSince(start, iter / config.n_iters), iter,
                   iter / config.n_iters * 100, print_loss_avg))

        if iter % config.plot_every == 0:
            plot_loss_avg = plot_loss_total / config.plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
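
Examples No. 1, 3, and 6 finish by handing the collected loss averages to showPlot. A minimal sketch of that helper, again in the spirit of the PyTorch tutorial, is shown below; the tick spacing is an arbitrary choice.

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def showPlot(points):
    # Plot the averaged losses with evenly spaced y-axis ticks
    # so that curves from different runs are easy to compare.
    fig, ax = plt.subplots()
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    plt.show()
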
Example No. 7
        # Generate a random episode

        #refactor training line
        #t = time.time()
        train_loss = train_batched_step(samples, model)  #TODO
        #print("optim time:", time.time() - t)
        avg_train_loss += train_loss
        counter += 1

        if episode == 1 or episode % args.print_freq == 0 or episode == model.num_pretrain_episodes:
            val_loss = eval_ll(val_states, model)
            print(
                '{:s} ({:d} {:.0f}% finished) TrainLoss: {:.4f}, ValLoss: {:.4f}'
                .format(
                    timeSince(
                        start,
                        float(episode) / float(model.num_pretrain_episodes)),
                    episode,
                    float(episode) / float(model.num_pretrain_episodes) * 100.,
                    avg_train_loss / counter, val_loss),
                flush=True)
            avg_train_loss = 0.
            counter = 0

            print('gen sample stats', batchtime.items())
            batchtime['max'] = 0
            batchtime['mean'] = 0
            batchtime['count'] = 0
            if episode % args.save_freq == 0 or episode == model.num_pretrain_episodes:
                model.save(path)
            if episode % 10000 == 0 or episode == model.num_pretrain_episodes:
Example No. 8
def supervised_q(args: argparse.Namespace) -> None:
    replay_memory = []
    with open(args.tmp_file, 'r') as f:
        for idx, line in enumerate(tqdm(f, desc="Loading data")):
            replay_memory.append(LabeledTransition.from_dict(json.loads(line)))
    if args.max_tuples is not None:
        replay_memory = replay_memory[-args.max_tuples:]

    # Load the predictor
    predictor = cast(
        features_polyarg_predictor.FeaturesPolyargPredictor,
        predict_tactic.loadPredictorByFile(args.predictor_weights))

    q_estimator: QEstimator
    # Create an initial Q Estimator
    if args.estimator == "polyarg":
        q_estimator = PolyargQEstimator(args.learning_rate, args.epoch_step,
                                        args.gamma, predictor)
    else:
        q_estimator = FeaturesQEstimator(args.learning_rate, args.epoch_step,
                                         args.gamma)
    if args.start_from:
        q_estimator_name, *saved = \
          torch.load(args.start_from)
        if args.estimator == "polyarg":
            assert q_estimator_name == "polyarg evaluator", \
                q_estimator_name
        else:
            assert q_estimator_name == "features evaluator", \
                q_estimator_name
        q_estimator.load_saved_state(*saved)

    training_start = time.time()
    training_samples = assign_scores(args,
                                     q_estimator,
                                     predictor,
                                     replay_memory,
                                     progress=True)
    input_tensors = q_estimator.get_input_tensors(training_samples)
    rescore_lr = args.learning_rate

    for epoch in range(1, args.num_epochs + 1):
        scores = torch.FloatTensor(
            [score for _, _, _, score in training_samples])
        batches: Sequence[Sequence[torch.Tensor]] = data.DataLoader(
            data.TensorDataset(*(input_tensors + [scores])),
            batch_size=args.batch_size,
            num_workers=0,
            shuffle=True,
            pin_memory=True,
            drop_last=True)

        epoch_loss = 0.
        eprint("Epoch {}: Learning rate {:.12f}".format(
            epoch, q_estimator.optimizer.param_groups[0]['lr']),
               guard=args.show_loss)
        for idx, batch in enumerate(batches, start=1):
            q_estimator.optimizer.zero_grad()
            word_features_batch, vec_features_batch, \
                expected_outputs_batch = batch
            outputs = q_estimator.model(word_features_batch,
                                        vec_features_batch)
            loss = q_estimator.criterion(outputs,
                                         maybe_cuda(expected_outputs_batch))
            loss.backward()
            q_estimator.optimizer.step()
            q_estimator.total_batches += 1
            epoch_loss += loss.item()
            if idx % args.print_every == 0:
                items_processed = idx * args.batch_size + \
                    (epoch - 1) * len(replay_memory)
                progress = items_processed / (len(replay_memory) *
                                              args.num_epochs)
                eprint("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(training_start, progress), items_processed,
                    progress * 100, epoch_loss * (len(batches) / idx)),
                       guard=args.show_loss)
        q_estimator.adjuster.step()

        q_estimator.save_weights(args.out_weights, args)
        if epoch % args.score_every == 0 and epoch < args.num_epochs:
            training_samples = assign_scores(args,
                                             q_estimator,
                                             predictor,
                                             replay_memory,
                                             progress=True)
            rescore_lr *= args.rescore_gamma
            q_estimator.optimizer.param_groups[0]['lr'] = rescore_lr

        pass

    pass
Example No. 9
    def train(self,
              triplets,
              n_iters,
              d_steps,
              d_optimizer,
              g_steps,
              g_optimizer,
              batch_size,
              max_len,
              criterion,
              word2index,
              index2word,
              embeddings_index,
              embeddings_size,
              print_every,
              plot_every,
              checkpoint_every,
              to_file=False,
              loss_f=None,
              sample_out_f=None,
              path_to_exp_out=None):
        # criterion is for both G and D

        # record start time for logging
        begin_time = time.time()
        print_d_loss_total = 0  # Reset every print_every
        plot_d_loss_total = 0  # Reset every plot_every
        print_g_loss_total = 0  # Reset every print_every
        plot_g_loss_total = 0  # Reset every plot_every
        plot_d_loss_avgs = []
        plot_g_loss_avgs = []

        for iter in range(1, n_iters + 1):

            # train D
            for d_train_idx in range(d_steps):
                # 1. Train D on real+fake
                self.D.zero_grad()

                #  1A: Train D on real
                #       get data
                #       prepare batch
                training_batch, seq_lens = get_random_batch(
                    triplets, batch_size)
                #       concat the context_ans batch with the question batch
                #       each element in the training batch is context + question + answer
                cqa_batch, _, cqa_lens = prepare_batch_var(training_batch,
                                                           seq_lens,
                                                           batch_size,
                                                           word2index,
                                                           embeddings_index,
                                                           embeddings_size,
                                                           mode=['word'],
                                                           concat_opt='cqa')

                train_input = Variable(cqa_batch[0].cuda(
                )) if use_cuda else Variable(
                    cqa_batch[0]
                )  # embeddings vectors, size = [seq len x batch size x embedding dim]

                d_real_decision = self.D.forward(train_input, cqa_lens[0])
                real_target = Variable(torch.FloatTensor([1]*batch_size)).cuda() if use_cuda else \
                    Variable(torch.FloatTensor([1]*batch_size))
                d_real_error = criterion(d_real_decision,
                                         real_target)  # ones = true
                d_real_error.backward(
                )  # compute/store gradients, but don't change params

                #  1B: Train D on fake
                fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(
                    self.G,
                    training_batch,
                    max_len,
                    batch_size,
                    word2index,
                    index2word,
                    embeddings_index,
                    embeddings_size,
                    mode=('word'))

                # # sanity check: prepare fake batch and prepare batch have the same order
                # print(fake_cqa_batch[0][12] == cqa_batch[0][12])

                d_fake_data = Variable(
                    fake_cqa_batch[0].cuda()) if use_cuda else Variable(
                        fake_cqa_batch[0])
                d_fake_decision = self.D.forward(d_fake_data, fake_cqa_lens[0])
                fake_target = Variable(torch.FloatTensor([0]*batch_size)).cuda() if use_cuda else \
                    Variable(torch.FloatTensor([0]*batch_size))
                # d_fake_error = criterion(d_fake_decision, fake_target)  # zeros = fake
                # d_fake_error.backward()
                # d_optimizer.step()

                # accumulate loss
                # FIXME I don't think the implementation below works for the batch version
                d_error = torch.mean(d_fake_decision) - torch.mean(
                    d_real_decision)  # W_GAN loss
                # d_error = -torch.mean(self.log(1 - d_fake_decision)) - torch.mean(self.log(d_real_decision)) # GAN loss
                d_error.backward()
                d_optimizer.step()

                # d_error = d_real_error + d_fake_error

            # train G
            for g_train_idx in range(g_steps):
                self.G.zero_grad()

                # conditional data for generator
                training_batch, seq_lens = get_random_batch(
                    triplets, batch_size)
                fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(
                    self.G,
                    training_batch,
                    max_len,
                    batch_size,
                    word2index,
                    index2word,
                    embeddings_index,
                    embeddings_size,
                    mode=('word'),
                    detach=False)
                g_fake_data = Variable(
                    fake_cqa_batch[0].cuda()) if use_cuda else Variable(
                        fake_cqa_batch[0])
                dg_fake_decision = self.D.forward(g_fake_data,
                                                  fake_cqa_lens[0])
                target = Variable(torch.FloatTensor([1]*batch_size).cuda()) if use_cuda else \
                    Variable(torch.FloatTensor([1]*batch_size))
                # g_error = criterion(dg_fake_decision, target)
                g_error = -torch.mean(dg_fake_decision)  # wgan loss
                # G_error = -torch.mean(self.log(dg_fake_decision)) # gan loss
                g_error.backward()
                g_optimizer.step()  # Only optimizes G's parameters

            # log error
            print_d_loss_total += d_error.data[0]
            print_g_loss_total += g_error.data[0]
            plot_d_loss_total += d_error.data[0]
            plot_g_loss_total += g_error.data[0]
            if iter % print_every == 0:
                print_d_loss_avg = print_d_loss_total / print_every
                print_g_loss_avg = print_g_loss_total / print_every
                print_d_loss_total = 0
                print_g_loss_total = 0

                if not to_file:
                    print('%s (%d %d%%)' %
                          (timeSince(begin_time, iter / float(n_iters)), iter,
                           iter / n_iters * 100))
                    # print("errors: D: real-%s/fake-%s G: %s " % ( d_real_error.data[0], d_fake_error.data[0], g_error.data[0]) )
                    print("errors: D: %s G: %s " %
                          (print_d_loss_avg, print_g_loss_avg))
                    print('---sample generated question---')
                    # sample a triple and print the generated question
                    evaluate(self.G, triplets, embeddings_index,
                             embeddings_size, word2index, index2word, max_len)
                else:
                    sample_out_f.write(
                        unicode('%s (%d %d%%)\n' %
                                (timeSince(begin_time, iter / float(n_iters)),
                                 iter, float(iter) / float(n_iters) * 100)))
                    evaluate(self.G, triplets, embeddings_index,
                             embeddings_size, word2index, index2word, max_len,
                             to_file, sample_out_f)
                    sample_out_f.write(unicode('\n'))

            if iter % plot_every == 0:
                plot_d_loss_avg = plot_d_loss_total / plot_every
                plot_d_loss_avgs.append(plot_d_loss_avg)
                plot_g_loss_avg = plot_g_loss_total / plot_every
                plot_g_loss_avgs.append(plot_g_loss_avg)
                plot_d_loss_total = 0
                plot_g_loss_total = 0

                if to_file:
                    loss_f.write(
                        unicode('%s (%d %d%%)\n' %
                                (timeSince(begin_time, iter / float(n_iters)),
                                 iter, float(iter) / float(n_iters) * 100)))
                    loss_f.write(
                        unicode("errors: D: %s G: %s " %
                                (print_d_loss_avg, print_g_loss_avg)))
                    loss_f.write(unicode('\n'))

            if (iter % checkpoint_every == 0) or (iter == n_iters):
                checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar'
                state = {
                    'iteration': iter + 1,
                    'd_state_dict': self.D.state_dict(),
                    'g_state_dict': self.G.state_dict(),
                    'd_optimizer': d_optimizer.state_dict(),
                    'g_optimizer': g_optimizer.state_dict(),
                }
                torch.save(state, path_to_exp_out + '/' + checkpoint_fname)
                plotLoss(plot_d_loss_avgs,
                         plot_every,
                         save_path=path_to_exp_out,
                         f_name='d_loss_itr_' + str(iter) + '.png',
                         title='training loss D (monitoring purpose)',
                         from_file=False)
                plotLoss(plot_g_loss_avgs,
                         plot_every,
                         save_path=path_to_exp_out,
                         f_name='g_loss_itr_' + str(iter) + '.png',
                         title='training loss G (monitoring purpose)',
                         from_file=False)
Example No. 10
def train(dataset : List[Sentence],
          token_vocab_size : int, max_length : int, hidden_size : int,
          learning_rate : float, epoch_step : int, gamma : float,
          num_encoder_layers : int, num_decoder_layers : int,
          num_epochs : int, batch_size : int, print_every : int,
          optimizer_f : Callable[..., Optimizer]) \
          -> Iterable[Checkpoint]:
    curtime = time.time()
    print("Building pytorch dataset...", end="")
    sys.stdout.flush()
    data_loader = data.DataLoader(data.TensorDataset(
        torch.LongTensor(dataset[:]), torch.LongTensor(dataset[:])),
                                  batch_size=batch_size,
                                  num_workers=0,
                                  shuffle=True,
                                  pin_memory=True,
                                  drop_last=True)
    print(" {:.2f}s".format(time.time() - curtime))

    curtime = time.time()
    print("Initializing model...", end="")
    sys.stdout.flush()
    encoder = maybe_cuda(
        EncoderRNN(token_vocab_size,
                   hidden_size,
                   num_encoder_layers,
                   batch_size=batch_size))
    decoder = maybe_cuda(
        DecoderRNN(hidden_size,
                   token_vocab_size,
                   num_decoder_layers,
                   batch_size=batch_size))
    encoder_optimizer = optimizer_f(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optimizer_f(decoder.parameters(), lr=learning_rate)
    encoder_adjuster = scheduler.StepLR(encoder_optimizer, epoch_step, gamma)
    decoder_adjuster = scheduler.StepLR(decoder_optimizer, epoch_step, gamma)
    criterion = maybe_cuda(nn.NLLLoss())
    print(" {:.2f}s".format(time.time() - curtime))

    start = time.time()
    num_items = len(dataset) * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        # Adjust learning rates if needed
        encoder_adjuster.step()
        decoder_adjuster.step()

        # Process batches of data
        for batch_num, (input_batch, output_batch) in enumerate(data_loader):
            # Reset the optimizers
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Run the autoencoder
            decoded_output = \
                decoder.run_teach(
                    encoder.run(cast(torch.LongTensor, input_batch)),
                    cast(torch.LongTensor, output_batch))

            # Gather the losses
            loss = maybe_cuda(Variable(torch.zeros(1, dtype=torch.float32)))
            output_var = maybe_cuda(Variable(output_batch))
            target_length = output_batch.size()[1]
            for i in range(target_length):
                loss += criterion(decoded_output[i], output_var[:, i])
            total_loss += (loss.data.item() / target_length) * batch_size
            assert total_loss == total_loss  # NaN check: NaN is the only value not equal to itself
            assert isinstance(total_loss, float)

            # Update the weights
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Print status every once in a while
            if (batch_num + 1) % print_every == 0:
                items_processed = (batch_num +
                                   1) * batch_size + epoch * len(dataset)
                progress = items_processed / num_items
                print("{} ({} {:.2f}%) {:.4f}".format(
                    timeSince(start, progress), items_processed,
                    progress * 100, total_loss / items_processed))

        yield Checkpoint(encoder_state=encoder.state_dict(),
                         decoder_state=decoder.state_dict(),
                         training_loss=total_loss)
    pass
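
Examples No. 2 and 10 are annotated as returning Iterable[Checkpoint]. Based on the fields constructed at the end of Example No. 10, a Checkpoint definition along the following lines is assumed; the exact container (NamedTuple, dataclass, or plain tuple) may differ in the original project.

from typing import Any, Dict, NamedTuple

class Checkpoint(NamedTuple):
    # State dicts for the two halves of the seq2seq autoencoder,
    # plus the running training loss at the end of the epoch.
    encoder_state: Dict[str, Any]
    decoder_state: Dict[str, Any]
    training_loss: float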