        Timer['total'] = Tfinal - T1
        # print(Timer)

        # Periodically print one sampled document / summary pair with its per-scorer
        # scores and save the log plot.
        if time.time() - time_save > args.save_every:
            print("==========================================")
            print(bodies[0])
            print("-----------")
            print(sampled_summaries[0])
            print("-----------")
            print("Total score:", total_sampled_scores[0].item())
            for scorer in scorers:
                print(scorer['name'] + " score:",
                      scores_track[scorer['name'] + "_scores"][0].item())
            print("-----------")

            logplot.save(printing=True)
            # print(Timer)

            time_save = time.time()
            print("==========================================")

        if ckpt_every > 0 and len(total_score_history) > ckpt_lookback:
            current_score = np.mean(total_score_history[-ckpt_lookback:])

            if time.time() - time_ckpt > ckpt_every:
                # Revert if the recent average score is more than 20% worse than the
                # best checkpoint score; min() handles best_ckpt_score being either
                # negative or positive.
                revert_ckpt = best_ckpt_score is not None and current_score < min(
                    1.2 * best_ckpt_score, 0.8 * best_ckpt_score)
                print("================================== CKPT TIME, " +
                      str(datetime.now()) +
                      " =================================")

        # Forward pass (truncated in the source): a BERT pretraining model returning
        # the combined loss, the masked-LM logits and the next-sentence-prediction
        # logits. The name input_ids is an assumption; only lm_label_ids and is_next
        # appear elsewhere in this snippet.
        loss, mlm_logits, is_next_logits = model(input_ids,
                                                 masked_lm_labels=lm_label_ids,
                                                 next_sentence_label=is_next)

        loss.backward()
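        # Next-sentence-prediction accuracy: fraction of examples in the batch whose
        # argmax over the NSP logits matches the is_next label.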
        is_next_acc = is_next.eq(torch.argmax(is_next_logits,
                                              dim=1)).float().mean().item()

        # Masked-LM accuracy, computed only over the masked positions (labels of -1
        # mark unmasked positions and are never matched by the argmax, so they
        # contribute 0 to the numerator).
        num_predicts = lm_label_ids.ne(-1).sum().item()
        mlm_acc = (lm_label_ids.view(-1).eq(
            torch.argmax(mlm_logits, dim=2).view(-1)).float().sum() /
                   num_predicts).item()

        if ib % args.optim_every == 0:
            # Gradient accumulation: only apply a weight update every optim_every batches.
            optimizer.step()
            scheduler.step()  # Update learning rate schedule after the optimizer step
            optimizer.zero_grad()
            torch.cuda.empty_cache()

        summ.cache(
            {
                "loss": loss.item(),
                "mlm_acc": mlm_acc,
                "is_next_acc": is_next_acc
            },
            prefix="T_")
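        # Roughly once a minute, flush the cached metrics and save the model weights.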
        if time.time() - time_save > 60.0:
            summ.save(printing=True)
            time_save = time.time()
            torch.save(
                model.state_dict(), "/home/phillab/models/news_bert_bs" +
                str(args.optim_every * args.train_batch_size) + ".bin")