def test_upsample():
    data_loaders = get_data_loaders('../data/ljspeech', -1)
    for phase, data_loader in data_loaders.items():
        train = (phase == "train")
        running_loss = 0.
        test_evaluated = False
        for step, (x, y, c, g, input_lengths) in enumerate(data_loader):
            c = c.unsqueeze(1)  # add a channel axis for the 2D upsampling conv
            upconv1 = UpSampleConv()
            c1 = upconv1(c)
            break  # one batch per phase is enough for this smoke test
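
UpSampleConv is defined elsewhere in this project; as a rough, self-contained sketch of what such conditioning upsampling does, assuming the mel features are stretched along the time axis with a transposed convolution (the 16x factor and layer shape below are assumptions):

import torch
import torch.nn as nn

# Sketch: upsample mel conditioning features 16x along time with a single
# ConvTranspose2d stage (real vocoders typically stack several such stages).
upconv = nn.ConvTranspose2d(1, 1, kernel_size=(3, 32), stride=(1, 16),
                            padding=(1, 8))
c = torch.randn(2, 1, 80, 100)    # (batch, channel, mel bins, frames)
c_up = upconv(c)
print(c.shape, '->', c_up.shape)  # time axis: 100 -> 1600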
Example #2
                image_patches['orig'] = cv2.resize(image, self.patch_size)

                img_probas = classify_image(model, image_patches)

                name_arch = re.search('b[0-9]', model_arch).group(0)
                image_preds_to_dataFrame(img_probas, name_arch, image_name)

        self.data_frame.to_csv(
            '/Users/eugeneolkhovik/python_files/ML/melanoma/derma_classifier/meta_study/ensemble_pred.csv'
        )


if __name__ == "__main__":
    cfg = ConfigTwoClasses()
    data_manager = DataManager(cfg)
    _, data_loader = get_data_loaders(cfg, )  # args
    patches_name = ['tl', 'tr', 'bl', 'br', 'center', 'orig']

    writer = PredictionWriter()
    writer.create_dataframe(patches_name)
    writer.run()
"""
### experiment -> models directories

all_dirs = os.walk(experiments_dir)
models_res = []
model_path = []
for i in next(all_dirs)[1]:  # immediate subdirectories only
    model_results_dir = os.path.join(experiments_dir, i)
    models_res.append(model_results_dir)
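
The patches_name list above ('tl', 'tr', 'bl', 'br', 'center', 'orig') suggests corner and center crops plus a resized original; a minimal sketch of such patch extraction (the cropping logic is an assumption, not DataManager's actual code):

import cv2
import numpy as np

def make_patches(image, size=(224, 224)):
    # Four corner crops, a central crop, and the resized whole image.
    h, w = image.shape[:2]
    ph, pw = size
    return {
        'tl': image[:ph, :pw],
        'tr': image[:ph, w - pw:],
        'bl': image[h - ph:, :pw],
        'br': image[h - ph:, w - pw:],
        'center': image[(h - ph) // 2:(h + ph) // 2,
                        (w - pw) // 2:(w + pw) // 2],
        'orig': cv2.resize(image, size),
    }

patches = make_patches(np.zeros((512, 512, 3), dtype=np.uint8))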
Example #3
    initial_value = None if initial_value is None else int(initial_value)
    file_name_suffix = args["--file-name-suffix"]
    output_html = args["--output-html"]

    # Override hyperparameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    from train import build_model, get_data_loaders
    from synthesis import wavegen

    # Data
    # Use exactly the same test set as in the training script;
    # disable shuffling for convenience.
    test_data_loader = get_data_loaders(data_root,
                                        speaker_id,
                                        test_shuffle=False)["test"]
    test_dataset = test_data_loader.dataset

    # Model
    model = build_model()

    # Load checkpoint
    print("Load checkpoint from {}".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

    os.makedirs(dst_dir, exist_ok=True)
    dst_dir_name = basename(os.path.normpath(dst_dir))
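
The checkpoint handling above follows the common PyTorch pattern of saving a dict with a "state_dict" key; a minimal self-contained round trip:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
torch.save({"state_dict": model.state_dict()}, "/tmp/ckpt.pth")

restored = nn.Linear(4, 2)
checkpoint = torch.load("/tmp/ckpt.pth")
restored.load_state_dict(checkpoint["state_dict"])  # weights now match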
Example #4
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyperparameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    # tee sys.stdout to an additional log file in checkpoint_dir
    tee = Tee(join(os.path.dirname(checkpoint_path), 'evaluate.stdout'))

    from train import build_model, get_data_loaders
    from synthesis import wavegen

    # Data
    # Use exactly the same test set as in the training script;
    # disable shuffling for convenience.
    test_data_loader = get_data_loaders(data_root, speaker_id, test_shuffle=False, phases=("test",))["test"]
    test_dataset = test_data_loader.dataset

    # Model
    model = build_model().to(device)

    # Load checkpoint
    print("Load checkpoint from {}".format(checkpoint_path))
    if use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

    os.makedirs(dst_dir, exist_ok=True)
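
Tee is a small project helper; a minimal sketch of the idea (mirror everything written to sys.stdout into an extra log file):

import sys

class Tee:
    """Duplicate sys.stdout writes into an additional log file (sketch)."""
    def __init__(self, path):
        self.file = open(path, "w")
        self.stdout = sys.stdout
        sys.stdout = self

    def write(self, data):
        self.stdout.write(data)
        self.file.write(data)

    def flush(self):
        self.stdout.flush()
        self.file.flush()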
Example #5
        "DEL",  # DELETION
        "INS",  # INSERTION
        "SUB",  # SUBTRACTION
        "W-PER",  # WHOLE WORD PERMUTATION
        "W-DEL",  # WHOLE WORD DELETION
        "W-INS",  # WHOLE WORD INSERTION
        "W-SUB"  # WHOLE WORD SUBTRACTION
    ]

    MAX_CHARS = 24
    BSZ = 8
    NT = random.choice(TYPES)
    ntype_chckp = CHECKPOINTS_PATH.joinpath(f"MUDE_{NT}")
    checkpoint_path = best_checkpoint_selector(ntype_chckp)

    test_ld = get_data_loaders(NT, BSZ, MAX_CHARS)[2]
    CHAR_VOCAB_SIZE = len(test_ld.dataset.vect.chars)
    TGT_VOCAB_SIZE = len(test_ld.dataset.vocab)

    DIM = 512
    DIM_FFT = int(DIM * 4)
    ATTN_HEADS = 8
    DEPTH = 2
    DIM_HIDDEN = 650
    DROPOUT_RATE = .01
    LR = 1e-4

    mude = MUDE(dim=DIM,
                characters_vocab_size=CHAR_VOCAB_SIZE,
                tokens_vocab_size=TGT_VOCAB_SIZE,
                encoder_depth=DEPTH,
Example #6
def train():
    parser = ArgumentParser()
    parser.add_argument("--train_path",
                        type=str,
                        default='data/yesands_train_iter4.json',
                        help="Set data path")
    parser.add_argument("--valid_path",
                        type=str,
                        default='data/yesands_valid.json',
                        help="Set data path")

    parser.add_argument("--correct_bias",
                        type=bool,
                        default=False,
                        help="Set to true to correct bias for Adam optimizer")
    parser.add_argument("--lr",
                        type=float,
                        default=2e-5,
                        help="Set learning rate")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=4,
                        help="Set number of epochs")
    parser.add_argument("--num_warmup_steps",
                        type=float,
                        default=1000,
                        help="Set number of warm-up steps")
    parser.add_argument("--num_total_steps",
                        type=float,
                        default=10000,
                        help="Set number of total steps")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--max_grad_norm",
                        type=float,
                        default=1.0,
                        help="Set maximum gradient normalization.")
    parser.add_argument(
        "--pretrained_path",
        type=str,
        default='roberta-base',
        help=
        "Choose which pretrained model to use (bert-base-uncased, roberta-base, roberta-large, roberta-large-mnli)"
    )
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Provide the batch size")
    parser.add_argument("--random_seed",
                        type=int,
                        default=42,
                        help="Set the random seed")
    parser.add_argument(
        "--test",
        action='store_true',
        help="If true, run with small dataset for testing code")
    parser.add_argument(
        "--base",
        action='store_true',
        help=
        "If true, run with base experiment configuration (training with spont only) for comparison"
    )

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger.info("Arguments: {}".format(pformat(args)))

    if 'roberta' in args.pretrained_path:
        # initialize tokenizer and model
        logger.info("Initialize model and tokenizer.")
        tokenizer = RobertaTokenizer.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')
        model = RobertaForSequenceClassification.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')

        ### START MODEL MODIFICATION
        # The pretrained model was not trained with token type ids, so rebuild
        # the token type embeddings for finetuning. Without this, the model
        # only accepts 0s as valid input for token_type_ids.
        model.config.type_vocab_size = 2
        model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(
            2, model.config.hidden_size)
        model.roberta.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=model.config.initializer_range)

        ### END MODEL MODIFICATION
    elif 'bert' in args.pretrained_path:
        model = BertForSequenceClassification.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')
        tokenizer = BertTokenizer.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')

    model.to(args.device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.lr,
                      correct_bias=args.correct_bias)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.num_warmup_steps,
                                     t_total=args.num_total_steps)

    logger.info("Prepare datasets")

    train_data = get_data(args.train_path)
    valid_data = get_data(args.valid_path)

    if args.test:
        train_data = train_data[:100]
        valid_data = valid_data[:100]

    logger.info("Loading train set...")
    train_loader, train_sampler = get_data_loaders(args, train_data,
                                                   args.train_path, tokenizer)

    logger.info("Loading validation set...")
    valid_loader, valid_sampler = get_data_loaders(args, valid_data,
                                                   args.valid_path, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        b_input_ids, b_input_mask, b_input_segment, b_labels = batch

        optimizer.zero_grad()
        #roberta has issues with token_type_ids
        loss, logits = model(b_input_ids,
                             token_type_ids=b_input_segment,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        optimizer.step()
        scheduler.step()

        return loss.item(), logits, b_labels

    trainer = Engine(update)

    val_result_f = Path('turn_val_prediction.txt').open('w')

    # Evaluation function and evaluator
    def inference(engine, batch):
        model.eval()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        b_input_ids, b_input_mask, b_input_segment, b_labels = batch

        with torch.no_grad():
            #roberta has issues with token_type_ids
            # loss, logits = model(b_input_ids, token_type_ids = None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = model(b_input_ids,
                                 token_type_ids=b_input_segment,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            label_ids = b_labels

        return logits, label_ids, loss.item()

    evaluator = Engine(inference)

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(valid_loader))

    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(Accuracy(output_transform=lambda x: (x[1], x[2]))).attach(
        trainer, "accuracy")
    if torch.cuda.is_available():
        GpuInfo().attach(trainer, name='gpu')

    recall = Recall(output_transform=lambda x: (x[0], x[1]))
    precision = Precision(output_transform=lambda x: (x[0], x[1]))
    F1 = (precision * recall * 2 / (precision + recall)).mean()
    accuracy = Accuracy(output_transform=lambda x: (x[0], x[1]))
    metrics = {
        "recall": recall,
        "precision": precision,
        "f1": F1,
        "accuracy": accuracy,
        "loss": Average(output_transform=lambda x: x[2])
    }

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss', 'accuracy'])
    pbar.attach(trainer, metric_names=['gpu:0 mem(%)', 'gpu:0 util(%)'])

    evaluator.add_event_handler(
        Events.COMPLETED, lambda _: pbar.log_message(
            "Validation metrics:\n %s" % pformat(evaluator.state.metrics)))

    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag="valid",
                                               metric_names=list(
                                                   metrics.keys()),
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # tb_logger.writer.log_dir -> tb_logger.writer.logdir (this is the correct attribute name as seen in: https://tensorboardx.readthedocs.io/en/latest/_modules/tensorboardX/writer.html#SummaryWriter)
    checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir,
                                         'checkpoint',
                                         save_interval=1,
                                         n_saved=5)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'mymodel': getattr(model, 'module', model)
         })  # "getattr" take care of distributed encapsulation

    torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
    getattr(model, 'module', model).config.to_json_file(
        os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
    tokenizer.save_vocabulary(tb_logger.writer.logdir)

    trainer.run(train_loader, max_epochs=args.n_epochs)

    if args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()

    val_result_f.close()
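
The parameter grouping in this example is the standard "no weight decay on bias/LayerNorm" recipe; a minimal self-contained sketch using torch's own AdamW (the example relies on the pytorch-transformers variant and its older 'weight_decay_rate' key):

import torch.nn as nn
from torch.optim import AdamW

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)

model = Tiny()
no_decay = ('bias', 'LayerNorm.weight')  # assumed naming for these modules
grouped = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},   # decay only the dense weights
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},    # no decay on biases and LayerNorm parameters
]
optimizer = AdamW(grouped, lr=2e-5)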
Example #7
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="gpt2",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt', 'gpt2',
                 'gpt2-medium'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--num_candidates", type=int, default=1)
    parser.add_argument("--personality_permutations", type=int, default=1)
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--distributed", action='store_true')
    parser.add_argument("--temperature",
                        type=int,
                        default=1,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--entmax_alpha", type=float, default=1.5)
    parser.add_argument("--entmax_k", type=int, default=512)
    parser.add_argument("--entmax_bisect_iter", type=int, default=50)
    parser.add_argument("--loss", default="cross_entropy", type=str)
    parser.add_argument("--metric", default="jsd", type=str)
    parser.add_argument("--epsilon", default=0.000001, type=float)
    parser.add_argument("--name", default='', type=str)
    parser.add_argument("--temp", type=float, default=0)

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    args.train_batch_size = args.batch_size
    args.valid_batch_size = args.batch_size

    generic_entmax_loss = partial(EntmaxBisectLoss,
                                  alpha=args.entmax_alpha,
                                  n_iter=args.entmax_bisect_iter)

    loss_funcs = {
        "cross_entropy": nn.CrossEntropyLoss,
        "sparsemax": partial(SparsemaxLoss, k=args.entmax_k),
        "entmax15": partial(Entmax15Loss, k=args.entmax_k),
        "entmax": generic_entmax_loss,
        "entmax_alpha": "entmax_alpha"
    }

    assert args.loss in loss_funcs
    loss_func = loss_funcs[args.loss]

    generic_entmax = partial(entmax_bisect,
                             alpha=args.entmax_alpha,
                             n_iter=args.entmax_bisect_iter)

    gen_funcs = {
        "softmax": torch.softmax,
        "sparsemax": partial(sparsemax, k=args.entmax_k),
        "entmax15": partial(entmax15, k=args.entmax_k),
        "entmax": generic_entmax,
        "entmax_alpha": "entmax_alpha"
    }

    if args.loss == "cross_entropy":
        gen_func = gen_funcs["softmax"]
    elif args.loss == "sparsemax":
        gen_func = gen_funcs["sparsemax"]
    elif args.loss == "entmax15":
        gen_func = gen_funcs["entmax15"]
    elif args.loss == "entmax":
        gen_func = gen_funcs["entmax"]
    elif args.loss == "entmax_alpha":
        gen_func = gen_funcs["entmax_alpha"]

    if args.model_checkpoint == "":
        if args.model == 'gpt2' or args.model == 'gpt2-medium':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer, GPT2LMHeadModel
    ) if args.model == 'gpt2' or args.model == 'gpt2-medium' else (
        OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    bos, eos, speaker1, speaker2, pad = tokenizer.convert_tokens_to_ids(
        SPECIAL_TOKENS)

    model.eval()
    if args.metric == 'f1':
        datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
        for dataset_name, dataset in personachat.items():
            num_candidates = 1
            if dataset_name != 'train':
                for dialog in dataset:
                    persona = dialog["personality"].copy()
                    for utterance in dialog["utterances"]:
                        history = utterance["history"]  #[-(2*2+1):]
                        for j, candidate in enumerate(
                                utterance["candidates"][-num_candidates:]):
                            instance = build_input(persona, history, candidate,
                                                   tokenizer)
                            for input_name, input_array in instance.items():
                                datasets[dataset_name][input_name].append(
                                    input_array)

        logger.info("Pad inputs and convert to Tensor")
        tensor_datasets = {"train": [], "valid": []}
        for dataset_name, dataset in datasets.items():
            if dataset_name != 'train':
                inputs = dataset['input_ids']
                replies = dataset['reply']
                token_type_ids = dataset['token_type_ids']

        special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
        predictions = []
        references = []
        preds = []
        refs = []
        histories = []
        for i in range(len(inputs)):
            if i % 100 == 0:
                print(str(i) + ' out of ' + str(len(inputs)))
            inpu = torch.tensor(inputs[i]).unsqueeze(0).cuda()
            token_ids = torch.tensor(token_type_ids[i]).unsqueeze(0).cuda()
            current_output = []
            # Use a separate loop variable: reusing `i` would clobber the outer
            # example index needed below for `replies[i]` / `inputs[i]`.
            for t in range(args.max_length):
                if t > 0:
                    inpu = torch.cat([inpu, prev.unsqueeze(0)], 1)
                    # 50260/50261 are ids of the two speaker tokens added to
                    # the GPT-2 vocabulary; keep extending the active one
                    if token_ids[0][-1] == 50260:
                        token_ids = torch.cat([
                            token_ids,
                            torch.tensor([50260]).cuda().unsqueeze(0)
                        ], 1)
                    else:
                        token_ids = torch.cat([
                            token_ids,
                            torch.tensor([50261]).cuda().unsqueeze(0)
                        ], 1)

                logits = model(inpu, token_type_ids=token_ids)
                if isinstance(logits, tuple):
                    logits = logits[0]
                logits = logits[0, -1, :]
                if args.top_k != 0 or args.top_p != 0:
                    logits = top_filtering(logits,
                                           top_k=args.top_k,
                                           top_p=args.top_p)
                if args.temp != 0:
                    probs = softmax_temperature(logits.unsqueeze(0),
                                                temperature=args.temp,
                                                axis=1).squeeze(0)
                else:
                    probs = gen_func(logits, dim=-1)

                prev = torch.multinomial(probs, 1)

                if prev.item() in special_tokens_ids:
                    break
                current_output.append(prev.item())

            out_text = tokenizer.decode(current_output,
                                        skip_special_tokens=True)
            target = tokenizer.decode(replies[i])
            history = tokenizer.decode(inputs[i])
            predictions.append(out_text)
            references.append(target)
            preds.append(current_output)
            refs.append(replies[i])

        f1_score = eval_utils.f1(preds, refs)

        print('F1_score:', f1_score)

        distinct_1, distinct_2, distinct_3, distinct_4 = eval_utils.distinct(
            predictions)

        print('distinct_1:', distinct_1)
        print('distinct_2:', distinct_2)
        print('distinct_3:', distinct_3)
        print('distinct_4:', distinct_4)

    else:
        _, val_loader, _, valid_sampler = get_data_loaders(args, tokenizer)
        jsd = 0
        sp = 0
        perp = 0.0
        nb_eval_steps = 0
        v = 0
        for batch in val_loader:
            v += 1
            if v % 100 == 0:
                print(str(v) + ' out of ' + str(len(val_loader)))
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            # second batch element is unused here; avoid rebinding lm_labels twice
            input_ids, _, lm_labels, mc_labels, token_type_ids = batch
            lm_logits = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = lm_logits[0]
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

            lm_logits_flat_shifted = list(
                lm_logits_flat_shifted.cpu().detach().numpy())
            lm_labels_flat_shifted = list(lm_labels_flat_shifted.cpu().numpy())
            lm_logits_flat_shifted = [
                lm_logits_flat_shifted[i] if lm_labels_flat_shifted[i] != pad
                and lm_labels_flat_shifted[i] != eos else [0]
                for i in range(len(lm_labels_flat_shifted))
            ]
            lm_logits_flat_shifted = torch.tensor(
                list(filter(lambda a: len(a) != 1, lm_logits_flat_shifted)))
            lm_labels_flat_shifted = torch.tensor(
                list(
                    filter(lambda a: a != pad and a != eos,
                           lm_labels_flat_shifted)))

            if args.top_p > 0 or args.top_k > 0:
                j = 0
                for l in lm_logits_flat_shifted:
                    j += 1
                    if j > 1:
                        shift_logits = torch.cat([
                            shift_logits,
                            top_filtering(l,
                                          top_p=args.top_p,
                                          top_k=args.top_k).unsqueeze(0)
                        ], 0)
                    else:
                        shift_logits = top_filtering(
                            l, top_p=args.top_p, top_k=args.top_k).unsqueeze(0)
            else:
                shift_logits = lm_logits_flat_shifted
            if args.temp != 0:
                probs = softmax_temperature(shift_logits,
                                            temperature=args.temp,
                                            axis=1)
            else:
                probs = gen_func(shift_logits, dim=1)

            jsd_batch = []
            labels = torch.zeros(len(lm_labels_flat_shifted),
                                 shift_logits.size(-1))
            for i in range(len(lm_labels_flat_shifted)):
                labels[i, lm_labels_flat_shifted[i]] = 1
                jsd_ = compute_jsd(probs[i], labels[i])
                jsd_batch.append(jsd_)
            jsd_batch = torch.tensor(jsd_batch).mean()
            jsd += jsd_batch
            sp_batch = []
            for i in range(len(lm_labels_flat_shifted)):
                sp_batch.append(
                    compute_sp(probs.squeeze(0)[i], lm_labels_flat_shifted[i]))

            sp_batch = torch.tensor(sp_batch).mean()
            sp += sp_batch

            if len(probs[0].nonzero()) != len(probs[0]):
                probs = probs[:, :] + args.epsilon
                sums = [probs[i].sum().item() for i in range(probs.size(0))]
                probs = [probs[i] / sums[i] for i in range(len(sums))]
                probs = torch.stack(probs)

            p = [
                probs[i, lm_labels_flat_shifted.squeeze(0)[i].item()]
                for i in range(len(lm_labels_flat_shifted.squeeze(0)))
            ]
            p = torch.stack(p)
            perp += torch.log(p**-1).mean().item()

            nb_eval_steps += 1

        jsd = jsd / nb_eval_steps
        sp = sp / nb_eval_steps
        a = perp / nb_eval_steps
        perplexity = torch.exp(torch.tensor(a))
        print('perplexity:', perplexity)
        print('jsd:', jsd)
        print('sp:', sp)
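
compute_jsd (like compute_sp) is a project helper; a minimal sketch of Jensen-Shannon divergence between a predicted distribution and a one-hot target, matching how compute_jsd is called above (the exact project definition is an assumption):

import torch

def compute_jsd(p, q, eps=1e-12):
    # JSD(p, q) = 0.5 * KL(p || m) + 0.5 * KL(q || m), with m = (p + q) / 2
    m = 0.5 * (p + q)
    kl_pm = torch.sum(p * torch.log((p + eps) / (m + eps)))
    kl_qm = torch.sum(q * torch.log((q + eps) / (m + eps)))
    return 0.5 * (kl_pm + kl_qm)

probs = torch.tensor([0.7, 0.2, 0.1])
onehot = torch.tensor([1.0, 0.0, 0.0])
print(compute_jsd(probs, onehot))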