Example 1
def perplexity_eval(device: torch.device, model: lmp.model.BaseRNNModel,
                    sequence: str,
                    tokenizer: lmp.tokenizer.BaseTokenizer) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation.
        tokenizer:
            Tokenizer for encoding sequence.

    Returns:
        Perplexity of `sequence`.
    """
    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # `sequence[:-2]` is the model input: it includes `[BOS]` but drops the
    # last original token and `[EOS]`. `x.shape = (S)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # `y.shape = (S)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalize by length.
    nll = nll / x.size(0)

    # Take exponential to cancel the logarithm.
    return nll.exp().item()
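A minimal, self-contained sketch (toy tensors, not part of the original project) showing that the position-wise loop above is equivalent to a vectorized computation, assuming `pred_y` holds probabilities rather than logits:

import torch

# Fake prediction distribution with shape `(S, V)` and fake targets of length S.
pred_y = torch.softmax(torch.randn(5, 10), dim=-1)
y = [3, 1, 4, 1, 5]

# Loop version, as in `perplexity_eval` above.
nll = torch.zeros(1)
for pos, token_id in enumerate(y):
    nll = nll - torch.log(pred_y[pos, token_id])
ppl_loop = (nll / len(y)).exp().item()

# Vectorized version.
ppl_vec = (-pred_y[torch.arange(len(y)), torch.tensor(y)].log()).mean().exp().item()

assert abs(ppl_loop - ppl_vec) < 1e-4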
Example 2
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Load pre-trained model instance.
    model = lmp.util.model.load(
        ckpt=args.ckpt,
        tknzr=tknzr,
        **model_cfg.__dict__,
    )

    # Get inference method.
    infer = lmp.util.infer.create(
        max_seq_len=model_cfg.max_seq_len,
        **args.__dict__,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Generate text with specified inference method.
    txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

    # Output generated text.
    print(txt)
Example 3
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Get dataset instance with specified version.
    dset = lmp.util.dset.load(dset_name=args.dset_name, ver=args.ver)

    # Mini-batch sampler. No shuffling is needed during evaluation.
    dldr = torch.utils.data.DataLoader(
        dataset=dset,
        batch_size=args.batch_size,
        shuffle=False,
    )

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Get tensorboard logger instance.
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

    # Iterate over pre-trained checkpoints ranging from `args.first_ckpt` to
    # `args.last_ckpt`.
    for ckpt in lmp.util.model.list_ckpts(
            exp_name=args.exp_name,
            first_ckpt=args.first_ckpt,
            last_ckpt=args.last_ckpt,
    ):
        # Load pre-trained model instance from checkpoint `ckpt`.
        model = lmp.util.model.load(
            ckpt=ckpt,
            tknzr=tknzr,
            **model_cfg.__dict__,
        )

        # Set model to evaluation mode.
        # This turns off dropout layers in the model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Record average perplexity.
        avg_ppl = 0.0
        for batch_txt in tqdm(dldr):

            # Encode batch text into batch of token ids.
            batch_tkids = tknzr.batch_enc(
                batch_txt=batch_txt,
                max_seq_len=model_cfg.max_seq_len,
            )

            # Convert batch of token ids to `torch.Tensor` with
            # `dtype == torch.int64`.
            batch_tkids = torch.LongTensor(batch_tkids)

            # Move tensors to model running device.
            batch_tkids = batch_tkids.to(device)

            # Format batch token ids to satisfy language model training format.
            batch_prev_tkids = batch_tkids[..., :-1]
            batch_next_tkids = batch_tkids[..., 1:]

            # Calculate perplexity.
            batch_avg_ppl = model.ppl(
                batch_next_tkids=batch_next_tkids,
                batch_prev_tkids=batch_prev_tkids,
            )

            # Accumulate average perplexity.
            avg_ppl += batch_avg_ppl * len(batch_txt) / len(dset)

        # Log average perplexity on dataset to CLI and tensorboard.
        writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
        print(f'checkpoint {ckpt} ppl: {avg_ppl}')
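A standalone sketch (toy numbers, an illustration rather than project code) of the accumulation `avg_ppl += batch_avg_ppl * len(batch_txt) / len(dset)` used above: when each `batch_avg_ppl` is a per-sample mean, weighting it by the batch's share of the dataset recovers the dataset-level mean even for unevenly sized batches.

# Toy per-sample perplexities split into two batches of different sizes.
per_sample_ppl = [10.0, 20.0, 30.0, 40.0, 50.0]
dset_size = len(per_sample_ppl)
batches = [per_sample_ppl[:2], per_sample_ppl[2:]]

avg_ppl = 0.0
for batch in batches:
    batch_avg_ppl = sum(batch) / len(batch)
    avg_ppl += batch_avg_ppl * len(batch) / dset_size

# Matches the plain dataset mean (30.0).
assert abs(avg_ppl - sum(per_sample_ppl) / dset_size) < 1e-9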
def perplexity_eval(
        device: torch.device,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        sequence: str,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation. Must not be empty.
        tokenizer:
            Tokenizer for encoding sequence.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        Perplexity of `sequence`.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(sequence, str):
        raise TypeError('`sequence` must be an instance of `str`.')

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if not sequence:
        raise ValueError('`sequence` must not be empty.')

    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # `sequence[:-2]` is the model input: it includes `[bos]` but drops the
    # last original token and `[eos]`. `x.shape = (S)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # `y.shape = (S)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalize by length.
    nll = nll / x.size(0)

    # Take exponential to cancel the logarithm.
    return nll.exp().item()
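A self-contained sketch (random tensors, not project code) relating the quantity computed above to PyTorch's built-in cross-entropy: the perplexity is simply `exp` of the average cross-entropy, which `torch.nn.functional.cross_entropy` computes directly from logits.

import torch
import torch.nn.functional as F

# Fake model outputs with shape `(S, V)` and fake target token ids with shape `(S)`.
logits = torch.randn(7, 13)
targets = torch.randint(0, 13, (7,))

# Manual perplexity from probabilities, mirroring the loop in `perplexity_eval`.
probs = torch.softmax(logits, dim=-1)
ppl_manual = (-probs[torch.arange(7), targets].log()).mean().exp()

# Perplexity via cross-entropy on logits.
ppl_ce = F.cross_entropy(logits, targets).exp()

assert torch.allclose(ppl_manual, ppl_ce, atol=1e-4)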
def main(argv: List[str]) -> None:
    """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.batch_size` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])
    # `args.first_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.first_ckpt], val_names=['-1', 'args.first_ckpt'])
    # `args.last_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.last_ckpt], val_names=['-1', 'args.last_ckpt'])
    # `args.n_worker` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
        val_names=['0', 'args.n_worker', 'number of available CPUs'],
    )
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[args.n_worker, args.batch_size],
        val_names=['args.n_worker', 'args.batch_size'],
    )

    # We use a TCP store to synchronize processes.  Timeout is set to 5 minutes.
    store = dist.TCPStore(
        is_master=args.rank == HOST_RANK,
        host_name=args.host_name,
        port=args.host_port,
        timeout=timedelta(minutes=5),
        world_size=args.world_size,
    )

    # Use NCCL backend to perform CUDA collectives.
    dist.init_process_group(
        backend=dist.Backend.NCCL,
        store=store,
        rank=args.rank,
        timeout=timedelta(minutes=5),
        world_size=args.world_size,
    )

    # Sync arguments.
    dist_args_k = [
        'host_name', 'host_port', 'local_rank', 'rank', 'world_size'
    ]
    for k in args.__dict__.keys():
        if k in dist_args_k:
            continue

        # Host broadcasts arguments.
        if args.rank == HOST_RANK:
            store.set(k, str(args.__dict__[k]))
        # Non-host processes receive arguments from the host.
        else:
            v = store.get(k)
            if isinstance(args.__dict__[k], str):
                args.__dict__[k] = v.decode('utf-8')
            else:
                args.__dict__[k] = type(args.__dict__[k])(v)

    # Set random seed for reproducibility.  Note that each process uses a different seed.
    lmp.util.rand.set_seed(seed=args.seed + args.rank)

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.local_rank}')

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Get dataset instance and convert samples to tensor.
    if args.is_dset_in_memory:
        dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )
    else:
        dset = lmp.util.dset.SlowTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )

    dset_size = len(dset)

    # Mini-batch sampler.  Each process will get batches exclusive to itself.
    dist_sampler = torch.utils.data.distributed.DistributedSampler(
        num_replicas=args.world_size,
        rank=args.rank,
        dataset=dset,
        shuffle=False,
    )

    # Mini-batch distributed data loader.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
    # `pin_memory = True` to speed things up (it only saves a few seconds).
    data_loader = torch.utils.data.DataLoader(
        batch_size=args.batch_size // args.world_size,
        dataset=dset,
        num_workers=args.n_worker,
        persistent_workers=bool(args.n_worker != 0),
        pin_memory=True,
        sampler=dist_sampler,
    )

    # Get tensorboard logger instance.  Only the main process needs to log performance.
    if args.rank == HOST_RANK:
        writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)
    else:
        writer = None

    # Evaluate checkpoints within ranges.
    for ckpt in lmp.util.model.list_ckpts(exp_name=args.exp_name,
                                          first_ckpt=args.first_ckpt,
                                          last_ckpt=args.last_ckpt):
        # Load pre-trained model instance.
        model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

        # Set model to evaluation mode.  This turns off dropout layers in the model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Create DDP model.
        ddp_model = torch.nn.parallel.DistributedDataParallel(model)

        # Processes may receive an uneven number of batches, so we wrap the loop in `ddp_model.join()` to avoid
        # deadlock.
        with ddp_model.join():
            # Record average perplexity.
            avg_ppl = 0.0
            for batch_tkids in tqdm(data_loader):
                # Token ids come from the dataset as tensors; move them to the same running device as the model.
                batch_tkids = batch_tkids.to(device)

                # Format batch token ids to satisfy language model training format.
                batch_cur_tkids = batch_tkids[..., :-1]
                batch_next_tkids = batch_tkids[..., 1:]

                # Loop over token ids to get next token id prediction probability distribution.
                batch_prev_states = None
                batch_tkids_pd = []
                for i in range(batch_cur_tkids.size(1)):
                    batch_next_tkids_pd, batch_prev_states = model.pred(
                        batch_cur_tkids=batch_cur_tkids[:, i],
                        batch_prev_states=batch_prev_states,
                    )

                    # Collect prediction probability distribution.
                    batch_tkids_pd.append(batch_next_tkids_pd)

                # Calculate perplexity.
                batch_ppl = lmp.util.metric.ppl(batch_tkids=batch_next_tkids,
                                                batch_tkids_pd=torch.stack(
                                                    batch_tkids_pd, dim=1))

                # Sum `batch_ppl` from each process.
                dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)

                # Accumulate average perplexity.
                avg_ppl += (batch_ppl / dset_size).sum().item()

        # Log average perplexity on dataset to CLI and tensorboard.  Only the main process needs to log performance.
        if args.rank == HOST_RANK:
            writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl,
                              ckpt)
        print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

    # Free memory.  This is only needed for unit tests.
    del args
    del avg_ppl
    del batch_cur_tkids
    del batch_next_tkids
    del batch_next_tkids_pd
    del batch_ppl
    del batch_prev_states
    del batch_tkids
    del batch_tkids_pd
    del ckpt
    del data_loader
    del device
    del dset
    del dset_size
    del model
    del model_cfg
    del tknzr
    del writer
    torch.cuda.empty_cache()
    gc.collect()
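A plain-tensor sketch (toy numbers, no process group required) of the arithmetic that `dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)` achieves above: each rank holds per-sample perplexities for its own shard, the sum reduction combines the shards, and dividing by the full dataset size yields the global average.

import torch

# Pretend two ranks each computed per-sample perplexities for their own shard.
dset_size = 6
rank0_ppl = torch.tensor([10.0, 20.0, 30.0])
rank1_ppl = torch.tensor([40.0, 50.0, 60.0])

# After the SUM all-reduce every rank holds the element-wise sum; summing its
# entries and dividing by the dataset size gives the global mean perplexity.
reduced = rank0_ppl + rank1_ppl
avg_ppl = (reduced / dset_size).sum().item()

assert abs(avg_ppl - 35.0) < 1e-4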
def main(argv: List[str]) -> None:
    """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.ckpt],
                                             val_names=['-1', 'args.ckpt'])
    # `args.txt` validation.
    lmp.util.validate.raise_if_empty_str(val=args.txt, val_name='args.txt')

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=args.ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode.  This turns off dropout layers in the model.
    model = model.eval()

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Move model to running device.
    model = model.to(device)

    # Get inference method.
    infer = lmp.util.infer.create(**args.__dict__)

    # Generate text with specified inference method.
    txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

    # Output generated text.
    print(txt)

    # Free memory.  This is only needed for unit tests.
    del args
    del device
    del infer
    del model
    del model_cfg
    del tknzr
    del txt
    torch.cuda.empty_cache()
    gc.collect()
Example 7
def analogy_inference(device: torch.device,
                      model: Union[lmp.model.BaseRNNModel,
                                   lmp.model.BaseResRNNModel],
                      tokenizer: lmp.tokenizer.BaseTokenizer, word_a: str,
                      word_b: str, word_c: str) -> str:
    r"""Generate analog word based on `word_a`, `word_b` and `word_c`.

    This function performs word analogy based on the following rule:
        `word_a` : `word_b` = `word_c` : `word_d`
    where `word_d` is the prediction target.

    Args:
        device:
            Model running device.
        model:
            Language model.
        tokenizer:
            Converts tokens (including `word_a`, `word_b` and `word_c`) into
            token ids and converts the predicted token id back into a token
            (`word_d`). This is needed since we use a word embedding layer in
            our language model.
        word_a:
        word_b:
        word_c:
            Query words for word analogy.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        Predicted word following the word analogy.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model,
                      (lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel)):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.')

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    if not isinstance(word_a, str):
        raise TypeError('`word_a` must be an instance of `str`.')

    if not isinstance(word_b, str):
        raise TypeError('`word_b` must be an instance of `str`.')

    if not isinstance(word_c, str):
        raise TypeError('`word_c` must be an instance of `str`.')

    # Evaluation mode.
    model.eval()
    model = model.to(device)

    # Convert tokens (query words) into token ids.
    word_a_id = torch.LongTensor([tokenizer.convert_token_to_id(word_a)])
    word_b_id = torch.LongTensor([tokenizer.convert_token_to_id(word_b)])
    word_c_id = torch.LongTensor([tokenizer.convert_token_to_id(word_c)])

    # Perform analogy calculation.
    # Shape: `(1, E)`.
    out = (model.emb_layer(word_b_id.to(device)) -
           model.emb_layer(word_a_id.to(device)) +
           model.emb_layer(word_c_id.to(device)))

    # Calculate cosine similarity.
    # Shape: `(V)`.
    pred = torch.nn.functional.cosine_similarity(
        out,
        model.emb_layer.weight,
    )

    # Get the token id with maximum cosine similarity.
    # Shape: `(1)`.
    word_d_id = pred.argmax(dim=0).to('cpu').item()

    # Convert back to token.
    return tokenizer.convert_id_to_token(word_d_id)
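A toy sketch (randomly initialized embeddings, not a trained model) of the arithmetic performed above: the query vector `emb(word_b) - emb(word_a) + emb(word_c)` is compared against every row of the embedding table with cosine similarity, and the argmax is taken as the analogous word's id.

import torch

# Fake embedding layer with a vocabulary of 100 and embedding dimension 16.
emb_layer = torch.nn.Embedding(100, 16)
word_a_id = torch.LongTensor([3])
word_b_id = torch.LongTensor([7])
word_c_id = torch.LongTensor([42])

# Query vector with shape `(1, E)`.
out = emb_layer(word_b_id) - emb_layer(word_a_id) + emb_layer(word_c_id)

# Cosine similarity against the whole embedding table, shape `(V)`.
pred = torch.nn.functional.cosine_similarity(out, emb_layer.weight)

# Token id with the maximum cosine similarity.
word_d_id = pred.argmax(dim=0).item()
print(word_d_id)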
def generate_sequence(beam_width: int, begin_of_sequence: str,
                      device: torch.device, max_seq_len: int,
                      model: lmp.model.BaseRNNModel,
                      tokenizer: lmp.tokenizer.BaseTokenizer) -> List[str]:
    r"""Sequences generation using beam search.

    Args:
        beam_width:
            Number of candidate sequences to output.
        begin_of_sequence:
            Beginning of the sequence which the model will auto-complete.
        device:
            Model running device.
        max_seq_len:
            Maximum output sequence length.
        model:
            Language model.
        tokenizer:
            Tokenizer for encoding and decoding sequences.

    Returns:
        Generated sequences.
    """
    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Remove [EOS] since we are
    # auto-completing from the beginning of the sequence.
    cur_seq = tokenizer.encode(begin_of_sequence, max_seq_len=-1)
    cur_seq = torch.LongTensor(cur_seq)[:-1].to(device)

    # Get begin sequence length.
    seq_len = cur_seq.size(-1)

    # Generated sequence.
    # Start shape (1, S).
    # Final shape (B, S).
    cur_seq = cur_seq.reshape(1, seq_len)

    # Accumulated negative log-likelihood. Using logs turns consecutive
    # probability multiplications into a sum of log probabilities, which
    # avoids numerical underflow. Initialized to zeros with shape (B).
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len - seq_len):
        # Model prediction has shape (B, S, V).
        pred_y = model.predict(cur_seq)

        # Record predictions from all beams.
        # Each beam predicts `beam_width` different results,
        # so there are `beam_width * beam_width` candidates in total.
        top_k_in_all_beams = []
        for out_beam in range(cur_seq.size(0)):
            # Get `beam_width` different predictions from beam `out_beam`.
            # `top_k_prob_in_beam` has shape (B) and
            # `top_k_index_in_beam` has shape (B).
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(
                    k=beam_width,
                    dim=-1
                )

            # Record each beam's negative log-likelihood and concatenate the
            # next token id based on the prediction.
            for in_beam in range(beam_width):
                # Accumulate negative log-likelihood. Since log outputs a
                # negative value when its input is in range 0~1, we negate it
                # to make it positive.
                prob = accum_prob[out_beam] - \
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                # Concatenate the next predicted token id.
                seq = torch.cat([
                    cur_seq[out_beam],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                # Record result.
                top_k_in_all_beams.append({'prob': prob, 'seq': seq})

        # Compare each recorded result across all beams. First concatenate the
        # score tensors, then use `topk` with `largest=False` to keep the
        # `beam_width` candidates with the lowest accumulated negative
        # log-likelihood (i.e. the most probable ones).
        _, top_k_index_in_all_beams = torch.cat([
            beam['prob'] for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0, largest=False)

        # Update `cur_seq` with the `beam_width` best candidate sequences.
        cur_seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)

        # Update accumulated negative log-likelihood.
        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

    return tokenizer.batch_decode(cur_seq.tolist())
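A tiny sketch (toy scores, not project code) of the candidate selection step above: the scores are accumulated negative log-likelihoods, so the most probable candidates are the ones with the smallest values, which is why `topk` is called with `largest=False`.

import torch

# Fake accumulated negative log-likelihoods for four candidate sequences.
cand_nll = torch.tensor([2.3, 0.7, 5.1, 1.2])

# Keep the two most probable candidates, i.e. the two smallest NLLs.
_, best = cand_nll.topk(k=2, dim=0, largest=False)

assert best.tolist() == [1, 3]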
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Load pre-trained model instance.
    model = lmp.util.model.load(
        ckpt=args.ckpt,
        tknzr=tknzr,
        **model_cfg.__dict__,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Encode text into token ids.
    # Wrap as batch with only one sample since `model.ppl` only accept batch.
    batch_tkids = tknzr.batch_enc(
        batch_txt=[args.txt],
        max_seq_len=model_cfg.max_seq_len,
    )

    # Convert token ids to `torch.Tensor` with `dtype == torch.int64`.
    batch_tkids = torch.LongTensor(batch_tkids)

    # Move tensors to model running device.
    batch_tkids = batch_tkids.to(device)

    # Format batch token ids to satisfy language model training format.
    batch_prev_tkids = batch_tkids[..., :-1]
    batch_next_tkids = batch_tkids[..., 1:]

    # Calculate perplexity.
    ppl = model.ppl(
        batch_next_tkids=batch_next_tkids,
        batch_prev_tkids=batch_prev_tkids,
    )

    # Output perplexity on given sample.
    print(ppl)
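A toy sketch (fake token ids, not project code) of the `[..., :-1]` / `[..., 1:]` shift used above to build language model input and target pairs: at every position the model reads the current token and must predict the next one.

import torch

# Fake batch with a single sample, e.g. `[bos], w1, w2, w3, [eos]`.
batch_tkids = torch.tensor([[0, 5, 9, 4, 1]])

batch_prev_tkids = batch_tkids[..., :-1]  # model inputs
batch_next_tkids = batch_tkids[..., 1:]   # prediction targets

assert batch_prev_tkids.tolist() == [[0, 5, 9, 4]]
assert batch_next_tkids.tolist() == [[5, 9, 4, 1]]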
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])
  # `args.first_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.first_ckpt], val_names=['-1', 'args.first_ckpt'])
  # `args.last_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.last_ckpt], val_names=['-1', 'args.last_ckpt'])
  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

  # Get dataset instance and convert samples to tensor.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )

  dset_size = len(dset)

  # Mini-batch data loader.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed things up (it only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size,
    dataset=dset,
    shuffle=False,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
  )

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Evaluate checkpoints within ranges.
  for ckpt in lmp.util.model.list_ckpts(exp_name=args.exp_name, first_ckpt=args.first_ckpt, last_ckpt=args.last_ckpt):
    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode.  This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Record average perplexity.
    avg_ppl = 0.0
    for batch_tkids in tqdm(data_loader):
      # Token ids come from the dataset as tensors; move them to the same running device as the model.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_cur_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Loop over token ids to get next token id prediction probability distribution.
      batch_prev_states = None
      batch_tkids_pd = []
      for i in range(batch_cur_tkids.size(1)):
        batch_next_tkids_pd, batch_prev_states = model.pred(
          batch_cur_tkids=batch_cur_tkids[:, i],
          batch_prev_states=batch_prev_states,
        )

        # Collect prediction probability distribution.
        batch_tkids_pd.append(batch_next_tkids_pd)

      # Calculate perplexity.
      batch_ppl = lmp.util.metric.ppl(batch_tkids=batch_next_tkids, batch_tkids_pd=torch.stack(batch_tkids_pd, dim=1))

      # Accumulate average perplexity.
      avg_ppl += (batch_ppl / dset_size).sum().item()

    # Log average perplexity on dataset to CLI and tensorboard.
    writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
    print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_ppl
  del batch_cur_tkids
  del batch_next_tkids
  del batch_next_tkids_pd
  del batch_ppl
  del batch_prev_states
  del batch_tkids
  del batch_tkids_pd
  del ckpt
  del data_loader
  del device
  del dset
  del dset_size
  del model
  del model_cfg
  del tknzr
  del writer
  torch.cuda.empty_cache()
  gc.collect()
Example 11
def generate_sequence(
        beam_width: int,
        begin_of_sequence: str,
        device: torch.device,
        max_seq_len: int,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> List[str]:
    r"""Sequences generation using beam search.

    Args:
        beam_width:
            Number of candidate sequences to output. Must be bigger than or
            equal to `1`.
        begin_of_sequence:
            Beginning of the sequence which the model will auto-complete.
        device:
            Model running device.
        max_seq_len:
            Maximum output sequence length. Must be bigger than or equal to
            `2`.
        model:
            Language model.
        tokenizer:
            Tokenizer for encoding and decoding sequences.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.
        ValueError:
            When one of the arguments does not follow its constraints. See
            docstring for argument constraints.

    Returns:
        Generated sequences.
    """
    # Type check.
    if not isinstance(beam_width, int):
        raise TypeError('`beam_width` must be an instance of `int`.')

    if not isinstance(begin_of_sequence, str):
        raise TypeError('`begin_of_sequence` must be an instance of `str`.')

    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(max_seq_len, int):
        raise TypeError('`max_seq_len` must be an instance of `int`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of '
            '`lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if beam_width < 1:
        raise ValueError('`beam_width` must be bigger than or equal to `1`.')

    if max_seq_len < 2:
        raise ValueError('`max_seq_len` must be bigger than or equal to `2`.')

    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Remove `[eos]` since we are
    # auto-completing from the beginning of the sequence.
    cur_seq = tokenizer.encode(begin_of_sequence, max_seq_len=-1)
    cur_seq = torch.LongTensor(cur_seq)[:-1].to(device)

    # Get begin sequence length.
    seq_len = cur_seq.size(-1)

    # Generated sequence.
    # Start shape (1, S).
    # Final shape (B, S).
    cur_seq = cur_seq.reshape(1, seq_len)

    # Accumulated negative log-likelihood. Using logs turns consecutive
    # probability multiplications into a sum of log probabilities, which
    # avoids numerical underflow. Initialized to zeros with shape (B).
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len - seq_len):
        # Model prediction has shape (B, S, V).
        pred_y = model.predict(cur_seq)

        # Record predictions from all beams.
        # Each beam predicts `beam_width` different results,
        # so there are `beam_width * beam_width` candidates in total.
        top_k_in_all_beams = []
        for out_beam in range(cur_seq.size(0)):
            # Get `beam_width` different predictions from beam `out_beam`.
            # `top_k_prob_in_beam` has shape (B) and
            # `top_k_index_in_beam` has shape (B).
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(
                    k=beam_width,
                    dim=-1
                )

            # Record each beam's negative log-likelihood and concatenate the
            # next token id based on the prediction.
            for in_beam in range(beam_width):
                # Accumulate negative log-likelihood. Since log outputs a
                # negative value when its input is in range 0~1, we negate it
                # to make it positive.
                prob = accum_prob[out_beam] - \
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                # Concatenate the next predicted token id.
                seq = torch.cat([
                    cur_seq[out_beam],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                # Record result.
                top_k_in_all_beams.append({
                    'prob': prob,
                    'seq': seq
                })

        # Compare each recorded result across all beams. First concatenate the
        # score tensors, then use `topk` with `largest=False` to keep the
        # `beam_width` candidates with the lowest accumulated negative
        # log-likelihood (i.e. the most probable ones).
        _, top_k_index_in_all_beams = torch.cat([
            beam['prob']
            for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0, largest=False)

        # Update `cur_seq` with the `beam_width` best candidate sequences.
        cur_seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)

        # Update accumulated negative log-likelihood.
        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

    return tokenizer.batch_decode(cur_seq.tolist())