Example #1
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Load pre-trained model instance.
    model = lmp.util.model.load(
        ckpt=args.ckpt,
        tknzr=tknzr,
        **model_cfg.__dict__,
    )

    # Get inference method.
    infer = lmp.util.infer.create(
        max_seq_len=model_cfg.max_seq_len,
        **args.__dict__,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Generate text with specified inference method.
    txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

    # Output generated text.
    print(txt)
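
The script above leans on a `parse_arg()` helper that is not shown. Below is a minimal argparse-based sketch of what it might look like; the real CLI exposes more flags (at least the inference method and its hyper-parameters), and every flag name here is an assumption based only on the attributes `main()` reads.

import argparse


def parse_arg() -> argparse.Namespace:
    r"""Parse command-line arguments (hypothetical flags)."""
    parser = argparse.ArgumentParser(
        description='Generate text with a pre-trained language model.',
    )
    parser.add_argument('--ckpt', type=int, required=True,
                        help='Pre-trained model checkpoint number.')
    parser.add_argument('--exp_name', type=str, required=True,
                        help='Pre-trained experiment name.')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed for reproducibility.')
    parser.add_argument('--txt', type=str, required=True,
                        help='Text prefix used to condition generation.')
    return parser.parse_args()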
Example #2
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Get dataset instance with specified version.
    dset = lmp.util.dset.load(dset_name=args.dset_name, ver=args.ver)

    # Mini-batch sequential sampler.
    dldr = torch.utils.data.DataLoader(
        dataset=dset,
        batch_size=args.batch_size,
        shuffle=False,
    )

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Get tensorboard logger instance.
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

    # Load pre-trained checkpoints ranging from `args.first_ckpt` to
    # `args.last_ckpt`.
    for ckpt in lmp.util.model.list_ckpts(
            exp_name=args.exp_name,
            first_ckpt=args.first_ckpt,
            last_ckpt=args.last_ckpt,
    ):
        # Load pre-trained model instance from checkpoint `ckpt`.
        model = lmp.util.model.load(
            ckpt=ckpt,
            tknzr=tknzr,
            **model_cfg.__dict__,
        )

        # Set model to evaluation mode.
        # This turns off dropout layers in the model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Record average perplexity.
        avg_ppl = 0.0
        for batch_txt in tqdm(dldr):

            # Encode batch text into batch of token ids.
            batch_tkids = tknzr.batch_enc(
                batch_txt=batch_txt,
                max_seq_len=model_cfg.max_seq_len,
            )

            # Convert batch of token ids to `torch.Tensor` with
            # `dtype == torch.int64`.
            batch_tkids = torch.LongTensor(batch_tkids)

            # Move tensors to model running device.
            batch_tkids = batch_tkids.to(device)

            # Format batch token ids to satisfy language model training format.
            batch_prev_tkids = batch_tkids[..., :-1]
            batch_next_tkids = batch_tkids[..., 1:]

            # Calculate perplexity.
            batch_avg_ppl = model.ppl(
                batch_next_tkids=batch_next_tkids,
                batch_prev_tkids=batch_prev_tkids,
            )

            # Accumulate average perplexity.
            avg_ppl += batch_avg_ppl * len(batch_txt) / len(dset)

        # Log average perplexity on dataset to CLI and tensorboard.
        writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
        print(f'checkpoint {ckpt} ppl: {avg_ppl}')
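
Note how the loop above weights each `batch_avg_ppl` by `len(batch_txt) / len(dset)`, so a smaller final batch does not skew the dataset average. A tiny self-contained check of that arithmetic with made-up numbers:

# Toy check: weighting per-batch averages by batch size recovers the overall
# average even when the last batch is smaller.  All numbers are made up.
batch_ppls = [10.0, 12.0, 20.0]  # average perplexity of each batch
batch_sizes = [4, 4, 2]          # the last batch is ragged
dset_size = sum(batch_sizes)

avg_ppl = 0.0
for batch_avg_ppl, batch_size in zip(batch_ppls, batch_sizes):
    avg_ppl += batch_avg_ppl * batch_size / dset_size

# Equivalent to averaging over all individual samples.
per_sample = [10.0] * 4 + [12.0] * 4 + [20.0] * 2
assert abs(avg_ppl - sum(per_sample) / dset_size) < 1e-9
print(avg_ppl)  # 12.8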
def main(argv: List[str]) -> None:
    """Script entry point.

    Parameters
    ----------
    argv: list[str]
      List of CLI arguments.

    Returns
    -------
    None
    """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.batch_size` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])
    # `args.first_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.first_ckpt], val_names=['-1', 'args.first_ckpt'])
    # `args.last_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.last_ckpt], val_names=['-1', 'args.last_ckpt'])
    # `args.n_worker` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
        val_names=['0', 'args.n_worker', 'number of available CPUs'],
    )
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[args.n_worker, args.batch_size],
        val_names=['args.n_worker', 'args.batch_size'],
    )

    # We use a TCP-based key-value store to coordinate processes and sync arguments.  Timeout is set to 5 minutes.
    store = dist.TCPStore(
        is_master=args.rank == HOST_RANK,
        host_name=args.host_name,
        port=args.host_port,
        timeout=timedelta(minutes=5),
        world_size=args.world_size,
    )

    # Use NCCL backend to perform CUDA collectives.
    dist.init_process_group(
        backend=dist.Backend.NCCL,
        store=store,
        rank=args.rank,
        timeout=timedelta(minutes=5),
        world_size=args.world_size,
    )

    # Sync arguments.
    dist_args_k = [
        'host_name', 'host_port', 'local_rank', 'rank', 'world_size'
    ]
    for k in args.__dict__.keys():
        if k in dist_args_k:
            continue

        # Host broadcasts arguments.
        if args.rank == HOST_RANK:
            store.set(k, str(args.__dict__[k]))
        # Non-host processes receive host arguments.
        else:
            v = store.get(k)
            if isinstance(args.__dict__[k], str):
                args.__dict__[k] = v.decode('utf-8')
            else:
                args.__dict__[k] = type(args.__dict__[k])(v)

    # Set random seed for reproducibility.  Note that each process uses a different seed to get a different slice of the batch.
    lmp.util.rand.set_seed(seed=args.seed + args.rank)

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.local_rank}')

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Get dataset instance and convert samples to tensor.
    if args.is_dset_in_memory:
        dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )
    else:
        dset = lmp.util.dset.SlowTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )

    dset_size = len(dset)

    # Mini-batch sampler.  Each process will get batches exclusive to itself.
    dist_sampler = torch.utils.data.distributed.DistributedSampler(
        num_replicas=args.world_size,
        rank=args.rank,
        dataset=dset,
        shuffle=False,
    )

    # Mini-batch distributed data loader.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
    # `pin_memory = True` to speed up host-to-device transfer (which only saves a few seconds).
    data_loader = torch.utils.data.DataLoader(
        batch_size=args.batch_size // args.world_size,
        dataset=dset,
        num_workers=args.n_worker,
        persistent_workers=bool(args.n_worker != 0),
        pin_memory=True,
        sampler=dist_sampler,
    )

    # Get tensorboard logger instance.  Only the main process needs to log performance.
    if args.rank == HOST_RANK:
        writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)
    else:
        writer = None

    # Evaluate checkpoints within ranges.
    for ckpt in lmp.util.model.list_ckpts(exp_name=args.exp_name,
                                          first_ckpt=args.first_ckpt,
                                          last_ckpt=args.last_ckpt):
        # Load pre-trained model instance.
        model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

        # Set model to evaluation mode.  This turns off dropout layers in the model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Create DDP model.
        ddp_model = torch.nn.parallel.DistributedDataParallel(model)

        # Processes can receive an uneven number of batches.  Thus one must use `ddp_model.join()` to avoid deadlock.
        with ddp_model.join():
            # Record average perplexity.
            avg_ppl = 0.0
            for batch_tkids in tqdm(data_loader):
                # The dataset already yields token id tensors; move them to the same running device as the model.
                batch_tkids = batch_tkids.to(device)

                # Format batch token ids to satisfy language model training format.
                batch_cur_tkids = batch_tkids[..., :-1]
                batch_next_tkids = batch_tkids[..., 1:]

                # Loop over token ids to get next token id prediction probability distribution.
                batch_prev_states = None
                batch_tkids_pd = []
                for i in range(batch_cur_tkids.size(1)):
                    batch_next_tkids_pd, batch_prev_states = model.pred(
                        batch_cur_tkids=batch_cur_tkids[:, i],
                        batch_prev_states=batch_prev_states,
                    )

                    # Collect prediction probability distribution.
                    batch_tkids_pd.append(batch_next_tkids_pd)

                # Calculate perplexity.
                batch_ppl = lmp.util.metric.ppl(batch_tkids=batch_next_tkids,
                                                batch_tkids_pd=torch.stack(
                                                    batch_tkids_pd, dim=1))

                # Sum `batch_ppl` from each process.
                dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)

                # Accumulate average perplexity.
                avg_ppl += (batch_ppl / dset_size).sum().item()

        # Log average perplexity on dataset to CLI and tensorboard.  Only the main process needs to log performance.
        if args.rank == HOST_RANK:
            writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl,
                              ckpt)
        print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

    # Free memory.  This is only needed for unit tests.
    del args
    del avg_ppl
    del batch_cur_tkids
    del batch_next_tkids
    del batch_next_tkids_pd
    del batch_ppl
    del batch_prev_states
    del batch_tkids
    del batch_tkids_pd
    del ckpt
    del data_loader
    del device
    del dset
    del dset_size
    del model
    del model_cfg
    del tknzr
    del writer
    torch.cuda.empty_cache()
    gc.collect()
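
The `DistributedSampler` above hands each rank a disjoint slice of the dataset, which is why the per-process `batch_ppl` values must be summed with `all_reduce`. A standalone sketch of that partitioning on a toy dataset (no process group is needed when `num_replicas` and `rank` are passed explicitly):

import torch

# Toy dataset of 10 samples; only `len()` is required by the sampler.
toy_dset = list(range(10))

for rank in range(2):
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset=toy_dset,
        num_replicas=2,
        rank=rank,
        shuffle=False,
    )
    # Rank 0 gets indices [0, 2, 4, 6, 8], rank 1 gets [1, 3, 5, 7, 9].
    print(rank, list(sampler))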
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Save training configuration.
    lmp.util.cfg.save(args=args, exp_name=args.exp_name)

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Get dataset instance with specified version.
    dset = lmp.util.dset.load(dset_name=args.dset_name, ver=args.ver)

    # Mini-batch random sampler.
    dldr = torch.utils.data.DataLoader(
        dataset=dset,
        batch_size=args.batch_size,
        shuffle=True,
    )

    # Load pre-trained tokenizer.
    tknzr_cfg = lmp.util.cfg.load(exp_name=args.tknzr_exp_name)
    tknzr = lmp.util.tknzr.load(
        exp_name=args.tknzr_exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Get new model instance.
    model = lmp.util.model.create(tknzr=tknzr, **args.__dict__)
    model = model.train()

    # Move model to running device.
    model = model.to(device)

    # Remove weight decay on bias and layer-norm.
    no_decay = ['bias', 'LayerNorm.weight']
    optim_group_params = [
        {
            'params': [
                param for name, param in model.named_parameters()
                if not any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            args.wd,
        },
        {
            'params': [
                param for name, param in model.named_parameters()
                if any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            0.0,
        },
    ]

    # Get new optimizer instance.
    optim = torch.optim.AdamW(
        optim_group_params,
        betas=(args.beta1, args.beta2),
        lr=args.lr,
        eps=args.eps,
    )

    # Get tensorboard logger instance.
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

    # Loss statistics for logging.
    pre_avg_loss = 0.0
    avg_loss = 0.0

    # Global optimization step.
    step = 0

    for epoch in range(args.n_epoch):
        tqdm_dldr = tqdm(
            dldr,
            desc=f'epoch: {epoch}, loss: {pre_avg_loss:.6f}',
        )
        for batch_txt in tqdm_dldr:
            # Encode batch text into batch token ids.
            batch_tkids = tknzr.batch_enc(
                batch_txt=batch_txt,
                max_seq_len=args.max_seq_len,
            )

            # Convert batch token ids to `torch.Tensor` with
            # `dtype == torch.int64`.
            batch_tkids = torch.LongTensor(batch_tkids)

            # Move tensors to model running device.
            batch_tkids = batch_tkids.to(device)

            # Format batch token ids to satisfy language model training format.
            batch_prev_tkids = batch_tkids[..., :-1]
            batch_next_tkids = batch_tkids[..., 1:]

            # Calculate loss using loss function.
            loss = model.loss_fn(
                batch_next_tkids=batch_next_tkids,
                batch_prev_tkids=batch_prev_tkids,
            )

            # Accumulate average loss.
            avg_loss += loss.item()

            # Backward pass / back propagation.
            loss.backward()

            # Perform gradient clipping to avoid gradient explosion.
            torch.nn.utils.clip_grad_norm_(
                model.parameters(),
                max_norm=args.max_norm,
            )

            # Gradient descent.
            optim.step()

            # Clean up gradients.
            # PyTorch accumulates gradients by default, so they must be reset each step.
            optim.zero_grad()

            # Increment global step.
            step += 1

            # Save checkpoint for each `ckpt_step` step.
            if step % args.ckpt_step == 0:
                model.save(ckpt=step, exp_name=args.exp_name)

            # Log performance for each `log_step` step.
            if step % args.log_step == 0:
                avg_loss = avg_loss / args.log_step

                # Log on CLI.
                tqdm_dldr.set_description(
                    f'epoch: {epoch}, loss: {avg_loss:.6f}', )

                # Log on tensorboard
                writer.add_scalar(
                    f'loss/{args.dset_name}/{args.ver}',
                    avg_loss,
                    step,
                )

                # Refresh log performance.
                pre_avg_loss = avg_loss
                avg_loss = 0.0

    # Save last checkpoint.
    model.save(ckpt=step, exp_name=args.exp_name)

    # Close tensorboard logger.
    writer.close()
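
The `no_decay` grouping above applies weight decay to weight matrices but not to biases or layer-norm parameters. Below is a minimal standalone sketch of the same grouping on a toy model; the model and hyper-parameter values are made up, and the attribute is named `LayerNorm` so the substring match behaves like the original.

import torch


class ToyModel(torch.nn.Module):
    r"""Tiny model whose parameter names mimic the `LayerNorm.weight` style."""

    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)


toy_model = ToyModel()
no_decay = ['bias', 'LayerNorm.weight']
optim_group_params = [
    {
        'params': [
            param for name, param in toy_model.named_parameters()
            if not any(nd in name for nd in no_decay)
        ],
        'weight_decay': 0.01,
    },
    {
        'params': [
            param for name, param in toy_model.named_parameters()
            if any(nd in name for nd in no_decay)
        ],
        'weight_decay': 0.0,
    },
]
optim = torch.optim.AdamW(optim_group_params, lr=1e-3)

# One decayed parameter (`linear.weight`), three un-decayed parameters.
print([len(group['params']) for group in optim.param_groups])  # [1, 3]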
def main(argv: List[str]) -> None:
    """Script entry point.

    Parameters
    ----------
    argv: list[str]
      List of CLI arguments.

    Returns
    -------
    None
    """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.ckpt],
                                             val_names=['-1', 'args.ckpt'])
    # `args.txt` validation.
    lmp.util.validate.raise_if_empty_str(val=args.txt, val_name='args.txt')

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=args.ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode.  This turns off dropout layers in the model.
    model = model.eval()

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Move model to running device.
    model = model.to(device)

    # Get inference method.
    infer = lmp.util.infer.create(**args.__dict__)

    # Generate text with specified inference method.
    txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

    # Output generated text.
    print(txt)

    # Free memory.  This is only needed for unit tests.
    del args
    del device
    del infer
    del model
    del model_cfg
    del tknzr
    del txt
    torch.cuda.empty_cache()
    gc.collect()
Example #6
def load_model(
        checkpoint: int,
        d_emb: int,
        d_hid: int,
        device: torch.device,
        dropout: float,
        experiment: str,
        model_class: str,
        num_linear_layers: int,
        num_rnn_layers: int,
        pad_token_id: int,
        vocab_size: int
) -> lmp.model.BaseRNNModel:
    r"""Helper function for constructing language model.

    Load model from pre-trained checkpoint when `checkpoint != -1`.

    Args:
        checkpoint:
            Pre-trained model's checkpoint.
        d_emb:
            Embedding matrix vector dimension.
        d_hid:
            Hidden dimension of recurrent layers.
        device:
            Model running device.
        dropout:
            Dropout probability on all layers' output (except the output layer).
        experiment:
            Name of the pre-trained experiment.
        num_rnn_layers:
            Number of recurrent layers to use.
        num_linear_layers:
            Number of Linear layers to use.
        pad_token_id:
            Padding token's id. Embedding layers will initialize padding
            token's vector with zeros.
        vocab_size:
            Embedding matrix vocabulary dimension.

    Raises:
        ValueError:
            If `model_class` is not supported.

    Returns:
        `lmp.model.BaseRNNModel` if `model_class == 'rnn'`;
        `lmp.model.GRUModel` if `model_class == 'gru'`;
        `lmp.model.LSTMModel` if `model_class == 'lstm'`.
    """

    if model_class == 'rnn':
        model = lmp.model.BaseRNNModel(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_rnn_layers=num_rnn_layers,
            num_linear_layers=num_linear_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )
    elif model_class == 'gru':
        model = lmp.model.GRUModel(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_rnn_layers=num_rnn_layers,
            num_linear_layers=num_linear_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )
    elif model_class == 'lstm':
        model = lmp.model.LSTMModel(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_rnn_layers=num_rnn_layers,
            num_linear_layers=num_linear_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )
    else:
        raise ValueError(
            f'model `{model_class}` is not supported.\nSupported options:' +
            ''.join(list(map(
                lambda option: f'\n\t--model {option}',
                [
                    'rnn',
                    'gru',
                    'lstm',
                ]
            )))
        )

    if checkpoint != -1:
        file_path = f'{lmp.path.DATA_PATH}/{experiment}/model-{checkpoint}.pt'
        if not os.path.exists(file_path):
            raise FileNotFoundError(f'file {file_path} does not exist.')
        model.load_state_dict(torch.load(file_path))

    return model.to(device)
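
The checkpoint handling above assumes files named `model-{checkpoint}.pt` that contain a bare `state_dict`. Below is a self-contained sketch of that save/load round trip with a toy model; the file name and model are illustrative only.

import os
import tempfile

import torch

toy_model = torch.nn.Linear(4, 4)

with tempfile.TemporaryDirectory() as tmp_dir:
    file_path = os.path.join(tmp_dir, 'model-100.pt')

    # Save only the parameters, which is what `load_model` expects to find.
    torch.save(toy_model.state_dict(), file_path)

    # Re-create a model with the same architecture, then restore parameters.
    restored = torch.nn.Linear(4, 4)
    restored.load_state_dict(torch.load(file_path))

print(torch.equal(toy_model.weight, restored.weight))  # True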
def load_model(
    checkpoint: int, d_emb: int, d_hid: int, device: torch.device,
    dropout: float, experiment: str, model_class: str, num_linear_layers: int,
    num_rnn_layers: int, pad_token_id: int, vocab_size: int
) -> Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]:
    r"""Helper function for constructing language model.

    Supported options:
        --model_class rnn
        --model_class gru
        --model_class lstm
        --model_class res_rnn
        --model_class res_gru
        --model_class res_lstm

    Load model from pre-trained checkpoint when `checkpoint != -1`.

    Args:
        checkpoint:
            Pre-trained model's checkpoint. Must be bigger than or equal to
            `-1`.
        d_emb:
            Embedding matrix vector dimension. Must be bigger than or equal to
            `1`.
        d_hid:
            Model layers hidden dimension. Must be bigger than or equal to
            `1`.
        device:
            Model running device.
        dropout:
            Dropout probability on all layers output (except output layer).
            Must range from `0.0` to `1.0`.
        experiment:
            Name of the pre-trained experiment. Must not be empty when
            `checkpoint != -1`.
        num_linear_layers:
            Number of Linear layers to use. Must be bigger than or equal to
            `1`.
        num_rnn_layers:
            Number of RNN layers to use. Must be bigger than or equal to
            `1`.
        pad_token_id:
            Padding token's id. Embedding layer will initialize padding
            token's vector with zeros. Must be bigger than or equal to `0`, and
            must be smaller than `vocab_size`.
        vocab_size:
            Embedding matrix vocabulary dimension. Must be bigger than or equal
            to `1`.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.
        ValueError:
            When one of the arguments does not follow its constraints. See
            the docstring for argument constraints.

    Returns:
        `lmp.model.BaseRNNModel` if `model_class == 'rnn'`;
        `lmp.model.GRUModel` if `model_class == 'gru'`;
        `lmp.model.LSTMModel` if `model_class == 'lstm'`;
        `lmp.model.BaseResRNNModel` if `model_class == 'res_rnn'`;
        `lmp.model.ResGRUModel` if `model_class == 'res_gru'`;
        `lmp.model.ResLSTMModel` if `model_class == 'res_lstm'`.
    """
    # Type check.
    if not isinstance(checkpoint, int):
        raise TypeError('`checkpoint` must be an instance of `int`.')

    if not isinstance(experiment, str):
        raise TypeError('`experiment` must be an instance of `str`.')

    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model_class, str):
        raise TypeError('`model_class` must be an instance of `str`.')

    # Value Check.
    if checkpoint < -1:
        raise ValueError('`checkpoint` must be bigger than or equal to `-1`.')

    if model_class == 'rnn':
        model = lmp.model.BaseRNNModel(d_emb=d_emb,
                                       d_hid=d_hid,
                                       dropout=dropout,
                                       num_rnn_layers=num_rnn_layers,
                                       num_linear_layers=num_linear_layers,
                                       pad_token_id=pad_token_id,
                                       vocab_size=vocab_size)

    elif model_class == 'gru':
        model = lmp.model.GRUModel(d_emb=d_emb,
                                   d_hid=d_hid,
                                   dropout=dropout,
                                   num_rnn_layers=num_rnn_layers,
                                   num_linear_layers=num_linear_layers,
                                   pad_token_id=pad_token_id,
                                   vocab_size=vocab_size)

    elif model_class == 'lstm':
        model = lmp.model.LSTMModel(d_emb=d_emb,
                                    d_hid=d_hid,
                                    dropout=dropout,
                                    num_rnn_layers=num_rnn_layers,
                                    num_linear_layers=num_linear_layers,
                                    pad_token_id=pad_token_id,
                                    vocab_size=vocab_size)

    elif model_class == 'res_rnn':
        model = lmp.model.BaseResRNNModel(d_emb=d_emb,
                                          d_hid=d_hid,
                                          dropout=dropout,
                                          num_rnn_layers=num_rnn_layers,
                                          num_linear_layers=num_linear_layers,
                                          pad_token_id=pad_token_id,
                                          vocab_size=vocab_size)

    elif model_class == 'res_gru':
        model = lmp.model.ResGRUModel(d_emb=d_emb,
                                      d_hid=d_hid,
                                      dropout=dropout,
                                      num_rnn_layers=num_rnn_layers,
                                      num_linear_layers=num_linear_layers,
                                      pad_token_id=pad_token_id,
                                      vocab_size=vocab_size)

    elif model_class == 'res_lstm':
        model = lmp.model.ResLSTMModel(d_emb=d_emb,
                                       d_hid=d_hid,
                                       dropout=dropout,
                                       num_rnn_layers=num_rnn_layers,
                                       num_linear_layers=num_linear_layers,
                                       pad_token_id=pad_token_id,
                                       vocab_size=vocab_size)

    else:
        raise ValueError(
            f'model `{model_class}` is not supported.\nSupported options:' +
            ''.join(
                list(
                    map(lambda option: f'\n\t--model_class {option}', [
                        'rnn',
                        'gru',
                        'lstm',
                        'res_rnn',
                        'res_gru',
                        'res_lstm',
                    ]))))

    if checkpoint != -1:
        if not experiment:
            raise ValueError('`experiment` must not be empty.')

        file_path = os.path.join(lmp.path.DATA_PATH, experiment,
                                 f'model-{checkpoint}.pt')

        if not os.path.exists(file_path):
            raise FileNotFoundError(f'File {file_path} does not exist.')

        model.load_state_dict(torch.load(file_path))

    return model.to(device)
Example #8
def analogy_inference(device: torch.device,
                      model: Union[lmp.model.BaseRNNModel,
                                   lmp.model.BaseResRNNModel],
                      tokenizer: lmp.tokenizer.BaseTokenizer, word_a: str,
                      word_b: str, word_c: str) -> str:
    r"""Generate analog word based on `word_a`, `word_b` and `word_c`.

    This function performs word analogy based on the following rule:
        `word_a` : `word_b` = `word_c` : `word_d`
    where `word_d` is the prediction target.

    Args:
        device:
            Model running device.
        model:
            Language model.
        tokenizer:
            Tokenizer converting tokens (including `word_a`, `word_b` and
            `word_c`) into token ids and converting token ids back to tokens
            (`word_d`). This is needed since we use a word embedding layer in
            our language model.
        word_a:
        word_b:
        word_c:
            Query words for word analogy.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        Predicted word following the word analogy.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model,
                      (lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel)):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.')

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    if not isinstance(word_a, str):
        raise TypeError('`word_a` must be an instance of `str`.')

    if not isinstance(word_b, str):
        raise TypeError('`word_b` must be an instance of `str`.')

    if not isinstance(word_c, str):
        raise TypeError('`word_c` must be an instance of `str`.')

    # Evaluation mode.
    model.eval()
    model = model.to(device)

    # Convert tokens (query words) into token ids.
    word_a_id = torch.LongTensor([tokenizer.convert_token_to_id(word_a)])
    word_b_id = torch.LongTensor([tokenizer.convert_token_to_id(word_b)])
    word_c_id = torch.LongTensor([tokenizer.convert_token_to_id(word_c)])

    # Perform analogy calculation.
    # Shape: `(1, E)`.
    out = (model.emb_layer(word_b_id.to(device)) -
           model.emb_layer(word_a_id.to(device)) +
           model.emb_layer(word_c_id.to(device)))

    # Calculate cosine similarity.
    # Shape: `(V)`.
    pred = torch.nn.functional.cosine_similarity(
        out,
        model.emb_layer.weight,
    )

    # Get the token id with maximum cosine similarity.
    # Shape: `(1)`.
    word_d_id = pred.argmax(dim=0).to('cpu').item()

    # Convert back to token.
    return tokenizer.convert_id_to_token(word_d_id)
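
The analogy above reduces to `emb(word_b) - emb(word_a) + emb(word_c)` followed by a cosine-similarity argmax over the embedding matrix. Below is a self-contained toy version with a random embedding table; vocabulary size and dimension are made up, so the predicted id is meaningless and only the shapes and the argmax step are illustrated.

import torch

vocab_size, d_emb = 10, 8
emb_layer = torch.nn.Embedding(vocab_size, d_emb)

word_a_id = torch.LongTensor([1])
word_b_id = torch.LongTensor([2])
word_c_id = torch.LongTensor([3])

# Analogy arithmetic.  Shape: `(1, E)`.
out = (emb_layer(word_b_id) - emb_layer(word_a_id) + emb_layer(word_c_id))

# Cosine similarity against every row of the embedding matrix.  Shape: `(V)`.
pred = torch.nn.functional.cosine_similarity(out, emb_layer.weight)

# Token id with the largest cosine similarity.
word_d_id = pred.argmax(dim=0).item()
print(word_d_id)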
def main() -> None:
    r"""Script entry point."""
    # Parse command-line argument.
    args = parse_arg()

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Load pre-trained model instance.
    model = lmp.util.model.load(
        ckpt=args.ckpt,
        tknzr=tknzr,
        **model_cfg.__dict__,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Encode text into token ids.
    # Wrap as a batch with only one sample since `model.ppl` only accepts batches.
    batch_tkids = tknzr.batch_enc(
        batch_txt=[args.txt],
        max_seq_len=model_cfg.max_seq_len,
    )

    # Convert token ids to `torch.Tensor` with `dtype == torch.int64`.
    batch_tkids = torch.LongTensor(batch_tkids)

    # Move tensors to model running device.
    batch_tkids = batch_tkids.to(device)

    # Format batch token ids to satisfy language model training format.
    batch_prev_tkids = batch_tkids[..., :-1]
    batch_next_tkids = batch_tkids[..., 1:]

    # Calculate perplexity.
    ppl = model.ppl(
        batch_next_tkids=batch_next_tkids,
        batch_prev_tkids=batch_prev_tkids,
    )

    # Output perplexity on given sample.
    print(ppl)
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])
  # `args.first_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.first_ckpt], val_names=['-1', 'args.first_ckpt'])
  # `args.last_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.last_ckpt], val_names=['-1', 'args.last_ckpt'])
  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

  # Get dataset instance and convert samples to tensor.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )

  dset_size = len(dset)

  # Mini-batch sampler.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed up host-to-device transfer (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size,
    dataset=dset,
    shuffle=False,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
  )

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Evaluate checkpoints within ranges.
  for ckpt in lmp.util.model.list_ckpts(exp_name=args.exp_name, first_ckpt=args.first_ckpt, last_ckpt=args.last_ckpt):
    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode.  This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Record average perplexity.
    avg_ppl = 0.0
    for batch_tkids in tqdm(data_loader):
      # The dataset already yields token id tensors; move them to the same running device as the model.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_cur_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Loop over token ids to get next token id prediction probability distribution.
      batch_prev_states = None
      batch_tkids_pd = []
      for i in range(batch_cur_tkids.size(1)):
        batch_next_tkids_pd, batch_prev_states = model.pred(
          batch_cur_tkids=batch_cur_tkids[:, i],
          batch_prev_states=batch_prev_states,
        )

        # Collect prediction probability distribution.
        batch_tkids_pd.append(batch_next_tkids_pd)

      # Calculate perplexity.
      batch_ppl = lmp.util.metric.ppl(batch_tkids=batch_next_tkids, batch_tkids_pd=torch.stack(batch_tkids_pd, dim=1))

      # Accumulate average perplexity.
      avg_ppl += (batch_ppl / dset_size).sum().item()

    # Log average perplexity on dataset to CLI and tensorboard.
    writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
    print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_ppl
  del batch_cur_tkids
  del batch_next_tkids
  del batch_next_tkids_pd
  del batch_ppl
  del batch_prev_states
  del batch_tkids
  del batch_tkids_pd
  del ckpt
  del data_loader
  del device
  del dset
  del dset_size
  del model
  del model_cfg
  del tknzr
  del writer
  torch.cuda.empty_cache()
  gc.collect()
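
The slicing `batch_tkids[..., :-1]` / `batch_tkids[..., 1:]` used throughout these scripts pairs every token with its successor as the prediction target. A tiny concrete example with made-up token ids:

import torch

# One toy sequence of token ids: [bos] 7 8 9 [eos].
batch_tkids = torch.LongTensor([[101, 7, 8, 9, 102]])

batch_cur_tkids = batch_tkids[..., :-1]   # [[101, 7, 8, 9]]  model inputs
batch_next_tkids = batch_tkids[..., 1:]   # [[7, 8, 9, 102]]  prediction targets

print(batch_cur_tkids.tolist())
print(batch_next_tkids.tolist())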
Example #11
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])
  # `args.ckpt_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.ckpt_step], val_names=['1', 'args.ckpt_step'])
  # `args.log_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.log_step], val_names=['1', 'args.log_step'])
  # `args.max_norm` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.max_norm], val_names=['0', 'args.max_norm'])
  # `args.n_epoch` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.n_epoch], val_names=['1', 'args.n_epoch'])
  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # Save training configuration.
  lmp.util.cfg.save(args=args, exp_name=args.exp_name)

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Load pre-trained tokenizer.
  tknzr = lmp.util.tknzr.load(exp_name=args.tknzr_exp_name)

  # Get dataset instance and convert samples to tensor.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )

  # Mini-batch random sampler.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed up host-to-device transfer (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size,
    dataset=dset,
    shuffle=True,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
  )

  # Get new model instance and move model to running device.
  model = lmp.util.model.create(tknzr=tknzr, **args.__dict__)
  model = model.train()
  model = model.to(device)

  # Get new optimizer instance.
  optim = lmp.util.optim.get_optimizer(
    beta1=args.beta1,
    beta2=args.beta2,
    eps=args.eps,
    lr=args.lr,
    model=model,
    wd=args.wd,
  )

  # Get learning rate scheduler.
  schdl = lmp.util.optim.get_scheduler(
    optim=optim,
    total_step=args.n_epoch * len(data_loader),
    warmup_step=args.warmup_step,
  )

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Loss statistics for logging.
  pre_avg_loss = 0.0
  avg_loss = 0.0

  # Global optimization step.
  step = 0
  for epoch in range(args.n_epoch):
    tqdm_data_loader = tqdm(data_loader, desc=f'epoch: {epoch}, loss: {pre_avg_loss:.6f}', dynamic_ncols=True)
    for batch_tkids in tqdm_data_loader:
      # The dataset already yields batch token id tensors; move them to the same running device as the model.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_cur_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Calculate loss using loss function.
      loss = model(batch_cur_tkids=batch_cur_tkids, batch_next_tkids=batch_next_tkids)

      # Accumulate average loss for logging.  Use `.item()` to avoid keeping the computation graph.
      avg_loss += loss.item()

      # Perform backward pass / back propagation.
      loss.backward()

      # Perform gradient clipping to avoid gradient explosion.
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.max_norm)

      # Gradient descent.
      optim.step()

      # Update learning rate.
      schdl.step()

      # Clean up gradient.
      optim.zero_grad()

      # Increment global step.
      step += 1

      # Save checkpoint every `ckpt_step` steps.  We save a CPU copy so the checkpoint is device-agnostic.
      if step % args.ckpt_step == 0:
        lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

      # Log performance for each `log_step` step.
      if step % args.log_step == 0:
        avg_loss = avg_loss / args.log_step

        # Log on CLI.
        tqdm_data_loader.set_description(f'epoch: {epoch}, loss: {avg_loss:.6f}')

        # Log on tensorboard.
        writer.add_scalar(f'train-loss/{args.dset_name}/{args.ver}', avg_loss, step)
        writer.add_scalar('lr', schdl.get_last_lr()[0], step)

        # Refresh log performance.
        pre_avg_loss = avg_loss
        avg_loss = 0.0

  # Save last checkpoint.
  lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

  # Close tensorboard logger.
  writer.close()

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_loss
  del batch_cur_tkids
  del batch_next_tkids
  del batch_tkids
  del data_loader
  del device
  del dset
  del loss
  del model
  del optim
  del pre_avg_loss
  del schdl
  del step
  del tknzr
  del tqdm_data_loader
  del writer
  torch.cuda.empty_cache()
  gc.collect()
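
The exact schedule behind `lmp.util.optim.get_scheduler` is not shown in this example. A common choice that matches its signature is linear warm-up followed by linear decay; the sketch below assumes that shape and is not the repository's actual implementation.

import torch


def get_scheduler_sketch(optim, total_step: int, warmup_step: int):
  """Linearly warm the learning rate up for `warmup_step` steps, then decay it to zero."""

  def lr_lambda(step: int) -> float:
    if step < warmup_step:
      return step / max(1, warmup_step)
    return max(0.0, (total_step - step) / max(1, total_step - warmup_step))

  return torch.optim.lr_scheduler.LambdaLR(optim, lr_lambda=lr_lambda)


# Usage on a toy parameter.
param = torch.nn.Parameter(torch.zeros(1))
optim = torch.optim.AdamW([param], lr=1e-3)
schdl = get_scheduler_sketch(optim=optim, total_step=100, warmup_step=10)
for _ in range(3):
  optim.step()
  schdl.step()
print(schdl.get_last_lr())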
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])
  # `args.ckpt_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.ckpt_step], val_names=['1', 'args.ckpt_step'])
  # `args.log_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.log_step], val_names=['1', 'args.log_step'])
  # `args.max_norm` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.max_norm], val_names=['0', 'args.max_norm'])
  # `args.n_epoch` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.n_epoch], val_names=['1', 'args.n_epoch'])
  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )
  # `args.world_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.world_size], val_names=['0', 'args.world_size'])
  # `args.local_rank` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.local_rank], val_names=['0', 'args.local_rank'])
  # `args.rank` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.rank, args.world_size - 1],
    val_names=['0', 'args.rank', 'args.world_size - 1'],
  )

  # Save training configuration.  Only the main process needs to save the configuration.
  if args.rank == HOST_RANK:
    lmp.util.cfg.save(args=args, exp_name=args.exp_name)

  # We use a TCP-based key-value store to coordinate processes and sync arguments.  Timeout is set to 5 minutes.
  store = dist.TCPStore(
    is_master=args.rank == HOST_RANK,
    host_name=args.host_name,
    port=args.host_port,
    timeout=timedelta(minutes=5),
    world_size=args.world_size,
  )

  # Use NCCL backend to perform CUDA collectives.
  dist.init_process_group(
    backend=dist.Backend.NCCL,
    store=store,
    rank=args.rank,
    timeout=timedelta(minutes=5),
    world_size=args.world_size,
  )

  # Sync arguments.
  dist_args_k = ['host_name', 'host_port', 'local_rank', 'rank', 'world_size']
  for k in args.__dict__.keys():
    if k in dist_args_k:
      continue

    # Host broadcasts arguments.
    if args.rank == HOST_RANK:
      store.set(k, str(args.__dict__[k]))
    # Non-host processes receive host arguments.
    else:
      v = store.get(k)
      if isinstance(args.__dict__[k], str):
        args.__dict__[k] = v.decode('utf-8')
      else:
        args.__dict__[k] = type(args.__dict__[k])(v)

  # Set random seed for reproducibility.  Note that each process uses a different seed to get a different slice of the batch.
  lmp.util.rand.set_seed(seed=args.seed + args.rank)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device(f'cuda:{args.local_rank}')

  # Load pre-trained tokenizer.
  tknzr = lmp.util.tknzr.load(exp_name=args.tknzr_exp_name)

  # Get dataset instance and convert samples to tensor.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )

  # Mini-batch sampler.  Each process will get batches exclusive to itself.
  dist_sampler = torch.utils.data.distributed.DistributedSampler(
    num_replicas=args.world_size,
    rank=args.rank,
    dataset=dset,
    shuffle=True,
  )

  # Mini-batch data loader using the distributed random sampler above.  We set `persistent_workers = True` only when
  # `args.n_worker > 0`.  We set `pin_memory = True` to speed up host-to-device transfer (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size // args.world_size,
    dataset=dset,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
    sampler=dist_sampler,
  )

  # Get new model instance and move model to running device.
  model = lmp.util.model.create(tknzr=tknzr, **args.__dict__)
  model = model.train()
  model = model.to(device)

  # Get new optimizer instance.
  optim = lmp.util.optim.get_optimizer(
    beta1=args.beta1,
    beta2=args.beta2,
    eps=args.eps,
    lr=args.lr,
    model=model,
    wd=args.wd,
  )

  # Get learning rate scheduler.
  schdl = lmp.util.optim.get_scheduler(
    optim=optim,
    total_step=args.n_epoch * len(data_loader),
    warmup_step=args.warmup_step,
  )

  # Create DDP model.
  ddp_model = torch.nn.parallel.DistributedDataParallel(model)

  # Get tensorboard logger instance.  Only the main process needs to log performance.
  if args.rank == HOST_RANK:
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)
  else:
    writer = None

  # Loss statistics for logging.
  pre_avg_loss = 0.0
  avg_loss = 0.0

  # Global optimization step.
  step = 0
  for epoch in range(args.n_epoch):
    # Update random sample order.
    dist_sampler.set_epoch(epoch)

    # Processes can receive an uneven number of batches.  Thus one must use `ddp_model.join()` to avoid deadlock.
    with ddp_model.join():
      tqdm_data_loader = tqdm(data_loader, desc=f'epoch: {epoch}, loss: {pre_avg_loss:.6f}', dynamic_ncols=True)
      for batch_tkids in tqdm_data_loader:
        # The dataset already yields batch token id tensors; move them to the same running device as the model.
        batch_tkids = batch_tkids.to(device)

        # Format batch token ids to satisfy language model training format.
        batch_cur_tkids = batch_tkids[..., :-1]
        batch_next_tkids = batch_tkids[..., 1:]

        # Calculate loss using loss function.
        loss = ddp_model(batch_cur_tkids=batch_cur_tkids, batch_next_tkids=batch_next_tkids)

        # Accumulate average loss for logging.  Use `.item()` to avoid keeping the computation graph.
        avg_loss += loss.item()

        # Perform backward pass / back propagation.
        loss.backward()

        # Perform gradient clipping to avoid gradient explosion.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.max_norm)

        # Gradient descent.
        optim.step()

        # Update learning rate.
        schdl.step()

        # Clean up gradient.
        optim.zero_grad()

        # Increment global step.
        step += 1

        # Save checkpoint every `ckpt_step` steps.  We save a CPU copy so the checkpoint is device-agnostic.  Only
        # the main process needs to save checkpoints.
        if args.rank == HOST_RANK and step % args.ckpt_step == 0:
          lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

        # Log performance for each `log_step` step.
        if step % args.log_step == 0:
          avg_loss = avg_loss / args.log_step

          # Log on CLI.
          tqdm_data_loader.set_description(f'epoch: {epoch}, loss: {avg_loss:.6f}')

          # Log on tensorboard.  Only the main process needs to log performance.
          if args.rank == HOST_RANK:
            writer.add_scalar(f'train-loss/{args.dset_name}/{args.ver}', avg_loss, step)
            writer.add_scalar('lr', schdl.get_last_lr()[0], step)

          # Refresh log performance.
          pre_avg_loss = avg_loss
          avg_loss = 0.0

  # Save last checkpoint.  Only the main process needs to save the checkpoint.
  if args.rank == HOST_RANK:
    lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

    # Close tensorboard logger.
    writer.close()

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_loss
  del batch_cur_tkids
  del batch_next_tkids
  del batch_tkids
  del data_loader
  del device
  del dist_args_k
  del dist_sampler
  del ddp_model
  del dset
  del loss
  del model
  del optim
  del pre_avg_loss
  del schdl
  del step
  del store
  del tknzr
  del tqdm_data_loader
  del writer
  torch.cuda.empty_cache()
  gc.collect()
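
Both training loops clip the global gradient norm before `optim.step()`. A standalone sketch of what `clip_grad_norm_` does on a toy parameter (the `max_norm` value is arbitrary):

import torch

param = torch.nn.Parameter(torch.ones(4))
loss = (10.0 * param).sum()  # gradient of every entry is 10, total norm is 20
loss.backward()

# Returns the total norm before clipping, then rescales gradients in place.
total_norm = torch.nn.utils.clip_grad_norm_([param], max_norm=1.0)
print(total_norm)         # 20.0
print(param.grad.norm())  # ~1.0 after clipping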
Example #13
def perplexity_eval(device: torch.device,
                    model: Union[lmp.model.BaseRNNModel,
                                 lmp.model.BaseResRNNModel], sequence: str,
                    tokenizer: lmp.tokenizer.BaseTokenizer) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation. Must not be empty.
        tokenizer:
            Tokenizer for encoding sequence.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        Perplexity of `sequence`.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model,
                      (lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel)):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.')

    if not isinstance(sequence, str):
        raise TypeError('`sequence` must be an instance of `str`.')

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if not sequence:
        raise ValueError('`sequence` must not be empty.')

    # Evaluation mode.
    model.eval()
    model = model.to(device)

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # `sequence[:-2]` keeps `[bos]` and the first `S - 1` tokens as model input;
    # `[eos]` and the final token are never used as inputs. `x.shape = (S)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # `y.shape = (S)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalized by length.
    nll = nll / x.size(0)

    # Take the exponential to cancel the logarithm.
    return nll.exp().item()
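
The last few lines compute the exponential of the length-normalized negative log-likelihood. A short worked check of the same steps with made-up target probabilities:

import math

# Probability the model assigned to each target token (made-up numbers).
target_probs = [0.5, 0.25, 0.125]

# Accumulate negative log-likelihood, normalize by length, exponentiate.
nll = -sum(math.log(p) for p in target_probs)
nll = nll / len(target_probs)
ppl = math.exp(nll)

# exp(mean(-log p)) is the reciprocal of the geometric mean of the probabilities.
print(ppl)  # 4.0 == 1 / (0.5 * 0.25 * 0.125) ** (1 / 3)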