def test_config_file_exist(
        exp_name: str,
        file_path: str,
        subclss_tknzr: BaseTknzr,
):
    r"""Save configuration as file."""
    subclss_tknzr.save(exp_name)
    assert os.path.exists(file_path)
def test_config_file_format(
        exp_name: str,
        file_path: str,
        subclss_tknzr: BaseTknzr,
):
    r"""Save configuration must be JSON format."""
    subclss_tknzr.save(exp_name)
    with open(file_path, 'r', encoding='utf-8') as input_file:
        # Raise error if not valid JSON.
        assert json.load(input_file)
def test_load_result(
        exp_name: str,
        subclss_tknzr: BaseTknzr,
        subclss_tknzr_clss: Type[BaseTknzr],
):
    r"""Ensure configuration consistency between save and load."""
    subclss_tknzr.save(exp_name)
    load_tknzr = subclss_tknzr_clss.load(exp_name)

    assert subclss_tknzr.is_uncased == load_tknzr.is_uncased
    assert subclss_tknzr.id2tk == load_tknzr.id2tk
    assert subclss_tknzr.max_vocab == load_tknzr.max_vocab
    assert subclss_tknzr.min_count == load_tknzr.min_count
    assert subclss_tknzr.tk2id == load_tknzr.tk2id
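# A minimal sketch of the save/load round trip the tests above exercise, assuming a
# hypothetical `ToyTknzr` that stores its configuration as JSON under an experiment
# directory.  The real `BaseTknzr` API and file layout may differ; this only illustrates
# the contract the assertions check: the file exists, it is valid JSON, and the
# attributes survive a reload.
import json
import os


class ToyTknzr:
    def __init__(self, is_uncased: bool, max_vocab: int, min_count: int):
        self.is_uncased = is_uncased
        self.max_vocab = max_vocab
        self.min_count = min_count
        self.tk2id = {'[unk]': 0}
        self.id2tk = {0: '[unk]'}

    def save(self, exp_name: str) -> None:
        # Hypothetical layout: configuration lives at `exp/<exp_name>/tknzr.json`.
        file_dir = os.path.join('exp', exp_name)
        os.makedirs(file_dir, exist_ok=True)
        cfg = {
            'is_uncased': self.is_uncased,
            'max_vocab': self.max_vocab,
            'min_count': self.min_count,
            'tk2id': self.tk2id,
        }
        with open(os.path.join(file_dir, 'tknzr.json'), 'w', encoding='utf-8') as f:
            json.dump(cfg, f)

    @classmethod
    def load(cls, exp_name: str) -> 'ToyTknzr':
        with open(os.path.join('exp', exp_name, 'tknzr.json'), 'r', encoding='utf-8') as f:
            cfg = json.load(f)
        tknzr = cls(cfg['is_uncased'], cfg['max_vocab'], cfg['min_count'])
        tknzr.tk2id = cfg['tk2id']
        tknzr.id2tk = {tkid: tk for tk, tkid in cfg['tk2id'].items()}
        return tknzr
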
    def gen(self, model: BaseModel, tknzr: BaseTknzr, txt: str) -> str:
        """Generate continual text conditioned on given text segment.

    Top-P inference algorithm is structured as follows:

    #. Encode input text as 1 sample batch.
    #. Remove token ids after ``[eos]`` since model is not trained to predict tokens after seeing ``[eos]``.
    #. Loop over conditioned token ids to generate conditioned hidden states.
    #. Loop to generate token ids.  In each iteration, the generated token id is chosen from the :math:`k`
       most probable token ids of the next token id prediction probability distribution, where :math:`k` is the
       number of token ids whose cumulative probability (after sorting probabilities in descending order) is less
       than or equal to ``self.p``.  The generation loop stops early if ``[eos]`` is generated; otherwise it stops
       only when the maximum length constraint enforced by ``self.max_seq_len`` is reached.
    #. Decode generated token ids into text and return.

    Parameters
    ----------
    model: lmp.model.BaseModel
      Pre-trained language model which will be used to generate text.
    tknzr: lmp.tknzr.BaseTknzr
      Pre-trained tokenizer which performs text encoding and decoding.
    txt: str
      Text segment which the generation process is conditioned on.

    Returns
    -------
    str
      Generated text.
    """
        # Get model running device.
        device = next(model.parameters()).device

        # Encode as 1 sample batch.  We convert token ids to tensor and move tensor to the same running device as model.
        # shape: (1, max_seq_len).
        batch_cur_tkids = torch.LongTensor(
            tknzr.batch_enc(batch_txt=[txt],
                            max_seq_len=self.max_seq_len)).to(device)

        # Remove token ids after `[eos]` since model is not trained to predict tokens after seeing `[eos]`.
        mask = (batch_cur_tkids == EOS_TKID) | (batch_cur_tkids == PAD_TKID)
        seq_len = batch_cur_tkids.size(1) - mask.sum()
        batch_cur_tkids = batch_cur_tkids[:, :seq_len]

        # Loop over conditioned token ids to generate conditioned hidden states.
        batch_prev_states = None
        for i in range(seq_len - 1):
            _, batch_prev_states = model.pred(
                batch_cur_tkids=batch_cur_tkids[:, i],
                batch_prev_states=batch_prev_states)

        # Calculate how many tokens can be generated at most.
        out_seq_len = self.max_seq_len - seq_len + 1

        # Generate token ids.
        batch_cur_tkids = batch_cur_tkids[:, -1]
        gen_tkids: List[int] = []
        for _ in range(out_seq_len):
            # Get next token id prediction probability distribution.
            # shape: (1, vocab_size)
            batch_next_tkids_pd, batch_prev_states = model.pred(
                batch_cur_tkids=batch_cur_tkids,
                batch_prev_states=batch_prev_states,
            )

            # Sort the probability distribution in descending order.
            # shape: (1, vocab_size).
            batch_next_tkids_sort_pd, batch_next_tkids_sort = batch_next_tkids_pd.sort(
                dim=1, descending=True)

            # Calculate the cumulative probability distribution and count how many token ids have cumulative
            # probability less than or equal to the threshold `self.p`.
            k = int((batch_next_tkids_sort_pd.cumsum(dim=1) <=
                     self.p).sum().item())

            # Sometimes the highest probability alone is larger than `self.p`, which means the model is highly
            # confident about the next token id.  In that case the calculation above results in `k == 0`, so we
            # keep only the token id with the highest probability by setting `k = 1`.
            if k == 0:
                k = 1

            # The first `k` token ids in `batch_next_tkids_sort` have cumulative probability less than or equal to
            # `self.p`.  We fetch them and sample among them.
            # shape: (1, k)
            batch_next_tkids_sort_pd = batch_next_tkids_sort_pd[:, :k]
            batch_next_tkids_sort = batch_next_tkids_sort[:, :k]

            # Use the top-`k` probabilities to construct a multinomial distribution, then sample a token id from it
            # as the next token id prediction result.
            # `batch_next_tkids_topk_sample` shape: (1, 1).
            batch_next_tkids_topk_sample = torch.multinomial(
                batch_next_tkids_sort_pd, num_samples=1)

            # Use sampled result to fetch next token id prediction.
            # shape: (1).
            batch_next_tkids = torch.gather(
                input=batch_next_tkids_sort,
                dim=1,
                index=batch_next_tkids_topk_sample,
            ).squeeze(1)
            gen_tkid = int(batch_next_tkids.item())
            gen_tkids.append(gen_tkid)

            # Update input token ids.
            batch_cur_tkids = batch_next_tkids

            # If the predicted token id is `[eos]`, stop generation immediately.
            if gen_tkid == EOS_TKID:
                break

        # Output generated text.
        return tknzr.batch_dec(batch_tkids=[gen_tkids], rm_sp_tks=True)[0]
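# A self-contained sketch of the top-p (nucleus) filtering step used above, applied to a
# plain probability vector instead of model output.  The function name and example values
# are illustrative only, not part of the original code.
import torch


def top_p_sample(pd: torch.Tensor, p: float) -> int:
    """Sample one id from the smallest prefix of ids whose sorted probabilities sum to <= `p`."""
    # Sort probabilities in descending order, keeping the original ids.
    sort_pd, sort_ids = pd.sort(dim=-1, descending=True)
    # Count ids inside the nucleus.  Keep at least one id so sampling still works when the
    # top probability alone already exceeds `p`.
    k = max(int((sort_pd.cumsum(dim=-1) <= p).sum().item()), 1)
    # `torch.multinomial` accepts unnormalized non-negative weights, so no renormalization
    # is needed before sampling.
    sample = torch.multinomial(sort_pd[:k], num_samples=1)
    return int(sort_ids[sample].item())


# Example: with p=0.9 only the two most probable ids (0.5 + 0.3 = 0.8 <= 0.9) can be drawn.
print(top_p_sample(torch.tensor([0.5, 0.3, 0.15, 0.05]), p=0.9))
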
    def gen(self, model: BaseModel, tknzr: BaseTknzr, txt: str) -> str:
        """Generate continual text conditioned on given text segment.

    Top-K inference algorithm is structured as follows:

    #. Encode input text as 1 sample batch.
    #. Remove token ids after ``[eos]`` since model is not trained to predict tokens after seeing ``[eos]``.
    #. Loop over conditioned token ids to generate conditioned hidden states.
    #. Loop to generate token ids.  In each iteration, the generated token id is chosen from the top-K most
       probable token ids of the next token id prediction probability distribution.  The generation loop stops
       early if ``[eos]`` is generated; otherwise it stops only when the maximum length constraint enforced by
       ``self.max_seq_len`` is reached.
    #. Decode generated token ids into text and return.

    Parameters
    ----------
    model: lmp.model.BaseModel
      Pre-trained language model which will be used to generate text.
    tknzr: lmp.tknzr.BaseTknzr
      Pre-trained tokenizer which performs text encoding and decoding.
    txt: str
      Text segment which the generation process is conditioned on.

    Returns
    -------
    str
      Generated text.
    """
        # Get model running device.
        device = next(model.parameters()).device

        # Encode as 1 sample batch.  We convert token ids to tensor and move tensor to the same running device as model.
        # shape: (1, max_seq_len).
        batch_cur_tkids = torch.LongTensor(
            tknzr.batch_enc(batch_txt=[txt],
                            max_seq_len=self.max_seq_len)).to(device)

        # Remove token ids after `[eos]` since model is not trained to predict tokens after seeing `[eos]`.
        mask = (batch_cur_tkids == EOS_TKID) | (batch_cur_tkids == PAD_TKID)
        seq_len = batch_cur_tkids.size(1) - mask.sum()
        batch_cur_tkids = batch_cur_tkids[:, :seq_len]

        # Loop over conditioned token ids to generate conditioned hidden states.
        batch_prev_states = None
        for i in range(seq_len - 1):
            _, batch_prev_states = model.pred(
                batch_cur_tkids=batch_cur_tkids[:, i],
                batch_prev_states=batch_prev_states)

        # Calculate how many tokens can be generated at most.
        out_seq_len = self.max_seq_len - seq_len + 1

        # Generate token ids.
        batch_cur_tkids = batch_cur_tkids[:, -1]
        gen_tkids: List[int] = []
        for _ in range(out_seq_len):
            # Get next token id prediction probability distribution.
            # shape: (1, vocab_size)
            batch_next_tkids_pd, batch_prev_states = model.pred(
                batch_cur_tkids=batch_cur_tkids,
                batch_prev_states=batch_prev_states,
            )

            # Get top-K highest probabilities from next token id prediction probability distribution.
            # shape: (1, k).
            batch_next_tkids_topk_p, batch_next_tkids_topk = batch_next_tkids_pd.topk(
                k=self.k, dim=-1)

            # Use the top-K probabilities to construct a multinomial distribution, then sample a token id from it
            # as the next token id prediction result.
            # `batch_next_tkids_topk_sample` shape: (1, 1).
            batch_next_tkids_topk_sample = torch.multinomial(
                batch_next_tkids_topk_p, num_samples=1)

            # Use sampled result to fetch next token id prediction.
            # shape: (1).
            batch_next_tkids = torch.gather(
                input=batch_next_tkids_topk,
                dim=1,
                index=batch_next_tkids_topk_sample,
            ).squeeze(1)
            gen_tkid = int(batch_next_tkids.item())
            gen_tkids.append(gen_tkid)

            # Update input token ids.
            batch_cur_tkids = batch_next_tkids

            # If the predicted token id is `[eos]`, stop generation immediately.
            if gen_tkid == EOS_TKID:
                break

        # Output generated text.
        return tknzr.batch_dec(batch_tkids=[gen_tkids], rm_sp_tks=True)[0]
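# A self-contained sketch of the top-k sampling step used above, applied to a plain
# probability vector instead of model output.  The function name and example values are
# illustrative only, not part of the original code.
import torch


def top_k_sample(pd: torch.Tensor, k: int) -> int:
    """Sample one id from the `k` most probable ids."""
    # Keep only the `k` highest probabilities along with their original ids.
    topk_p, topk_ids = pd.topk(k=k, dim=-1)
    # `torch.multinomial` accepts unnormalized non-negative weights, so the kept
    # probabilities can be used directly as sampling weights.
    sample = torch.multinomial(topk_p, num_samples=1)
    return int(topk_ids[sample].item())


# Example: with k=2 only the two most probable ids can ever be drawn.
print(top_k_sample(torch.tensor([0.5, 0.3, 0.15, 0.05]), k=2))
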
def test_arguments() -> None:
    """Must have correct arguments."""
    parser = argparse.ArgumentParser()
    BaseTknzr.add_CLI_args(parser=parser)
    assert parser.parse_args([]) == argparse.Namespace()
def test_lower_case(subclss_tknzr: BaseTknzr, case_txt: Dict[str, str]):
    r"""Test output text is convert to lower case."""
    if subclss_tknzr.is_uncased:
        assert subclss_tknzr.norm(case_txt['input']) == case_txt['output']
    else:
        assert subclss_tknzr.norm(case_txt['input']) == case_txt['input']
def test_strip_whitespace(subclss_tknzr: BaseTknzr, htws_txt: Dict[str, str]):
    r"""Test output text is stripped."""
    assert subclss_tknzr.norm(htws_txt['input']) == htws_txt['output']
def test_collapse_whitespace(subclss_tknzr: BaseTknzr, cws_txt: Dict[str, str]):
    r"""Test output text collapses consecutive whitespace."""
    assert subclss_tknzr.norm(cws_txt['input']) == cws_txt['output']
def test_nfkc(subclss_tknzr: BaseTknzr, non_nfkc_txt: Dict[str, str]):
    r"""Test output text is normalized with NFKC."""
    assert subclss_tknzr.norm(non_nfkc_txt['input']) == non_nfkc_txt['output']
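# A minimal sketch of a `norm` implementation consistent with the tests above: NFKC
# normalization, optional lower casing when `is_uncased` is set, whitespace collapsing,
# and stripping.  This is an assumption about what `BaseTknzr.norm` does, not its actual
# implementation.
import re
import unicodedata


def norm(txt: str, is_uncased: bool) -> str:
    # NFKC maps compatibility characters to their canonical forms.
    txt = unicodedata.normalize('NFKC', txt)
    # Collapse consecutive whitespace into single spaces and strip both ends.
    txt = re.sub(r'\s+', ' ', txt).strip()
    # Case folding only applies to uncased tokenizers.
    if is_uncased:
        txt = txt.lower()
    return txt


# Example usage matching the whitespace and casing tests.
assert norm('  a  b\tc ', is_uncased=False) == 'a b c'
assert norm('ABC', is_uncased=True) == 'abc'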