Example #1
    def setup_enc_dec_tokenizers(
        self,
        encoder_tokenizer_name=None,
        encoder_tokenizer_model=None,
        encoder_bpe_dropout=0.0,
        decoder_tokenizer_name=None,
        decoder_tokenizer_model=None,
        decoder_bpe_dropout=0.0,
    ):

        if encoder_tokenizer_name != 'yttm' or decoder_tokenizer_name != 'yttm':
            raise NotImplementedError(
                "Currently we only support the yttm tokenizer.")

        self.encoder_tokenizer = get_tokenizer(
            tokenizer_name=encoder_tokenizer_name,
            tokenizer_model=self.register_artifact(
                "cfg.encoder_tokenizer.tokenizer_model",
                encoder_tokenizer_model),
            bpe_dropout=encoder_bpe_dropout,
        )
        self.decoder_tokenizer = get_tokenizer(
            tokenizer_name=decoder_tokenizer_name,
            tokenizer_model=self.register_artifact(
                "cfg.decoder_tokenizer.tokenizer_model",
                decoder_tokenizer_model),
            bpe_dropout=decoder_bpe_dropout,
        )
Example #2
    def get_enc_dec_tokenizers(
        encoder_tokenizer_name=None,
        encoder_tokenizer_model=None,
        encoder_bpe_dropout=0.0,
        decoder_tokenizer_name=None,
        decoder_tokenizer_model=None,
        decoder_bpe_dropout=0.0,
    ):

        if encoder_tokenizer_name != 'yttm' or decoder_tokenizer_name != 'yttm':
            raise NotImplementedError(
                "Currently we only support the yttm tokenizer.")

        encoder_tokenizer = get_tokenizer(
            tokenizer_name=encoder_tokenizer_name,
            tokenizer_model=encoder_tokenizer_model,
            bpe_dropout=encoder_bpe_dropout,
        )
        decoder_tokenizer = get_tokenizer(
            tokenizer_name=decoder_tokenizer_name,
            tokenizer_model=decoder_tokenizer_model,
            bpe_dropout=decoder_bpe_dropout,
        )

        return encoder_tokenizer, decoder_tokenizer
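
A minimal usage sketch for the standalone helper above, assuming two hypothetical YTTM BPE model files trained beforehand (the paths and the dropout value are placeholders):

    encoder_tokenizer, decoder_tokenizer = get_enc_dec_tokenizers(
        encoder_tokenizer_name='yttm',
        encoder_tokenizer_model='tokenizer.encoder.BPE.model',  # hypothetical path
        encoder_bpe_dropout=0.1,
        decoder_tokenizer_name='yttm',
        decoder_tokenizer_model='tokenizer.decoder.BPE.model',  # hypothetical path
        decoder_bpe_dropout=0.1,
    )
    # NeMo tokenizers follow the TokenizerSpec interface, e.g. text_to_ids / ids_to_text
    src_ids = encoder_tokenizer.text_to_ids("hello world")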
Example #3
 def test_get_pretrained_arabic_model(self):
     model_name = 'asafaya/bert-base-arabic'
     self.omega_conf.language_model.pretrained_model_name = model_name
     model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
     assert isinstance(model, nemo_nlp.modules.BertModule)
     tokenizer = get_tokenizer(tokenizer_name=model_name)
     assert isinstance(tokenizer, AutoTokenizer)
Example #4
    def setup_tokenizer(self, cfg: DictConfig):
        """Instantiates tokenizer based on config and registers tokenizer artifacts.

           If the model is being restored from a .nemo file, then the tokenizer.vocab_file
           will be used (if it exists).

           Otherwise, we will use the vocab file provided in the config (if it exists).

           Finally, if no vocab file is given (this happens frequently when using HF),
           we will attempt to extract the vocab from the tokenizer object and then register it.

        Args:
            cfg (DictConfig): Tokenizer config
        """

        if self._is_model_being_restored() and os.path.exists('tokenizer.vocab_file'):
            # model is being restored from .nemo file so tokenizer.vocab_file has precedence
            vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src='tokenizer.vocab_file')
        elif cfg.vocab_file is not None:
            # use vocab file from config
            vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src=cfg.vocab_file)
        else:
            vocab_file = None

        tokenizer = get_tokenizer(
            tokenizer_name=cfg.tokenizer_name,
            vocab_file=vocab_file,
            special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
            tokenizer_model=self.register_artifact(config_path='tokenizer.tokenizer_model', src=cfg.tokenizer_model),
        )
        self.tokenizer = tokenizer

        if vocab_file is None:
            # when there is no vocab file we try to get the vocab from the tokenizer and register it
            self._register_vocab_from_tokenizer(vocab_file_config_path='tokenizer.vocab_file', cfg=cfg)
Example #5
 def test_get_pretrained_chinese_bert_wwm_model(self):
     model_name = 'hfl/chinese-bert-wwm'
     self.omega_conf.language_model.pretrained_model_name = model_name
     model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
     assert isinstance(model, nemo_nlp.modules.BertModule)
     tokenizer = get_tokenizer(tokenizer_name=model_name)
     assert isinstance(tokenizer, AutoTokenizer)
Example #6
 def setup_tokenizer(self, cfg: DictConfig):
     tokenizer = get_tokenizer(
         tokenizer_name=cfg.tokenizer_name,
         tokenizer_model=cfg.tokenizer_model,
         special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
         vocab_file=cfg.vocab_file,
     )
     return tokenizer

 def _setup_tokenizer(self, cfg: DictConfig):
     tokenizer = get_tokenizer(
         tokenizer_name=cfg.tokenizer_name,
         vocab_file=self.register_artifact(config_path='tokenizer.vocab_file', src=cfg.vocab_file),
         special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
         tokenizer_model=self.register_artifact(config_path='tokenizer.tokenizer_model', src=cfg.tokenizer_model),
     )
     self.tokenizer = tokenizer
Example #8
    def get_monolingual_tokenizer(
        tokenizer_name=None, tokenizer_model=None, bpe_dropout=0.0,
    ):
        if tokenizer_name == 'yttm':
            if bpe_dropout is None:
                bpe_dropout = 0.0
            tokenizer = get_tokenizer(
                tokenizer_name=tokenizer_name, tokenizer_model=tokenizer_model, bpe_dropout=bpe_dropout,
            )
        elif tokenizer_name == 'sentencepiece':
            tokenizer = SentencePieceTokenizer(model_path=tokenizer_model)
        else:
            try:
                tokenizer = get_tokenizer(tokenizer_name, special_tokens={"pad_token": "[PAD]"})
            except Exception as e:
                raise ValueError(f'{tokenizer_name} is not supported by either NeMo or HuggingFace. {e}')

        return tokenizer
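
For reference, a brief usage sketch exercising each branch of the helper above (the model paths are hypothetical; 'bert-base-uncased' stands in for any HuggingFace tokenizer name):

    # YTTM branch
    yttm_tokenizer = get_monolingual_tokenizer(
        tokenizer_name='yttm', tokenizer_model='tokenizer.BPE.model', bpe_dropout=0.1,
    )

    # SentencePiece branch
    spm_tokenizer = get_monolingual_tokenizer(
        tokenizer_name='sentencepiece', tokenizer_model='spm.model',
    )

    # fallback branch: any tokenizer name known to NeMo or HuggingFace
    hf_tokenizer = get_monolingual_tokenizer(tokenizer_name='bert-base-uncased')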
Example #9
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        model_name = cfg.language_model.pretrained_model_name
        self.tokenizer = get_tokenizer(tokenizer_name=model_name)

        super().__init__(cfg=cfg, trainer=trainer)

        self.q_encoder = self.get_lm_model_with_padded_embedding(cfg)
        self.p_encoder = self.get_lm_model_with_padded_embedding(cfg)
        self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
Example #10
    def get_monolingual_tokenizer(
        tokenizer_name=None, tokenizer_model=None, bpe_dropout=0.0,
    ):
        if tokenizer_name != 'yttm':
            raise NotImplementedError("Currently we only support the yttm tokenizer.")

        tokenizer = get_tokenizer(
            tokenizer_name=tokenizer_name, tokenizer_model=tokenizer_model, bpe_dropout=bpe_dropout,
        )

        return tokenizer
Example #11
    def setup_tokenizer(self, cfg: DictConfig):
        """Instantiates tokenizer based on config and registers tokenizer artifacts.

           If the model is being restored from a .nemo file, then the tokenizer.vocab_file
           will be used (if it exists).

           Otherwise, we will use the vocab file provided in the config (if it exists).

           Finally, if no vocab file is given (this happens frequently when using HF),
           we will attempt to extract the vocab from the tokenizer object and then register it.

        Args:
            cfg (DictConfig): Tokenizer config
        """
        vocab_file = None
        if self._is_model_being_restored():
            if os.path.exists('tokenizer.vocab_file'):
                # model is being restored from .nemo file so tokenizer.vocab_file has precedence
                vocab_file = self.register_artifact(
                    config_path='tokenizer.vocab_file',
                    src='tokenizer.vocab_file')

            # tokenizer.vocab_file is added to the config and registered as an artifact in the .nemo file
            # during training, but this file is missing when calling load_from_checkpoint();
            # it is safer to use restore_from() with the .nemo file
            elif cfg.vocab_file and not os.path.exists(cfg.vocab_file):
                logging.warning(
                    f'tokenizer.vocab_file not found at {cfg.vocab_file}. It is recommended to use the restore_from() method with a .nemo file.'
                )
            else:
                vocab_file = self.register_artifact(
                    config_path='tokenizer.vocab_file', src=cfg.vocab_file)
        elif cfg.vocab_file:
            # use vocab file from config
            vocab_file = self.register_artifact(
                config_path='tokenizer.vocab_file', src=cfg.vocab_file)

        tokenizer = get_tokenizer(
            tokenizer_name=cfg.tokenizer_name,
            vocab_file=vocab_file,
            special_tokens=OmegaConf.to_container(cfg.special_tokens)
            if cfg.special_tokens else None,
            tokenizer_model=self.register_artifact(
                config_path='tokenizer.tokenizer_model',
                src=cfg.tokenizer_model),
        )
        self.tokenizer = tokenizer

        if vocab_file is None:
            # when there is no vocab file we try to get the vocab from the tokenizer and register it
            self._register_vocab_from_tokenizer(
                vocab_file_config_path='tokenizer.vocab_file', cfg=cfg)
Example #12
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        model_name = cfg.language_model.pretrained_model_name
        self.tokenizer = get_tokenizer(tokenizer_name=model_name)

        super().__init__(cfg=cfg, trainer=trainer)

        self.bert_model = self.get_lm_model_with_padded_embedding(cfg)
        hidden_size = self.bert_model.config.hidden_size
        self.sim_score_regressor = SequenceRegression(
            hidden_size=hidden_size, num_layers=1, dropout=cfg.language_model.sim_score_dropout,
        )
        self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
Example #13
    def __init__(
        self,
        name,
        indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        masked_lm_prob,
        max_seq_length,
        max_seq_length_dec,
        short_seq_prob,
        seed,
    ):

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.max_seq_length_dec = max_seq_length_dec

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            num_epochs,
            max_num_samples,
            self.max_seq_length - 2,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            False,
        )

        # Vocab stuff.
        tokenizer = get_tokenizer()
        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_dict = tokenizer.inv_vocab
        self.cls_id = tokenizer.cls
        self.sep_id = tokenizer.sep
        self.mask_id = tokenizer.mask
        self.pad_id = tokenizer.pad
        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.sentinel_tokens = tokenizer.additional_special_tokens_ids
        assert len(
            self.sentinel_tokens
        ) > 0, "Provide the argument --vocab-extra-ids 100 to the script"
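
Unlike the other examples, this constructor calls get_tokenizer() with no arguments (it appears to be a Megatron-style global tokenizer) and only depends on the attribute surface read above. A hypothetical stub illustrating that minimal surface, for wiring tests only and not a real tokenizer:

    class _StubTokenizer:
        """Hypothetical stand-in exposing only the attributes this dataset reads."""

        def __init__(self, vocab):
            self.inv_vocab = dict(enumerate(vocab))        # id -> token mapping
            self.cls, self.sep, self.mask, self.pad = 0, 1, 2, 3
            self.bos_token_id, self.eos_token_id = 4, 5
            # sentinel ids, e.g. the ones created via --vocab-extra-ids 100
            self.additional_special_tokens_ids = list(range(6, 106))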
Example #14
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # shared params for dataset and data loaders
        self.dataset_cfg = cfg.dataset
        self.tokenizer = get_tokenizer(
            tokenizer_name=cfg.language_model.tokenizer,
            vocab_file=cfg.language_model.vocab_file,
            special_tokens=cfg.language_model.special_tokens,
        )

        # make vocabulary size divisible by 8 for fast fp16 training
        vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        self.embedding_layer = TransformerEmbedding(
            vocab_size=vocab_size,
            hidden_size=cfg.language_model.hidden_size,
            max_sequence_length=cfg.language_model.max_seq_length,
            embedding_dropout=cfg.language_model.get("embedding_dropout", 0.0),
            learn_positional_encodings=False,
        )
        self.encoder = TransformerEncoder(
            num_layers=cfg.language_model.num_layers,
            hidden_size=cfg.language_model.hidden_size,
            mask_future=True,
            num_attention_heads=cfg.language_model.num_attn_heads,
            inner_size=cfg.language_model.inner_size,
            ffn_dropout=cfg.language_model.get("ffn_dropout", 0.0),
            hidden_act=cfg.language_model.get("inner_activation", "relu"),
            attn_score_dropout=cfg.language_model.get("attn_score_dropout", 0.0),
            attn_layer_dropout=cfg.language_model.get("attn_layer_dropout", 0.0),
        )
        self.log_softmax = TokenClassifier(
            hidden_size=cfg.language_model.hidden_size, num_classes=vocab_size, log_softmax=True,
        )

        std_init_range = 1 / math.sqrt(cfg.language_model.hidden_size)
        self.apply(lambda module: transformer_weights_init(module, std_init_range))

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.embedding_layer.token_embedding.weight

        self.training_loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
        self.validation_loss = SmoothedCrossEntropyLoss(
            pad_id=self.tokenizer.pad_id, predict_last_k=self.dataset_cfg.get("predict_last_k", 0),
        )

        # Optimizer setup needs to happen after all model weights are ready
        self.setup_optimization(cfg.optim)
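
The constructor above reads only a handful of config fields; a minimal sketch of a matching config, assuming placeholder values (fields accessed via cfg.get() fall back to their defaults):

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        'dataset': {'predict_last_k': 0},
        'language_model': {
            'tokenizer': 'word',          # placeholder tokenizer name
            'vocab_file': 'vocab.txt',    # hypothetical vocab path
            'special_tokens': None,
            'hidden_size': 512,
            'max_seq_length': 256,
            'num_layers': 6,
            'num_attn_heads': 8,
            'inner_size': 2048,
        },
        'optim': {'name': 'adam', 'lr': 0.001},
    })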
Example #15
    def _setup_tokenizer(self, cfg: DictConfig):
        """Instantiates tokenizer based on config and registers tokenizer artifacts.

        Args:
            cfg (DictConfig): Tokenizer config
        """
        tokenizer = get_tokenizer(
            tokenizer_name=cfg.tokenizer_name,
            vocab_file=cfg.vocab_file,
            special_tokens=OmegaConf.to_container(cfg.special_tokens)
            if cfg.special_tokens else None,
            tokenizer_model=cfg.tokenizer_model,
        )
        self.tokenizer = tokenizer
        self.register_tokenizer(cfg)
Example #16
    def setup_enc_dec_tokenizers(self, cfg: EncDecNLPModelConfig):
        if cfg.encoder_tokenizer.vocab_file is not None or cfg.decoder_tokenizer.vocab_file is not None:
            raise NotImplementedError(
                'Vocab files are currently not supported. Please use a tokenizer name and model instead.'
            )

        if cfg.encoder_tokenizer.tokenizer_name != 'yttm' or cfg.decoder_tokenizer.tokenizer_name != 'yttm':
            raise NotImplementedError("Currently we only support the yttm tokenizer.")
        self.encoder_tokenizer = get_tokenizer(
            tokenizer_name=cfg.encoder_tokenizer.tokenizer_name,
            tokenizer_model=self.register_artifact(
                "cfg.encoder_tokenizer.tokenizer_model",
                cfg.encoder_tokenizer.tokenizer_model),
            bpe_dropout=cfg.encoder_tokenizer.bpe_dropout if hasattr(
                cfg.encoder_tokenizer, 'bpe_dropout') else 0.0,
        )
        self.decoder_tokenizer = get_tokenizer(
            tokenizer_name=cfg.decoder_tokenizer.tokenizer_name,
            tokenizer_model=self.register_artifact(
                "cfg.decoder_tokenizer.tokenizer_model",
                cfg.decoder_tokenizer.tokenizer_model),
            bpe_dropout=cfg.decoder_tokenizer.bpe_dropout if hasattr(
                cfg.decoder_tokenizer, 'bpe_dropout') else 0.0,
        )
Example #17
    def setup_tokenizer(
        self, tokenizer_name=None, tokenizer_model=None, vocab_file=None, bpe_dropout=0.0,
    ):

        supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'word']
        if tokenizer_name not in supported_tokenizers:
            raise NotImplementedError(f"Currently we only support tokenizers in {supported_tokenizers}.")

        self.tokenizer = get_tokenizer(
            tokenizer_name=tokenizer_name,
            tokenizer_model=self.register_artifact("cfg.tokenizer.tokenizer_model", tokenizer_model),
            vocab_file=vocab_file,
            bpe_dropout=bpe_dropout,
            special_tokens=None,
            use_fast=False,
        )
Example #18
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank *
                                trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        # shared params for dataset and data loaders
        self.dataset_cfg = cfg.dataset

        vocab_file = cfg.language_model.get("vocab_file", None)

        if vocab_file is not None:
            vocab_file = self.register_artifact("language_model.vocab_file",
                                                vocab_file)

        tokenizer_model = cfg.language_model.get("tokenizer_model", None)

        if tokenizer_model is not None:
            tokenizer_model = self.register_artifact(
                "language_model.tokenizer_model", tokenizer_model)

        if cfg.language_model.special_tokens:
            special_tokens = OmegaConf.to_container(
                cfg.language_model.special_tokens, resolve=True)
        else:
            special_tokens = None

        self.tokenizer = get_tokenizer(
            tokenizer_name=cfg.language_model.tokenizer,
            vocab_file=vocab_file,
            special_tokens=special_tokens,
            tokenizer_model=tokenizer_model,
        )

        # make vocabulary size divisible by 8 for fast fp16 training
        vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        self.embedding_layer = TransformerEmbedding(
            vocab_size=vocab_size,
            hidden_size=cfg.language_model.hidden_size,
            max_sequence_length=cfg.language_model.max_seq_length,
            embedding_dropout=cfg.language_model.get("embedding_dropout", 0.0),
            learn_positional_encodings=False,
        )
        self.encoder = TransformerEncoder(
            num_layers=cfg.language_model.num_layers,
            hidden_size=cfg.language_model.hidden_size,
            mask_future=True,
            num_attention_heads=cfg.language_model.num_attn_heads,
            inner_size=cfg.language_model.inner_size,
            ffn_dropout=cfg.language_model.get("ffn_dropout", 0.0),
            hidden_act=cfg.language_model.get("inner_activation", "relu"),
            attn_score_dropout=cfg.language_model.get("attn_score_dropout",
                                                      0.0),
            attn_layer_dropout=cfg.language_model.get("attn_layer_dropout",
                                                      0.0),
        )
        self.log_softmax = TokenClassifier(
            hidden_size=cfg.language_model.hidden_size,
            num_classes=vocab_size,
            log_softmax=True,
        )

        std_init_range = 1 / math.sqrt(cfg.language_model.hidden_size)
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.embedding_layer.token_embedding.weight

        if hasattr(self.tokenizer, 'pad_token'):
            pad_id = self.tokenizer.pad_id
        else:
            raise ValueError(
                "The tokenizer must support a special `pad_token`. Provide it using "
                "the `special_tokens` dictionary.")

        self.training_loss = SmoothedCrossEntropyLoss(pad_id=pad_id)
        self.validation_loss = SmoothedCrossEntropyLoss(
            pad_id=pad_id,
            predict_last_k=self.dataset_cfg.get("predict_last_k", 0),
        )

        self.training_perplexity = Perplexity(dist_sync_on_step=True)
        self.validation_perplexity = Perplexity(compute_on_step=False)

        # Optimizer setup needs to happen after all model weights are ready
        self.setup_optimization()
Example #19
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--lm_model_file",
        type=str,
        required=True,
        help="path to the LM model .nemo file or the name of a HuggingFace pretrained model such as 'transfo-xl-wt103' or 'gpt2'",
    )
    parser.add_argument("--beams_file", type=str, required=True, help="path to beams .tsv file")
    parser.add_argument(
        "--eval_manifest", type=str, required=True, help="path to the evaluation `.json` manifest file"
    )
    parser.add_argument("--beam_size", type=int, required=True, help="number of beams per candidate")
    parser.add_argument("--batch_size", type=int, default=256, help="inference batch size")
    parser.add_argument("--alpha", type=float, default=None, help="parameter alpha of the fusion")
    parser.add_argument("--beta", type=float, default=None, help="parameter beta of the fusion")
    parser.add_argument("--max_seq_length", default=512, help="Maximum sequence length (in tokens) for the input")
    parser.add_argument(
        "--scores_output_file", default=None, type=str, help="The optional path to store the rescored beams"
    )
    parser.add_argument(
        "--device", default="cuda", type=str, help="The device to load the model onto to calculate the scores"
    )
    parser.add_argument(
        "--use_amp", action="store_true", help="Whether to use AMP if available to calculate the scores"
    )
    args = parser.parse_args()

    device = args.device
    if device.startswith("cuda") and not torch.cuda.is_available():
        logging.info("CUDA is not available! Switching to CPU.")
        device = "cpu"

    if args.lm_model_file.endswith(".nemo"):
        nemo_model = True
        logging.info("Attempting to initialize from .nemo file...")
        model = TransformerLMModel.restore_from(
            restore_path=args.lm_model_file, map_location=torch.device(device)
        ).eval()
        model_tokenizer = model.tokenizer
    else:
        nemo_model = False
        logging.info("Attempting to initialize from a pretrained model from HuggingFace...")
        model = (
            AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=args.lm_model_file, is_decoder=True)
            .to(device)
            .eval()
        )
        model_tokenizer = get_tokenizer(tokenizer_name=args.lm_model_file)

    max_seq_length = args.max_seq_length
    dataset = BeamScoresDataset(args.beams_file, model_tokenizer, args.eval_manifest, args.beam_size, max_seq_length)
    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=args.batch_size)

    if args.use_amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP is enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:
        # fall back to a no-op context manager when AMP is disabled or unavailable
        @contextlib.contextmanager
        def autocast():
            yield

    if "attention_mask" in inspect.getfullargspec(model.forward).args:
        support_att_mask = True
    else:
        support_att_mask = False
    logging.info(f"Rescoring with beam_size: {args.beam_size}")
    logging.info("Calculating the scores...")
    with autocast():
        with torch.no_grad():
            am_scores, lm_scores, dists, ref_lens, lens_in_chars = [], [], [], [], []
            for batch in tqdm.tqdm(data_loader):
                input_ids, input_mask, acoustic_score, dist, ref_len, len_in_chars, idx = batch

                max_len_in_batch = input_mask.sum(dim=0).argmin().item()
                input_ids, input_mask = input_ids[:, :max_len_in_batch], input_mask[:, :max_len_in_batch]
                if torch.cuda.is_available():
                    input_ids, input_mask = input_ids.to(device), input_mask.to(device)
                    dist, acoustic_score, len_in_chars = (
                        dist.to(device),
                        acoustic_score.to(device),
                        len_in_chars.to(device),
                    )
                # some models like Transformer-XL don't need attention_mask as input
                if support_att_mask:
                    log_probs = model(input_ids=input_ids, attention_mask=input_mask)
                else:
                    log_probs = model(input_ids=input_ids)

                if not nemo_model:
                    log_probs = torch.nn.functional.log_softmax(log_probs.logits, dim=-1)

                target_log_probs = log_probs[:, :-1].gather(2, input_ids[:, 1:].unsqueeze(2)).squeeze(2)
                neural_lm_score = torch.sum(target_log_probs * input_mask[:, 1:], dim=-1)

                am_scores.append(acoustic_score)
                lm_scores.append(neural_lm_score)
                dists.append(dist)
                ref_lens.append(ref_len)
                lens_in_chars.append(len_in_chars)

    am_scores = torch.cat(am_scores).view(-1, args.beam_size)
    lm_scores = torch.cat(lm_scores).view(-1, args.beam_size)
    dists = torch.cat(dists).view(-1, args.beam_size)
    ref_lens = torch.cat(ref_lens).view(-1, args.beam_size)
    lens_in_chars = torch.cat(lens_in_chars).view(-1, args.beam_size).to(am_scores.dtype)

    total_len = ref_lens[:, 0].sum()
    model_wer = dists[:, 0].sum() / total_len
    ideal_wer = dists.min(dim=1)[0].sum() / total_len

    if args.alpha is None:
        logging.info("Linear search for alpha...")
        coef1, _ = linear_search_wer(
            dists=dists, scores1=am_scores, scores2=lm_scores, total_len=total_len, param_name='alpha'
        )
        coef1 = np.round(coef1, 3)
        logging.info(f"alpha={coef1} achieved the best WER.")
        logging.info(f"------------------------------------------------")
    else:
        coef1 = args.alpha

    scores = am_scores + coef1 * lm_scores

    if args.beta is None:
        logging.info("Linear search for beta...")
        coef2, _ = linear_search_wer(
            dists=dists, scores1=scores, scores2=lens_in_chars, total_len=total_len, param_name='beta'
        )
        coef2 = np.round(coef2, 3)
        logging.info(f"beta={coef2} achieved the best WER.")
        logging.info(f"------------------------------------------------")
    else:
        coef2 = args.beta

    new_scores = am_scores + coef1 * lm_scores + coef2 * lens_in_chars
    rescored_wer = compute_wer(dists, new_scores, total_len)

    logging.info(f"Input beams WER: {np.round(model_wer.item() * 100, 2)}%")
    logging.info(f"------------------------------------------------")
    logging.info(f"  +LM rescoring WER: {np.round(rescored_wer * 100, 2)}%")
    logging.info(f"  with alpha={coef1}, beta={coef2}")
    logging.info(f"------------------------------------------------")
    logging.info(f"Best possible WER: {np.round(ideal_wer.item() * 100, 2)}%")
    logging.info(f"------------------------------------------------")

    new_scores_flatten = new_scores.flatten()
    if args.scores_output_file is not None:
        logging.info(f'Saving the candidates with their new scores at `{args.scores_output_file}`...')
        with open(args.scores_output_file, "w") as fout:
            for sample_id in range(len(dataset)):
                fout.write(f"{dataset.data[0][sample_id]}\t{new_scores_flatten[sample_id]}\n")
Example #20
 def test_get_pretrained_arabic_model(self):
     model_name = 'asafaya/bert-base-arabic'
     model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
     assert isinstance(model, nemo_nlp.modules.BertModule)
     tokenizer = get_tokenizer(tokenizer_name=model_name)
     assert isinstance(tokenizer, AutoTokenizer)
Example #21
 def test_get_pretrained_chinese_bert_wwm_model(self):
     model_name = 'hfl/chinese-bert-wwm'
     model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
     assert isinstance(model, nemo_nlp.modules.BertModule)
     tokenizer = get_tokenizer(tokenizer_name=model_name)
     assert isinstance(tokenizer, AutoTokenizer)
Example #22
 def test_get_pretrained_t5_model(self):
     model_name = 't5-small'
     model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
     assert isinstance(model, nemo_nlp.modules.BertModule)
     tokenizer = get_tokenizer(tokenizer_name=model_name)
     assert isinstance(tokenizer, AutoTokenizer)
def main():
    text_path = args.text_path
    data_root = args.data_root

    if args.log:
        logging.basicConfig(level=logging.INFO)

    tokenized_cachedir = os.path.join(data_root, '_tokenized_dataset_cachedir')

    if os.path.exists(tokenized_cachedir):
        logging.warning(
            f'Tokenized cache directory {tokenized_cachedir} already exists and may contain files. '
            'In that case, please be aware that the tarfiles will be **appended to** instead of overwritten!'
        )

    if not os.path.exists(data_root):
        os.makedirs(data_root)

    chunk_paths = None
    chunk_lens = None

    if os.path.exists(tokenized_cachedir):
        paths = glob.glob(os.path.join(tokenized_cachedir, "*.npy"))
        if len(paths) > 0:
            logging.info(
                "Cached tokenized numpy files found, skipping re-tokenization of dataset"
            )

            chunk_paths = paths
            chunk_lens = None

    if chunk_paths is None:
        if args.tokenizer_name is None:
            raise ValueError(
                "`tokenizer_name` is required when tokenizing the dataset for the first time."
            )

        if args.tokenizer_vocab_file is None:
            raise ValueError(
                "`tokenizer_vocab_file` is required when constructing the tokenized dataset"
            )

        tokenizer = get_tokenizer(
            tokenizer_name=args.tokenizer_name,
            tokenizer_model=args.tokenizer_model,
            vocab_file=args.tokenizer_vocab_file,
            special_tokens=args.tokenizer_special_tokens,
        )

        logging.info("Built tokenizer")

        # tokenize text data into sub-words
        chunk_paths, chunk_lens = __tokenize_text(
            text_paths=text_path,
            tokenizer=tokenizer,
            tokenized_cachedir=tokenized_cachedir,
            lower_case=args.lower_case,
            chunk_size=args.chunk_size,
            write_buffer=args.chunk_write_buffer,
        )
        logging.info(
            f"Tokenized dataset into sub-words and serialized cache at {tokenized_cachedir}"
        )

    # Write tarred dataset
    __write_tarred_tokenized_text_dataset(data_root,
                                          num_shards=args.num_shards,
                                          chunk_paths=chunk_paths,
                                          chunk_lens=chunk_lens)

    logging.info('Done preparing tokenized dataset!')
        yttm.BPE.train(
            data=args.src_fname,
            vocab_size=args.vocab_size,
            model=os.path.join(args.out_dir, 'tokenizer.encoder.%d.BPE.model' % (args.vocab_size)),
        )

        yttm.BPE.train(
            data=args.tgt_fname,
            vocab_size=args.vocab_size,
            model=os.path.join(args.out_dir, 'tokenizer.decoder.%d.BPE.model' % (args.vocab_size)),
        )
        encoder_tokenizer_model = os.path.join(args.out_dir, 'tokenizer.encoder.%d.BPE.model' % (args.vocab_size))
        decoder_tokenizer_model = os.path.join(args.out_dir, 'tokenizer.decoder.%d.BPE.model' % (args.vocab_size))

    encoder_tokenizer = get_tokenizer(
        tokenizer_name='yttm', tokenizer_model=encoder_tokenizer_model, bpe_dropout=args.bpe_dropout
    )

    decoder_tokenizer = get_tokenizer(
        tokenizer_name='yttm', tokenizer_model=decoder_tokenizer_model, bpe_dropout=args.bpe_dropout
    )

    tokens_in_batch = args.tokens_in_batch
    tar_file_ctr = 1
    num_files_in_tar = 0
    num_lines = 0
    shard_num = 0
    global_batch_ctr = 0
    tmp_f_src = tempfile.NamedTemporaryFile(delete=False, mode='w')
    tmp_f_tgt = tempfile.NamedTemporaryFile(delete=False, mode='w')
    tar_file_ptr = tarfile.open(os.path.join(args.out_dir, 'batches.tokens.%d.%d.tar' % (tokens_in_batch, 1)), 'w')