def setup_enc_dec_tokenizers(
    self,
    encoder_tokenizer_name=None,
    encoder_tokenizer_model=None,
    encoder_bpe_dropout=0.0,
    decoder_tokenizer_name=None,
    decoder_tokenizer_model=None,
    decoder_bpe_dropout=0.0,
):
    if encoder_tokenizer_name != 'yttm' or decoder_tokenizer_name != 'yttm':
        raise NotImplementedError("Currently we only support yttm tokenizer.")

    self.encoder_tokenizer = get_tokenizer(
        tokenizer_name=encoder_tokenizer_name,
        tokenizer_model=self.register_artifact(
            "cfg.encoder_tokenizer.tokenizer_model", encoder_tokenizer_model
        ),
        bpe_dropout=encoder_bpe_dropout,
    )
    self.decoder_tokenizer = get_tokenizer(
        tokenizer_name=decoder_tokenizer_name,
        tokenizer_model=self.register_artifact(
            "cfg.decoder_tokenizer.tokenizer_model", decoder_tokenizer_model
        ),
        bpe_dropout=decoder_bpe_dropout,
    )
def get_enc_dec_tokenizers(
    encoder_tokenizer_name=None,
    encoder_tokenizer_model=None,
    encoder_bpe_dropout=0.0,
    decoder_tokenizer_name=None,
    decoder_tokenizer_model=None,
    decoder_bpe_dropout=0.0,
):
    if encoder_tokenizer_name != 'yttm' or decoder_tokenizer_name != 'yttm':
        raise NotImplementedError("Currently we only support yttm tokenizer.")

    encoder_tokenizer = get_tokenizer(
        tokenizer_name=encoder_tokenizer_name,
        tokenizer_model=encoder_tokenizer_model,
        bpe_dropout=encoder_bpe_dropout,
    )
    decoder_tokenizer = get_tokenizer(
        tokenizer_name=decoder_tokenizer_name,
        tokenizer_model=decoder_tokenizer_model,
        bpe_dropout=decoder_bpe_dropout,
    )
    return encoder_tokenizer, decoder_tokenizer
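# A minimal usage sketch for get_enc_dec_tokenizers above. The two .model paths
# are hypothetical placeholders for yttm BPE models, e.g. the ones produced by
# the yttm.BPE.train calls at the end of this section.
encoder_tokenizer, decoder_tokenizer = get_enc_dec_tokenizers(
    encoder_tokenizer_name='yttm',
    encoder_tokenizer_model='tokenizer.encoder.32000.BPE.model',
    encoder_bpe_dropout=0.1,
    decoder_tokenizer_name='yttm',
    decoder_tokenizer_model='tokenizer.decoder.32000.BPE.model',
    decoder_bpe_dropout=0.1,
)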
def test_get_pretrained_arabic_model(self):
    model_name = 'asafaya/bert-base-arabic'
    self.omega_conf.language_model.pretrained_model_name = model_name
    model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
    assert isinstance(model, nemo_nlp.modules.BertModule)
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    assert isinstance(tokenizer, AutoTokenizer)
def setup_tokenizer(self, cfg: DictConfig):
    """Instantiates tokenizer based on config and registers tokenizer artifacts.

    If model is being restored from .nemo file then the tokenizer.vocab_file will
    be used (if it exists). Otherwise, we will use the vocab file provided in the
    config (if it exists). Finally, if no vocab file is given (this happens
    frequently when using HF), we will attempt to extract the vocab from the
    tokenizer object and then register it.

    Args:
        cfg (DictConfig): Tokenizer config
    """
    if self._is_model_being_restored() and os.path.exists('tokenizer.vocab_file'):
        # model is being restored from .nemo file so tokenizer.vocab_file has precedence
        vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src='tokenizer.vocab_file')
    elif cfg.vocab_file is not None:
        # use vocab file from config
        vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src=cfg.vocab_file)
    else:
        vocab_file = None

    tokenizer = get_tokenizer(
        tokenizer_name=cfg.tokenizer_name,
        vocab_file=vocab_file,
        special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
        tokenizer_model=self.register_artifact(config_path='tokenizer.tokenizer_model', src=cfg.tokenizer_model),
    )
    self.tokenizer = tokenizer

    if vocab_file is None:
        # when there is no vocab file we try to get the vocab from the tokenizer and register it
        self._register_vocab_from_tokenizer(vocab_file_config_path='tokenizer.vocab_file', cfg=cfg)
def test_get_pretrained_chinese_bert_wwm_model(self):
    model_name = 'hfl/chinese-bert-wwm'
    self.omega_conf.language_model.pretrained_model_name = model_name
    model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
    assert isinstance(model, nemo_nlp.modules.BertModule)
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    assert isinstance(tokenizer, AutoTokenizer)
def setup_tokenizer(self, cfg: DictConfig):
    tokenizer = get_tokenizer(
        tokenizer_name=cfg.tokenizer_name,
        tokenizer_model=cfg.tokenizer_model,
        special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
        vocab_file=cfg.vocab_file,
    )
    return tokenizer
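# A sketch of the DictConfig that the setup_tokenizer variant above consumes.
# The keys (tokenizer_name, tokenizer_model, vocab_file, special_tokens) are
# exactly the fields the function reads; the values here are hypothetical.
from omegaconf import OmegaConf

example_tokenizer_cfg = OmegaConf.create(
    {
        'tokenizer_name': 'yttm',
        'tokenizer_model': 'tokenizer.BPE.model',  # placeholder path
        'vocab_file': None,
        'special_tokens': None,
    }
)
# tokenizer = model.setup_tokenizer(example_tokenizer_cfg)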
def _setup_tokenizer(self, cfg: DictConfig):
    tokenizer = get_tokenizer(
        tokenizer_name=cfg.tokenizer_name,
        vocab_file=self.register_artifact(config_path='tokenizer.vocab_file', src=cfg.vocab_file),
        special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
        tokenizer_model=self.register_artifact(config_path='tokenizer.tokenizer_model', src=cfg.tokenizer_model),
    )
    self.tokenizer = tokenizer
def get_monolingual_tokenizer(
    tokenizer_name=None,
    tokenizer_model=None,
    bpe_dropout=0.0,
):
    if tokenizer_name == 'yttm':
        if bpe_dropout is None:
            bpe_dropout = 0.0
        tokenizer = get_tokenizer(
            tokenizer_name=tokenizer_name,
            tokenizer_model=tokenizer_model,
            bpe_dropout=bpe_dropout,
        )
    elif tokenizer_name == 'sentencepiece':
        tokenizer = SentencePieceTokenizer(model_path=tokenizer_model)
    else:
        try:
            tokenizer = get_tokenizer(tokenizer_name, special_tokens={"pad_token": "[PAD]"})
        except Exception as e:
            raise ValueError(f'{tokenizer_name} is not supported by either NeMo or HuggingFace. {e}')
    return tokenizer
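# Hedged examples of the three branches of get_monolingual_tokenizer above:
# yttm BPE, sentencepiece, and the HuggingFace fallback. The two .model paths
# are hypothetical placeholders; 'gpt2' is a real HuggingFace model name.
yttm_tok = get_monolingual_tokenizer(tokenizer_name='yttm', tokenizer_model='tokenizer.BPE.model')
spm_tok = get_monolingual_tokenizer(tokenizer_name='sentencepiece', tokenizer_model='tokenizer.spm.model')
hf_tok = get_monolingual_tokenizer(tokenizer_name='gpt2')  # falls through to get_tokenizer with a [PAD] token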
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    model_name = cfg.language_model.pretrained_model_name
    self.tokenizer = get_tokenizer(tokenizer_name=model_name)
    super().__init__(cfg=cfg, trainer=trainer)

    self.q_encoder = self.get_lm_model_with_padded_embedding(cfg)
    self.p_encoder = self.get_lm_model_with_padded_embedding(cfg)
    self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
def get_monolingual_tokenizer(
    tokenizer_name=None,
    tokenizer_model=None,
    bpe_dropout=0.0,
):
    if tokenizer_name != 'yttm':
        raise NotImplementedError("Currently we only support yttm tokenizer.")
    tokenizer = get_tokenizer(
        tokenizer_name=tokenizer_name,
        tokenizer_model=tokenizer_model,
        bpe_dropout=bpe_dropout,
    )
    return tokenizer
def setup_tokenizer(self, cfg: DictConfig):
    """Instantiates tokenizer based on config and registers tokenizer artifacts.

    If model is being restored from .nemo file then the tokenizer.vocab_file will
    be used (if it exists). Otherwise, we will use the vocab file provided in the
    config (if it exists). Finally, if no vocab file is given (this happens
    frequently when using HF), we will attempt to extract the vocab from the
    tokenizer object and then register it.

    Args:
        cfg (DictConfig): Tokenizer config
    """
    vocab_file = None
    if self._is_model_being_restored():
        if os.path.exists('tokenizer.vocab_file'):
            # model is being restored from .nemo file so tokenizer.vocab_file has precedence
            vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src='tokenizer.vocab_file')
        # tokenizer.vocab_file is added to the config file and registered as an artifact
        # for the .nemo file during training, but this file is missing for a
        # load_from_checkpoint() call; it's safer to use restore_from() with a .nemo file
        elif cfg.vocab_file and not os.path.exists(cfg.vocab_file):
            logging.warning(
                f'tokenizer.vocab_file not found at {cfg.vocab_file}. '
                f'It is recommended to use the restore_from() method with a .nemo file.'
            )
        else:
            vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src=cfg.vocab_file)
    elif cfg.vocab_file:
        # use vocab file from config
        vocab_file = self.register_artifact(config_path='tokenizer.vocab_file', src=cfg.vocab_file)

    tokenizer = get_tokenizer(
        tokenizer_name=cfg.tokenizer_name,
        vocab_file=vocab_file,
        special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
        tokenizer_model=self.register_artifact(config_path='tokenizer.tokenizer_model', src=cfg.tokenizer_model),
    )
    self.tokenizer = tokenizer

    if vocab_file is None:
        # when there is no vocab file we try to get the vocab from the tokenizer and register it
        self._register_vocab_from_tokenizer(vocab_file_config_path='tokenizer.vocab_file', cfg=cfg)
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    model_name = cfg.language_model.pretrained_model_name
    self.tokenizer = get_tokenizer(tokenizer_name=model_name)
    super().__init__(cfg=cfg, trainer=trainer)

    self.bert_model = self.get_lm_model_with_padded_embedding(cfg)
    hidden_size = self.bert_model.config.hidden_size
    self.sim_score_regressor = SequenceRegression(
        hidden_size=hidden_size,
        num_layers=1,
        dropout=cfg.language_model.sim_score_dropout,
    )
    self.loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
def __init__(
    self,
    name,
    indexed_dataset,
    data_prefix,
    num_epochs,
    max_num_samples,
    masked_lm_prob,
    max_seq_length,
    max_seq_length_dec,
    short_seq_prob,
    seed,
):
    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.max_seq_length_dec = max_seq_length_dec

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        self.indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        self.max_seq_length - 2,  # account for added tokens
        short_seq_prob,
        self.seed,
        self.name,
        False,
    )

    # Vocab stuff.
    tokenizer = get_tokenizer()
    self.vocab_id_list = list(tokenizer.inv_vocab.keys())
    self.vocab_id_to_token_dict = tokenizer.inv_vocab
    self.cls_id = tokenizer.cls
    self.sep_id = tokenizer.sep
    self.mask_id = tokenizer.mask
    self.pad_id = tokenizer.pad
    self.bos_id = tokenizer.bos_token_id
    self.eos_id = tokenizer.eos_token_id
    self.sentinel_tokens = tokenizer.additional_special_tokens_ids
    assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script"
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # shared params for dataset and data loaders
    self.dataset_cfg = cfg.dataset
    self.tokenizer = get_tokenizer(
        tokenizer_name=cfg.language_model.tokenizer,
        vocab_file=cfg.language_model.vocab_file,
        special_tokens=cfg.language_model.special_tokens,
    )

    # make vocabulary size divisible by 8 for fast fp16 training
    vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

    # init superclass
    super().__init__(cfg=cfg, trainer=trainer)

    self.embedding_layer = TransformerEmbedding(
        vocab_size=vocab_size,
        hidden_size=cfg.language_model.hidden_size,
        max_sequence_length=cfg.language_model.max_seq_length,
        embedding_dropout=cfg.language_model.get("embedding_dropout", 0.0),
        learn_positional_encodings=False,
    )
    self.encoder = TransformerEncoder(
        num_layers=cfg.language_model.num_layers,
        hidden_size=cfg.language_model.hidden_size,
        mask_future=True,
        num_attention_heads=cfg.language_model.num_attn_heads,
        inner_size=cfg.language_model.inner_size,
        ffn_dropout=cfg.language_model.get("ffn_dropout", 0.0),
        hidden_act=cfg.language_model.get("inner_activation", "relu"),
        attn_score_dropout=cfg.language_model.get("attn_score_dropout", 0.0),
        attn_layer_dropout=cfg.language_model.get("attn_layer_dropout", 0.0),
    )
    self.log_softmax = TokenClassifier(
        hidden_size=cfg.language_model.hidden_size,
        num_classes=vocab_size,
        log_softmax=True,
    )

    std_init_range = 1 / math.sqrt(cfg.language_model.hidden_size)
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.embedding_layer.token_embedding.weight

    self.training_loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
    self.validation_loss = SmoothedCrossEntropyLoss(
        pad_id=self.tokenizer.pad_id,
        predict_last_k=self.dataset_cfg.get("predict_last_k", 0),
    )

    # Optimizer setup needs to happen after all model weights are ready
    self.setup_optimization(cfg.optim)
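# Worked example of the "divisible by 8" vocab padding used in the constructor
# above (rounding up keeps fp16 GEMM shapes tensor-core friendly). BERT's
# 30,522-token vocab is the assumed input here:
import math

vocab_size = 8 * math.ceil(30522 / 8)  # 30522 / 8 = 3815.25, ceil -> 3816, * 8 -> 30528
assert vocab_size == 30528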
def _setup_tokenizer(self, cfg: DictConfig):
    """Instantiates tokenizer based on config and registers tokenizer artifacts.

    Args:
        cfg (DictConfig): Tokenizer config
    """
    tokenizer = get_tokenizer(
        tokenizer_name=cfg.tokenizer_name,
        vocab_file=cfg.vocab_file,
        special_tokens=OmegaConf.to_container(cfg.special_tokens) if cfg.special_tokens else None,
        tokenizer_model=cfg.tokenizer_model,
    )
    self.tokenizer = tokenizer
    self.register_tokenizer(cfg)
def setup_enc_dec_tokenizers(self, cfg: EncDecNLPModelConfig):
    # NotImplemented is a constant, not an exception; raise NotImplementedError instead
    if cfg.encoder_tokenizer.vocab_file is not None or cfg.decoder_tokenizer.vocab_file is not None:
        raise NotImplementedError(
            'Vocab files are currently not supported. Please use tokenizer name and model instead'
        )
    if cfg.encoder_tokenizer.tokenizer_name != 'yttm' or cfg.decoder_tokenizer.tokenizer_name != 'yttm':
        raise NotImplementedError("Currently we only support yttm tokenizer.")

    self.encoder_tokenizer = get_tokenizer(
        tokenizer_name=cfg.encoder_tokenizer.tokenizer_name,
        tokenizer_model=self.register_artifact(
            "cfg.encoder_tokenizer.tokenizer_model", cfg.encoder_tokenizer.tokenizer_model
        ),
        bpe_dropout=cfg.encoder_tokenizer.bpe_dropout if hasattr(cfg.encoder_tokenizer, 'bpe_dropout') else 0.0,
    )
    self.decoder_tokenizer = get_tokenizer(
        tokenizer_name=cfg.decoder_tokenizer.tokenizer_name,
        tokenizer_model=self.register_artifact(
            "cfg.decoder_tokenizer.tokenizer_model", cfg.decoder_tokenizer.tokenizer_model
        ),
        bpe_dropout=cfg.decoder_tokenizer.bpe_dropout if hasattr(cfg.decoder_tokenizer, 'bpe_dropout') else 0.0,
    )
def setup_tokenizer(
    self,
    tokenizer_name=None,
    tokenizer_model=None,
    vocab_file=None,
    bpe_dropout=0.0,
):
    supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'word']
    if tokenizer_name not in supported_tokenizers:
        raise NotImplementedError(f"Currently we only support tokenizers in {supported_tokenizers}.")
    self.tokenizer = get_tokenizer(
        tokenizer_name=tokenizer_name,
        tokenizer_model=self.register_artifact("cfg.tokenizer.tokenizer_model", tokenizer_model),
        vocab_file=vocab_file,
        bpe_dropout=bpe_dropout,
        special_tokens=None,
        use_fast=False,
    )
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    # shared params for dataset and data loaders
    self.dataset_cfg = cfg.dataset

    vocab_file = cfg.language_model.get("vocab_file", None)
    if vocab_file is not None:
        vocab_file = self.register_artifact("language_model.vocab_file", vocab_file)
    tokenizer_model = cfg.language_model.get("tokenizer_model", None)
    if tokenizer_model is not None:
        tokenizer_model = self.register_artifact("language_model.tokenizer_model", tokenizer_model)
    if cfg.language_model.special_tokens:
        special_tokens = OmegaConf.to_container(cfg.language_model.special_tokens, resolve=True)
    else:
        special_tokens = None
    self.tokenizer = get_tokenizer(
        tokenizer_name=cfg.language_model.tokenizer,
        vocab_file=vocab_file,
        special_tokens=special_tokens,
        tokenizer_model=tokenizer_model,
    )

    # make vocabulary size divisible by 8 for fast fp16 training
    vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

    # init superclass
    super().__init__(cfg=cfg, trainer=trainer)

    self.embedding_layer = TransformerEmbedding(
        vocab_size=vocab_size,
        hidden_size=cfg.language_model.hidden_size,
        max_sequence_length=cfg.language_model.max_seq_length,
        embedding_dropout=cfg.language_model.get("embedding_dropout", 0.0),
        learn_positional_encodings=False,
    )
    self.encoder = TransformerEncoder(
        num_layers=cfg.language_model.num_layers,
        hidden_size=cfg.language_model.hidden_size,
        mask_future=True,
        num_attention_heads=cfg.language_model.num_attn_heads,
        inner_size=cfg.language_model.inner_size,
        ffn_dropout=cfg.language_model.get("ffn_dropout", 0.0),
        hidden_act=cfg.language_model.get("inner_activation", "relu"),
        attn_score_dropout=cfg.language_model.get("attn_score_dropout", 0.0),
        attn_layer_dropout=cfg.language_model.get("attn_layer_dropout", 0.0),
    )
    self.log_softmax = TokenClassifier(
        hidden_size=cfg.language_model.hidden_size,
        num_classes=vocab_size,
        log_softmax=True,
    )

    std_init_range = 1 / math.sqrt(cfg.language_model.hidden_size)
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.embedding_layer.token_embedding.weight

    if hasattr(self.tokenizer, 'pad_token'):
        pad_id = self.tokenizer.pad_id
    else:
        raise ValueError(
            "The tokenizer must support a special `pad_token`. Provide it using "
            "the `special_tokens` dictionary."
        )

    self.training_loss = SmoothedCrossEntropyLoss(pad_id=pad_id)
    self.validation_loss = SmoothedCrossEntropyLoss(
        pad_id=pad_id,
        predict_last_k=self.dataset_cfg.get("predict_last_k", 0),
    )
    self.training_perplexity = Perplexity(dist_sync_on_step=True)
    self.validation_perplexity = Perplexity(compute_on_step=False)

    # Optimizer setup needs to happen after all model weights are ready
    self.setup_optimization()
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--lm_model_file",
        type=str,
        required=True,
        help="path to LM model .nemo file or the name of a HuggingFace pretrained model like 'transfo-xl-wt103' or 'gpt2'",
    )
    parser.add_argument("--beams_file", type=str, required=True, help="path to beams .tsv file")
    parser.add_argument(
        "--eval_manifest", type=str, required=True, help="path to the evaluation `.json` manifest file"
    )
    parser.add_argument("--beam_size", type=int, required=True, help="number of beams per candidate")
    parser.add_argument("--batch_size", type=int, default=256, help="inference batch size")
    parser.add_argument("--alpha", type=float, default=None, help="parameter alpha of the fusion")
    parser.add_argument("--beta", type=float, default=None, help="parameter beta of the fusion")
    parser.add_argument("--max_seq_length", default=512, help="Maximum sequence length (in tokens) for the input")
    parser.add_argument(
        "--scores_output_file", default=None, type=str, help="The optional path to store the rescored beams"
    )
    parser.add_argument(
        "--device", default="cuda", type=str, help="The device to load the model onto to calculate the scores"
    )
    parser.add_argument(
        "--use_amp", action="store_true", help="Whether to use AMP if available to calculate the scores"
    )
    args = parser.parse_args()

    device = args.device
    if device.startswith("cuda") and not torch.cuda.is_available():
        logging.info("cuda is not available! switched to cpu.")
        device = "cpu"

    if args.lm_model_file.endswith(".nemo"):
        nemo_model = True
        logging.info("Attempting to initialize from .nemo file...")
        model = TransformerLMModel.restore_from(
            restore_path=args.lm_model_file, map_location=torch.device(device)
        ).eval()
        model_tokenizer = model.tokenizer
    else:
        nemo_model = False
        logging.info("Attempting to initialize from a pretrained model from HuggingFace...")
        model = (
            AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=args.lm_model_file, is_decoder=True)
            .to(device)
            .eval()
        )
        model_tokenizer = get_tokenizer(tokenizer_name=args.lm_model_file)

    max_seq_length = args.max_seq_length
    dataset = BeamScoresDataset(args.beams_file, model_tokenizer, args.eval_manifest, args.beam_size, max_seq_length)
    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=args.batch_size)

    if args.use_amp:
        if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
            logging.info("AMP is enabled!\n")
            autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # some models like Transformer-XL don't need attention_mask as input
    if "attention_mask" in inspect.getfullargspec(model.forward).args:
        support_att_mask = True
    else:
        support_att_mask = False

    logging.info(f"Rescoring with beam_size: {args.beam_size}")
    logging.info("Calculating the scores...")
    with autocast():
        with torch.no_grad():
            am_scores, lm_scores, dists, ref_lens, lens_in_chars = [], [], [], [], []
            for batch in tqdm.tqdm(data_loader):
                input_ids, input_mask, acoustic_score, dist, ref_len, len_in_chars, idx = batch
                max_len_in_batch = input_mask.sum(dim=0).argmin().item()
                input_ids, input_mask = input_ids[:, :max_len_in_batch], input_mask[:, :max_len_in_batch]
                if torch.cuda.is_available():
                    input_ids, input_mask = input_ids.to(device), input_mask.to(device)
                    dist, acoustic_score, len_in_chars = (
                        dist.to(device),
                        acoustic_score.to(device),
                        len_in_chars.to(device),
                    )
                if support_att_mask:
                    log_probs = model(input_ids=input_ids, attention_mask=input_mask)
                else:
                    log_probs = model(input_ids=input_ids)
                if not nemo_model:
                    log_probs = torch.nn.functional.log_softmax(log_probs.logits, dim=-1)
                target_log_probs = log_probs[:, :-1].gather(2, input_ids[:, 1:].unsqueeze(2)).squeeze(2)
                neural_lm_score = torch.sum(target_log_probs * input_mask[:, 1:], dim=-1)

                am_scores.append(acoustic_score)
                lm_scores.append(neural_lm_score)
                dists.append(dist)
                ref_lens.append(ref_len)
                lens_in_chars.append(len_in_chars)

            am_scores = torch.cat(am_scores).view(-1, args.beam_size)
            lm_scores = torch.cat(lm_scores).view(-1, args.beam_size)
            dists = torch.cat(dists).view(-1, args.beam_size)
            ref_lens = torch.cat(ref_lens).view(-1, args.beam_size)
            lens_in_chars = torch.cat(lens_in_chars).view(-1, args.beam_size).to(am_scores.dtype)

            total_len = ref_lens[:, 0].sum()
            model_wer = dists[:, 0].sum() / total_len
            ideal_wer = dists.min(dim=1)[0].sum() / total_len

            if args.alpha is None:
                logging.info("Linear search for alpha...")
                coef1, _ = linear_search_wer(
                    dists=dists, scores1=am_scores, scores2=lm_scores, total_len=total_len, param_name='alpha'
                )
                coef1 = np.round(coef1, 3)
                logging.info(f"alpha={coef1} achieved the best WER.")
                logging.info("------------------------------------------------")
            else:
                coef1 = args.alpha

            scores = am_scores + coef1 * lm_scores

            if args.beta is None:
                logging.info("Linear search for beta...")
                coef2, _ = linear_search_wer(
                    dists=dists, scores1=scores, scores2=lens_in_chars, total_len=total_len, param_name='beta'
                )
                coef2 = np.round(coef2, 3)
                logging.info(f"beta={coef2} achieved the best WER.")
                logging.info("------------------------------------------------")
            else:
                coef2 = args.beta

            new_scores = am_scores + coef1 * lm_scores + coef2 * lens_in_chars
            rescored_wer = compute_wer(dists, new_scores, total_len)

            logging.info(f"Input beams WER: {np.round(model_wer.item() * 100, 2)}%")
            logging.info("------------------------------------------------")
            logging.info(f"  +LM rescoring WER: {np.round(rescored_wer * 100, 2)}%")
            logging.info(f"  with alpha={coef1}, beta={coef2}")
            logging.info("------------------------------------------------")
            logging.info(f"Best possible WER: {np.round(ideal_wer.item() * 100, 2)}%")
            logging.info("------------------------------------------------")

            new_scores_flatten = new_scores.flatten()
            if args.scores_output_file is not None:
                logging.info(f'Saving the candidates with their new scores at `{args.scores_output_file}`...')
                with open(args.scores_output_file, "w") as fout:
                    for sample_id in range(len(dataset)):
                        fout.write(f"{dataset.data[0][sample_id]}\t{new_scores_flatten[sample_id]}\n")
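# A hypothetical invocation of the rescoring script above; the flag names come
# from the ArgumentParser in main(), while the script name and file paths are
# placeholders:
#
#   python rescore_beams_with_lm.py \
#       --lm_model_file=lm.nemo --beams_file=beams.tsv \
#       --eval_manifest=eval.json --beam_size=8 --batch_size=256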
def test_get_pretrained_arabic_model(self):
    model_name = 'asafaya/bert-base-arabic'
    model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
    assert isinstance(model, nemo_nlp.modules.BertModule)
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    assert isinstance(tokenizer, AutoTokenizer)
def test_get_pretrained_chinese_bert_wwm_model(self):
    model_name = 'hfl/chinese-bert-wwm'
    model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
    assert isinstance(model, nemo_nlp.modules.BertModule)
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    assert isinstance(tokenizer, AutoTokenizer)
def test_get_pretrained_t5_model(self):
    model_name = 't5-small'
    model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
    assert isinstance(model, nemo_nlp.modules.BertModule)
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    assert isinstance(tokenizer, AutoTokenizer)
def main():
    text_path = args.text_path
    data_root = args.data_root

    if args.log:
        logging.basicConfig(level=logging.INFO)

    tokenized_cachedir = os.path.join(data_root, '_tokenized_dataset_cachedir')
    if os.path.exists(tokenized_cachedir):
        logging.warning(
            f'Tokenized cache directory {tokenized_cachedir} already potentially contains files. '
            f'In such a case, please be aware that the tarfiles will be **appended** instead of overridden!'
        )

    if not os.path.exists(data_root):
        os.makedirs(data_root)

    chunk_paths = None
    chunk_lens = None
    if os.path.exists(tokenized_cachedir):
        paths = glob.glob(os.path.join(tokenized_cachedir, "*.npy"))
        if len(paths) > 0:
            logging.info("Cached tokenized numpy files found, skipping re-tokenization of dataset")
            chunk_paths = paths
            chunk_lens = None

    if chunk_paths is None:
        if args.tokenizer_name is None:
            raise ValueError("`tokenizer_name` is required when tokenizing the dataset for the first time.")
        if args.tokenizer_vocab_file is None:
            raise ValueError("`tokenizer_vocab_file` is required when constructing the tokenized dataset")

        tokenizer = get_tokenizer(
            tokenizer_name=args.tokenizer_name,
            tokenizer_model=args.tokenizer_model,
            vocab_file=args.tokenizer_vocab_file,
            special_tokens=args.tokenizer_special_tokens,
        )
        logging.info("Built tokenizer")

        # tokenize text data into sub-words
        chunk_paths, chunk_lens = __tokenize_text(
            text_paths=text_path,
            tokenizer=tokenizer,
            tokenized_cachedir=tokenized_cachedir,
            lower_case=args.lower_case,
            chunk_size=args.chunk_size,
            write_buffer=args.chunk_write_buffer,
        )
        logging.info(f"Tokenized dataset into sub-words and serialized cache at {tokenized_cachedir}")

    # Write tarred dataset
    __write_tarred_tokenized_text_dataset(
        data_root, num_shards=args.num_shards, chunk_paths=chunk_paths, chunk_lens=chunk_lens
    )

    logging.info('Done preparing tokenized dataset!')
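# A hypothetical command line for the tarred-dataset script above; every flag
# maps to an attribute that main() reads from `args`, while the script name and
# file paths are placeholders:
#
#   python create_tarred_lm_dataset.py \
#       --text_path=corpus.txt --data_root=./tarred_dataset \
#       --tokenizer_name=yttm --tokenizer_model=tokenizer.BPE.model \
#       --tokenizer_vocab_file=vocab.txt --num_shards=8 --log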
yttm.BPE.train(
    data=args.src_fname,
    vocab_size=args.vocab_size,
    model=os.path.join(args.out_dir, 'tokenizer.encoder.%d.BPE.model' % (args.vocab_size)),
)
yttm.BPE.train(
    data=args.tgt_fname,
    vocab_size=args.vocab_size,
    model=os.path.join(args.out_dir, 'tokenizer.decoder.%d.BPE.model' % (args.vocab_size)),
)

encoder_tokenizer_model = os.path.join(args.out_dir, 'tokenizer.encoder.%d.BPE.model' % (args.vocab_size))
decoder_tokenizer_model = os.path.join(args.out_dir, 'tokenizer.decoder.%d.BPE.model' % (args.vocab_size))

encoder_tokenizer = get_tokenizer(
    tokenizer_name='yttm', tokenizer_model=encoder_tokenizer_model, bpe_dropout=args.bpe_dropout
)
decoder_tokenizer = get_tokenizer(
    tokenizer_name='yttm', tokenizer_model=decoder_tokenizer_model, bpe_dropout=args.bpe_dropout
)

tokens_in_batch = args.tokens_in_batch
tar_file_ctr = 1
num_files_in_tar = 0
num_lines = 0
shard_num = 0
global_batch_ctr = 0
tmp_f_src = tempfile.NamedTemporaryFile(delete=False, mode='w')
tmp_f_tgt = tempfile.NamedTemporaryFile(delete=False, mode='w')
tar_file_ptr = tarfile.open(
    os.path.join(args.out_dir, 'batches.tokens.%d.%d.tar' % (tokens_in_batch, 1)), 'w'
)