def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Set up the text front-end, then the embedding, preprocessor, alignment encoder and losses.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before anything reads from it.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    # Normalizer-related attributes get defaults before _setup_normalizer runs
    self.normalizer = self.text_normalizer_call = None
    self.text_normalizer_call_kwargs = {}
    self._setup_normalizer(cfg)

    # Tokenizer must have been populated by _setup_tokenizer
    self.tokenizer = None
    self._setup_tokenizer(cfg)
    assert self.tokenizer is not None

    vocab_size = len(self.tokenizer.tokens)
    self.tokenizer_pad, self.tokenizer_unk = self.tokenizer.pad, self.tokenizer.oov

    super().__init__(cfg=cfg, trainer=trainer)

    # Sub-modules
    self.embed = nn.Embedding(vocab_size, cfg.symbols_embedding_dim)
    self.preprocessor = instantiate(cfg.preprocessor)
    self.alignment_encoder = instantiate(cfg.alignment_encoder)

    # Losses
    self.forward_sum_loss = ForwardSumLoss()
    self.bin_loss = BinLoss()

    # Bin loss starts disabled with a zero scale; the schedule values come from the config
    self.add_bin_loss = False
    self.bin_loss_scale = 0.0
    self.bin_loss_start_ratio = cfg.bin_loss_start_ratio
    self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs
def __init__(self, cfg: DictConfig):
    """Prepare the diarizer: optional VAD model, speaker model, output directory and clustering params.

    Args:
        cfg: Configuration holding a ``diarizer`` section. It is converted to a DictConfig and
            passed through the config-version updater, then stored on ``self._cfg``.
    """
    cfg = model_utils.convert_model_config_to_dict_config(cfg)
    # Convert config to support Hydra 1.0+ instantiation
    self._cfg = model_utils.maybe_update_config_version(cfg)

    # Diarizer set up
    self._diarizer_params = self._cfg.diarizer

    # A VAD model is only initialised when oracle VAD is off and a model path is configured
    self.has_vad_model = False
    if not self._diarizer_params.oracle_vad and self._cfg.diarizer.vad.model_path is not None:
        self._vad_params = self._cfg.diarizer.vad.parameters
        self._init_vad_model()

    # Speaker model
    self.multiscale_embeddings_and_timestamps = {}
    self._init_speaker_model()
    self._speaker_params = self._cfg.diarizer.speaker_embeddings.parameters

    # Start from an empty speaker output directory
    speaker_dir = os.path.join(self._diarizer_params.out_dir, 'speaker_outputs')
    self._speaker_dir = speaker_dir
    shutil.rmtree(speaker_dir, ignore_errors=True)
    os.makedirs(speaker_dir)

    # Clustering params
    self._cluster_params = self._diarizer_params.clustering.parameters

    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    self._device = torch.device(device_name)
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Configure the model: world size, decoder labels, augmentations, sub-modules, loss and metrics.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer; when given, ``num_nodes * num_devices`` becomes the world size.
    """
    # Total number of workers, used for IterableDataset partitioning if applicable
    self.world_size = 1 if trainer is None else trainer.num_nodes * trainer.num_devices

    # Convert config to a DictConfig that supports Hydra 1.0+ instantiation
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    self.is_regression_task = cfg.get('is_regression_task', False)

    # Change labels if needed
    self._update_decoder_config(cfg.labels, cfg.decoder)

    super().__init__(cfg=cfg, trainer=trainer)

    # Optional augmentations: built only when the section exists and is not None
    use_spec_augment = hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None
    self.spec_augmentation = ASRModel.from_config_dict(self._cfg.spec_augment) if use_spec_augment else None

    use_crop_or_pad = hasattr(self._cfg, 'crop_or_pad_augment') and self._cfg.crop_or_pad_augment is not None
    self.crop_or_pad = ASRModel.from_config_dict(self._cfg.crop_or_pad_augment) if use_crop_or_pad else None

    # Core pipeline pieces, each built by its own setup hook
    self.preprocessor = self._setup_preprocessor()
    self.encoder = self._setup_encoder()
    self.decoder = self._setup_decoder()
    self.loss = self._setup_loss()
    self._setup_metrics()
def __init__(self, cfg: DictConfig):
    """Prepare the diarizer: output directory, optional VAD model and speaker-embedding model.

    Args:
        cfg: Configuration holding a ``diarizer`` section. It is converted to a DictConfig and
            passed through the config-version updater, then stored on ``self._cfg``.
    """
    cfg = model_utils.convert_model_config_to_dict_config(cfg)
    # Convert config to support Hydra 1.0+ instantiation
    cfg = model_utils.maybe_update_config_version(cfg)
    self._cfg = cfg

    self._out_dir = self._cfg.diarizer.out_dir
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates missing
    # parent directories and cannot fail when the directory appears between check and create.
    os.makedirs(self._out_dir, exist_ok=True)

    # init vad model
    self.has_vad_model = False
    self.has_vad_model_to_save = False

    self._speaker_manifest_path = self._cfg.diarizer.speaker_embeddings.oracle_vad_manifest
    self.AUDIO_RTTM_MAP = None
    self.paths2audio_files = self._cfg.diarizer.paths2audio_files

    if self._cfg.diarizer.vad.model_path is not None:
        self._init_vad_model()
        self._vad_dir = os.path.join(self._out_dir, 'vad_outputs')
        self._vad_out_file = os.path.join(self._vad_dir, "vad_out.json")
        # Start from an empty VAD output directory
        shutil.rmtree(self._vad_dir, ignore_errors=True)
        os.makedirs(self._vad_dir)

    # init speaker model
    self._speaker_model = ExtractSpeakerEmbeddingsModel.restore_from(
        self._cfg.diarizer.speaker_embeddings.model_path
    )
    self._num_speakers = self._cfg.diarizer.num_speakers
    self._speaker_dir = os.path.join(self._out_dir, 'speaker_outputs')

    self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Build the preprocessors, generator, discriminators and losses, and detect mel-input training.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    super().__init__(cfg=cfg, trainer=trainer)

    self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
    # We use separate preprocessor for training, because we need to pass grads and remove pitch fmax limitation
    self.trg_melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)
    self.generator = instantiate(cfg.generator)

    # Both discriminators receive the same optional debug flag
    debug = cfg.debug if "debug" in cfg else False
    self.mpd = MultiPeriodDiscriminator(debug=debug)
    self.msd = MultiScaleDiscriminator(debug=debug)

    self.feature_loss = FeatureMatchingLoss()
    self.discriminator_loss = DiscriminatorLoss()
    self.generator_loss = GeneratorLoss()

    self.l1_factor = cfg.get("l1_loss_factor", 45)

    self.sample_rate = self._cfg.preprocessor.sample_rate
    self.stft_bias = None

    # The training dataset type decides whether inputs arrive as mel spectrograms
    self.input_as_mel = False
    if self._train_dl:
        train_dataset = self._train_dl.dataset
        # TODO(Oktai15): remove it in 1.8.0 version
        if isinstance(train_dataset, MelAudioDataset):
            self.input_as_mel = True
        elif isinstance(train_dataset, VocoderDataset):
            self.input_as_mel = train_dataset.load_precomputed_mel

    self.automatic_optimization = False
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Set up the tokenizer, write its vocabulary into the config, then build decoding and WER objects.

    Args:
        cfg: Model configuration; must contain a ``tokenizer`` section.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If ``cfg`` has no ``tokenizer`` entry.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    # Tokenizer is necessary for this model
    if 'tokenizer' not in cfg:
        raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

    if not isinstance(cfg, DictConfig):
        cfg = OmegaConf.create(cfg)

    # Setup the tokenizer
    self._setup_tokenizer(cfg.tokenizer)

    # Vocabulary reported by the underlying tokenizer
    vocabulary = self.tokenizer.tokenizer.get_vocab()
    vocab_size = len(vocabulary)

    # Write the vocabulary into the labels, decoder and joint sections of the config
    with open_dict(cfg):
        cfg.labels = ListConfig(list(vocabulary))

    with open_dict(cfg.decoder):
        cfg.decoder.vocab_size = vocab_size

    with open_dict(cfg.joint):
        cfg.joint.num_classes = vocab_size
        cfg.joint.vocabulary = ListConfig(list(vocabulary))
        cfg.joint.jointnet.encoder_hidden = cfg.model_defaults.enc_hidden
        cfg.joint.jointnet.pred_hidden = cfg.model_defaults.pred_hidden

    super().__init__(cfg=cfg, trainer=trainer)

    # Setup decoding object
    self.decoding = RNNTBPEDecoding(
        decoding_cfg=self.cfg.decoding,
        decoder=self.decoder,
        joint=self.joint,
        tokenizer=self.tokenizer,
    )

    # Setup wer object
    self.wer = RNNTBPEWER(
        decoding=self.decoding,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        log_prediction=self._cfg.get('log_prediction', True),
        dist_sync_on_step=True,
    )

    # Setup fused Joint step if flag is set
    if self.joint.fuse_loss_wer:
        self.joint.set_loss(self.loss)
        self.joint.set_wer(self.wer)
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Instantiate the mel-spectrogram preprocessor, the WaveGlow module and its loss.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before being handed to the parent constructor.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    converted_cfg = model_utils.convert_model_config_to_dict_config(cfg)
    cfg = model_utils.maybe_update_config_version(converted_cfg)

    super().__init__(cfg=cfg, trainer=trainer)

    # From here on everything is read from the config stored on the instance
    self.sigma = self._cfg.sigma
    self.audio_to_melspec_precessor = instantiate(self._cfg.preprocessor)
    self.waveglow = instantiate(self._cfg.waveglow)
    self.loss = WaveGlowLoss()
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Set up the text front-end, validate the config against the schema and build the sub-modules.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If, after the parent constructor ran, ``cfg`` is neither a dict nor a DictConfig.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    # setup normalizer
    self.normalizer = self.text_normalizer_call = None
    self.text_normalizer_call_kwargs = {}
    self._setup_normalizer(cfg)

    # setup tokenizer; without a `text_tokenizer` section the token count is derived from the labels
    self.tokenizer = None
    if not hasattr(cfg, 'text_tokenizer'):
        self.num_tokens = len(cfg.labels) + 3
    else:
        self._setup_tokenizer(cfg)

        self.num_tokens = len(self.tokenizer.tokens)
        self.tokenizer_pad = self.tokenizer.pad
        self.tokenizer_unk = self.tokenizer.oov

    super().__init__(cfg=cfg, trainer=trainer)

    schema = OmegaConf.structured(Tacotron2Config)
    # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")

    # Ensure passed cfg is compliant with schema; fall back to the `params` layout on attribute errors
    try:
        OmegaConf.merge(cfg, schema)
        self.pad_value = cfg.preprocessor.pad_value
    except ConfigAttributeError:
        self.pad_value = cfg.preprocessor.params.pad_value
        logging.warning(
            "Your config is using an old NeMo yaml configuration. Please ensure that the yaml matches the "
            "current version in the main branch for future compatibility."
        )

    self._parser = None

    # Sub-modules
    self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
    self.text_embedding = nn.Embedding(self.num_tokens, 512)
    self.encoder = instantiate(self._cfg.encoder)
    self.decoder = instantiate(self._cfg.decoder)
    self.postnet = instantiate(self._cfg.postnet)

    self.loss = Tacotron2Loss()
    self.calculate_loss = True
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Build the preprocessors, generator, MPD/MRD discriminators and losses, and detect mel-input training.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    super().__init__(cfg=cfg, trainer=trainer)

    self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
    # We use separate preprocessor for training, because we need to pass grads and remove pitch fmax limitation
    self.trg_melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)

    # The generator takes its mel-channel count and hop length from the preprocessor section
    self.generator = instantiate(
        cfg.generator,
        n_mel_channels=cfg.preprocessor.nfilt,
        hop_length=cfg.preprocessor.n_window_stride,
    )

    # Both discriminators receive the same optional debug flag
    debug = cfg.debug if "debug" in cfg else False
    self.mpd = MultiPeriodDiscriminator(cfg.discriminator.mpd, debug=debug)
    self.mrd = MultiResolutionDiscriminator(cfg.discriminator.mrd, debug=debug)

    self.discriminator_loss = DiscriminatorLoss()
    self.generator_loss = GeneratorLoss()

    # Reshape MRD resolutions hyperparameter and apply them to MRSTFT loss:
    # element 0 of each entry feeds fft_sizes, element 1 hop_sizes, element 2 win_lengths
    self.stft_resolutions = cfg.discriminator.mrd.resolutions
    self.fft_sizes = [resolution[0] for resolution in self.stft_resolutions]
    self.hop_sizes = [resolution[1] for resolution in self.stft_resolutions]
    self.win_lengths = [resolution[2] for resolution in self.stft_resolutions]
    self.mrstft_loss = MultiResolutionSTFTLoss(self.fft_sizes, self.hop_sizes, self.win_lengths)
    self.stft_lamb = cfg.stft_lamb

    self.sample_rate = self._cfg.preprocessor.sample_rate
    self.stft_bias = None

    # The training dataset type decides whether inputs arrive as mel spectrograms
    self.input_as_mel = False
    if self._train_dl:
        train_dataset = self._train_dl.dataset
        # TODO(Oktai15): remove it in 1.8.0 version
        if isinstance(train_dataset, MelAudioDataset):
            self.input_as_mel = True
        elif isinstance(train_dataset, VocoderDataset):
            self.input_as_mel = train_dataset.load_precomputed_mel

    self.automatic_optimization = False
def __init__(self, cfg: DictConfig, trainer=None):
    """Set up the tokenizer, inject its vocabulary into the decoder config and create the WER metric.

    Args:
        cfg: Model configuration; must contain a ``tokenizer`` section.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If ``cfg`` has no ``tokenizer`` entry.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    if 'tokenizer' not in cfg:
        raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

    # Setup the tokenizer
    self._setup_tokenizer(cfg.tokenizer)

    # Vocabulary reported by the underlying tokenizer
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    with open_dict(cfg):
        # sidestepping the potential overlapping tokens issue in aggregate tokenizers
        is_aggregate = self.tokenizer_type == "agg"
        decoder_vocabulary = vocabulary if is_aggregate else list(vocabulary.keys())
        cfg.decoder.vocabulary = ListConfig(decoder_vocabulary)

    # A class count below 1 is treated as a placeholder and replaced by the vocabulary size
    num_classes = cfg.decoder["num_classes"]
    if num_classes < 1:
        logging.info(
            "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format(
                num_classes, len(vocabulary)
            )
        )
        cfg.decoder["num_classes"] = len(vocabulary)

    super().__init__(cfg=cfg, trainer=trainer)

    # Setup metric objects
    self._wer = WERBPE(
        tokenizer=self.tokenizer,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        dist_sync_on_step=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Set up the text front-end, losses, optional aligner and the FastPitchModule.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If ``learn_alignment`` is enabled and the training dataset class name
            is not one of the two handled here.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    # Setup normalizer
    self.normalizer = self.text_normalizer_call = None
    self.text_normalizer_call_kwargs = {}
    self._setup_normalizer(cfg)

    self.learn_alignment = cfg.get("learn_alignment", False)

    # Setup vocabulary (=tokenizer) and input_fft_kwargs (supported only with self.learn_alignment=True)
    input_fft_kwargs = {}
    if self.learn_alignment:
        self.vocab = None
        self.ds_class_name = cfg.train_ds.dataset._target_.split(".")[-1]

        # Choose which config the tokenizer is built from; everything after that is shared
        if self.ds_class_name == "TTSDataset":
            tokenizer_conf = cfg
        elif self.ds_class_name == "AudioToCharWithPriorAndPitchDataset":
            logging.warning(
                "AudioToCharWithPriorAndPitchDataset class has been deprecated. No support for"
                " training or finetuning. Only inference is supported."
            )
            tokenizer_conf = self._get_default_text_tokenizer_conf()
        else:
            raise ValueError(f"Unknown dataset class: {self.ds_class_name}")

        self._setup_tokenizer(tokenizer_conf)
        assert self.vocab is not None
        input_fft_kwargs["n_embed"] = len(self.vocab.tokens)
        input_fft_kwargs["padding_idx"] = self.vocab.pad

    self._parser = None
    self._tb_logger = None
    super().__init__(cfg=cfg, trainer=trainer)

    self.bin_loss_warmup_epochs = cfg.get("bin_loss_warmup_epochs", 100)
    self.log_train_images = False

    # Default loss scale depends on learn_alignment; explicit config entries override it
    default_loss_scale = 0.1 if self.learn_alignment else 1.0
    dur_loss_scale = cfg.dur_loss_scale if "dur_loss_scale" in cfg else default_loss_scale
    pitch_loss_scale = cfg.pitch_loss_scale if "pitch_loss_scale" in cfg else default_loss_scale

    self.mel_loss = MelLoss()
    self.pitch_loss = PitchLoss(loss_scale=pitch_loss_scale)
    self.duration_loss = DurationLoss(loss_scale=dur_loss_scale)

    # The aligner and its two losses exist only when alignment is learned
    self.aligner = None
    if self.learn_alignment:
        self.aligner = instantiate(self._cfg.alignment_module)
        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()

    self.preprocessor = instantiate(self._cfg.preprocessor)
    input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
    output_fft = instantiate(self._cfg.output_fft)
    duration_predictor = instantiate(self._cfg.duration_predictor)
    pitch_predictor = instantiate(self._cfg.pitch_predictor)

    self.fastpitch = FastPitchModule(
        input_fft,
        output_fft,
        duration_predictor,
        pitch_predictor,
        self.aligner,
        cfg.n_speakers,
        cfg.symbols_embedding_dim,
        cfg.pitch_embedding_kernel_size,
        cfg.n_mel_channels,
    )
    self._input_types = self._output_types = None
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    """Set up tokenizers, language processors, encoder/decoder, classifier head, beam search and losses.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer; when given, ``num_nodes * num_gpus`` becomes the world size.

    Raises:
        ValueError: If ``multilingual`` is enabled and ``src_language`` / ``tgt_language`` are
            both lists, or neither of them is a list.
    """
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Total number of GPU workers, used for IterableDataset partitioning if applicable
    # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0
    self.world_size = 1 if trainer is None else trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language = cfg.get("src_language", None)
    self.tgt_language = cfg.get("tgt_language", None)

    self.multilingual = cfg.get("multilingual", False)
    self.multilingual_ids = []

    self.encoder_tokenizer_library = cfg.encoder_tokenizer.get('library', 'yttm')
    self.decoder_tokenizer_library = cfg.decoder_tokenizer.get('library', 'yttm')

    # An explicit None for bpe_dropout is treated the same as the 0.0 default
    encoder_bpe_dropout = cfg.encoder_tokenizer.get('bpe_dropout', 0.0)
    if encoder_bpe_dropout is None:
        encoder_bpe_dropout = 0.0
    decoder_bpe_dropout = cfg.decoder_tokenizer.get('bpe_dropout', 0.0)
    if decoder_bpe_dropout is None:
        decoder_bpe_dropout = 0.0

    # Instantiates tokenizers and register to be saved with NeMo Model archive
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer
    # Which can convert between tokens and token_ids for SRC and TGT languages correspondingly.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=self.encoder_tokenizer_library,
        encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
        encoder_bpe_dropout=encoder_bpe_dropout,
        encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
        encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
        decoder_tokenizer_library=self.decoder_tokenizer_library,
        encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get('vocab_file', None),
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=decoder_bpe_dropout,
        decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
        decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
    )

    if not self.multilingual:
        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(self.src_language, self.tgt_language)
        self.multilingual_ids = [None]
    else:
        src_is_list = isinstance(self.src_language, ListConfig)
        tgt_is_list = isinstance(self.tgt_language, ListConfig)
        if src_is_list and tgt_is_list:
            raise ValueError(
                "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
            )
        if not (src_is_list or tgt_is_list):
            raise ValueError(
                "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
            )

        # Exactly one side is a list here; its "<lang>" tokens are looked up in the encoder tokenizer
        listed_languages = self.src_language if src_is_list else self.tgt_language
        for lng in listed_languages:
            self.multilingual_ids.append(self.encoder_tokenizer.token_to_id("<" + lng + ">"))

        # Repeat the single-valued side so both sides have one entry per language pair
        if src_is_list:
            self.tgt_language = [self.tgt_language] * len(self.src_language)
        else:
            self.src_language = [self.src_language] * len(self.tgt_language)

        self.source_processor_list = []
        self.target_processor_list = []
        for src_lng, tgt_lng in zip(self.src_language, self.tgt_language):
            src_processor, tgt_processor = self.setup_pre_and_post_processing_utils(src_lng, tgt_lng)
            self.source_processor_list.append(src_processor)
            self.target_processor_list.append(tgt_processor)

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
    encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
    library = encoder_cfg_dict.pop('library', 'nemo')
    model_name = encoder_cfg_dict.pop('model_name', None)
    pretrained = encoder_cfg_dict.pop('pretrained', False)
    checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_cfg_dict,
        encoder=True,
        pre_ln_final_layer_norm=encoder_cfg_dict.get('pre_ln_final_layer_norm', False),
        checkpoint_file=checkpoint_file,
    )

    # decoder from NeMo, Megatron-LM, or HuggingFace
    decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
    decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
    library = decoder_cfg_dict.pop('library', 'nemo')
    model_name = decoder_cfg_dict.pop('model_name', None)
    pretrained = decoder_cfg_dict.pop('pretrained', False)
    # The decoder hidden size is taken from the encoder
    decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
    self.decoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=decoder_cfg_dict,
        encoder=False,
        pre_ln_final_layer_norm=decoder_cfg_dict.get('pre_ln_final_layer_norm', False),
    )

    head_cfg = cfg.head
    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=head_cfg.activation,
        log_softmax=head_cfg.log_softmax,
        dropout=head_cfg.dropout,
        use_transformer_init=head_cfg.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    def init_weights(module):
        return transformer_weights_init(module, std_init_range)

    # initialize weights if not using pretrained encoder/decoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(init_weights)

    if not self._cfg.decoder.get('pretrained', False):
        self.decoder.apply(init_weights)

    self.log_softmax.apply(init_weights)

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss_fn = NLLLoss(ignore_index=self.decoder_tokenizer.pad_id)
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    """Set up tokenizers, the transformer encoder/decoder, classifier head, beam search and losses.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer; when given, global rank and world size are derived from it.
    """
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Global rank and total number of GPU workers, used for IterableDataset partitioning if applicable
    if trainer is None:
        self.global_rank = 0
        self.world_size = 1
    else:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)
    self.setup_enc_dec_tokenizers(cfg)

    super().__init__(cfg=cfg, trainer=trainer)

    def optional(section, key, default):
        """Return ``section.<key>`` when hasattr reports it, otherwise ``default``."""
        return getattr(section, key) if hasattr(section, key) else default

    encoder_cfg = cfg.encoder
    decoder_cfg = cfg.decoder
    head_cfg = cfg.head

    # TODO: use get_encoder function with support for HF and Megatron
    self.encoder = TransformerEncoderNM(
        vocab_size=self.encoder_vocab_size,
        hidden_size=encoder_cfg.hidden_size,
        num_layers=encoder_cfg.num_layers,
        inner_size=encoder_cfg.inner_size,
        max_sequence_length=optional(encoder_cfg, 'max_sequence_length', 512),
        embedding_dropout=optional(encoder_cfg, 'embedding_dropout', 0.0),
        learn_positional_encodings=optional(encoder_cfg, 'learn_positional_encodings', False),
        num_attention_heads=encoder_cfg.num_attention_heads,
        ffn_dropout=encoder_cfg.ffn_dropout,
        attn_score_dropout=encoder_cfg.attn_score_dropout,
        attn_layer_dropout=encoder_cfg.attn_layer_dropout,
        hidden_act=encoder_cfg.hidden_act,
        mask_future=encoder_cfg.mask_future,
        pre_ln=encoder_cfg.pre_ln,
    )

    # TODO: user get_decoder function with support for HF and Megatron
    self.decoder = TransformerDecoderNM(
        vocab_size=self.decoder_vocab_size,
        hidden_size=decoder_cfg.hidden_size,
        num_layers=decoder_cfg.num_layers,
        inner_size=decoder_cfg.inner_size,
        max_sequence_length=optional(decoder_cfg, 'max_sequence_length', 512),
        embedding_dropout=optional(decoder_cfg, 'embedding_dropout', 0.0),
        learn_positional_encodings=optional(decoder_cfg, 'learn_positional_encodings', False),
        num_attention_heads=decoder_cfg.num_attention_heads,
        ffn_dropout=decoder_cfg.ffn_dropout,
        attn_score_dropout=decoder_cfg.attn_score_dropout,
        attn_layer_dropout=decoder_cfg.attn_layer_dropout,
        hidden_act=decoder_cfg.hidden_act,
        pre_ln=decoder_cfg.pre_ln,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=head_cfg.activation,
        log_softmax=head_cfg.log_softmax,
        dropout=head_cfg.dropout,
        use_transformer_init=head_cfg.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5
    # The same initializer is applied to every sub-module of the model
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    """Set up tokenizers, language processors, encoder/decoder, classifier head, beam search and losses.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer; when given, ``num_nodes * num_gpus`` becomes the world size.
    """
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Total number of GPU workers, used for IterableDataset partitioning if applicable
    # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0
    self.world_size = 1 if trainer is None else trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language: str = cfg.get("src_language", None)
    self.tgt_language: str = cfg.get("tgt_language", None)

    encoder_tokenizer_cfg = cfg.encoder_tokenizer
    decoder_tokenizer_cfg = cfg.decoder_tokenizer

    # Instantiates tokenizers and register to be saved with NeMo Model archive
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer
    # Which can convert between tokens and token_ids for SRC and TGT languages correspondingly.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=encoder_tokenizer_cfg.get('library', 'yttm'),
        encoder_tokenizer_model=encoder_tokenizer_cfg.get('tokenizer_model'),
        encoder_bpe_dropout=encoder_tokenizer_cfg.get('bpe_dropout', 0.0),
        encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
        decoder_tokenizer_library=decoder_tokenizer_cfg.get('library', 'yttm'),
        decoder_tokenizer_model=decoder_tokenizer_cfg.tokenizer_model,
        decoder_bpe_dropout=decoder_tokenizer_cfg.get('bpe_dropout', 0.0),
        decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
    )

    # After this call, the model will have self.source_processor and self.target_processor objects
    self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
    encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
    library = encoder_cfg_dict.pop('library', 'nemo')
    model_name = encoder_cfg_dict.pop('model_name', None)
    pretrained = encoder_cfg_dict.pop('pretrained', False)
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_cfg_dict,
        encoder=True,
    )

    # decoder from NeMo, Megatron-LM, or HuggingFace
    decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
    decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
    library = decoder_cfg_dict.pop('library', 'nemo')
    model_name = decoder_cfg_dict.pop('model_name', None)
    pretrained = decoder_cfg_dict.pop('pretrained', False)
    # The decoder hidden size is taken from the encoder
    decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
    self.decoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=decoder_cfg_dict,
        encoder=False,
    )

    head_cfg = cfg.head
    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=head_cfg.activation,
        log_softmax=head_cfg.log_softmax,
        dropout=head_cfg.dropout,
        use_transformer_init=head_cfg.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    def init_weights(module):
        return transformer_weights_init(module, std_init_range)

    # initialize weights if not using pretrained encoder/decoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(init_weights)

    if not self._cfg.decoder.get('pretrained', False):
        self.decoder.apply(init_weights)

    self.log_softmax.apply(init_weights)

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Set up the text front-end, aligner and losses, optional LM conditioning, and the sub-modules.

    Args:
        cfg: Model configuration. It is converted to a DictConfig and passed through the
            config-version updater before use.
        trainer: Optional trainer instance, forwarded unchanged to the parent constructor.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(model_utils.convert_model_config_to_dict_config(cfg))

    # Setup normalizer
    self.normalizer = self.text_normalizer_call = None
    self.text_normalizer_call_kwargs = {}
    self._setup_normalizer(cfg)

    # Setup tokenizer; it must have been populated by _setup_tokenizer
    self.tokenizer = None
    self._setup_tokenizer(cfg)
    assert self.tokenizer is not None

    num_tokens = len(self.tokenizer.tokens)
    self.tokenizer_pad, self.tokenizer_unk = self.tokenizer.pad, self.tokenizer.oov

    super().__init__(cfg=cfg, trainer=trainer)

    self.pitch_loss_scale = cfg.pitch_loss_scale
    self.durs_loss_scale = cfg.durs_loss_scale
    self.mel_loss_scale = cfg.mel_loss_scale

    self.aligner = instantiate(cfg.alignment_module)
    self.forward_sum_loss = ForwardSumLoss()
    self.bin_loss = BinLoss()

    # Bin loss starts disabled with a zero scale; the schedule values come from the config
    self.add_bin_loss = False
    self.bin_loss_scale = 0.0
    self.bin_loss_start_ratio = cfg.bin_loss_start_ratio
    self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs

    self.cond_on_lm_embeddings = cfg.get("cond_on_lm_embeddings", False)

    if self.cond_on_lm_embeddings:
        # Take the padding value from the training dataset when a train dataloader exists
        if self._train_dl is not None:
            self.lm_padding_value = self._train_dl.dataset.lm_padding_value
        else:
            self.lm_padding_value = self._get_lm_padding_value(cfg.lm_model)

        # The LM embedding weights are excluded from gradient updates
        self.lm_embeddings = self._get_lm_embeddings(cfg.lm_model)
        self.lm_embeddings.weight.requires_grad = False

        lm_channels = self.lm_embeddings.weight.shape[1]
        self.self_attention_module = instantiate(cfg.self_attention_module, n_lm_tokens_channels=lm_channels)

    self.encoder = instantiate(cfg.encoder, num_tokens=num_tokens, padding_idx=self.tokenizer_pad)
    self.symbol_emb = self.encoder.to_embed

    self.duration_predictor = instantiate(cfg.duration_predictor)

    self.pitch_mean = float(cfg.pitch_mean)
    self.pitch_std = float(cfg.pitch_std)
    self.pitch_predictor = instantiate(cfg.pitch_predictor)
    self.pitch_emb = instantiate(cfg.pitch_emb)

    self.preprocessor = instantiate(cfg.preprocessor)

    self.decoder = instantiate(cfg.decoder)
    self.proj = nn.Linear(self.decoder.d_model, cfg.n_mel_channels)
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    """Build the encoder-decoder translation model described by ``cfg``.

    Tokenizers and pre/post-processing utilities are set up before the
    base-class constructor runs; the transformer encoder, decoder, output
    head, beam search, loss and evaluation metric are created afterwards.

    Args:
        cfg: Model configuration; converted to a ``DictConfig`` and passed
            through the config-version updater.
        trainer: Optional trainer. When given, its ``num_nodes`` and
            ``num_gpus`` determine ``self.world_size``.
    """

    def _optional(section, key, fallback):
        # hasattr-guarded lookup for keys the config section may omit.
        return getattr(section, key) if hasattr(section, key) else fallback

    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset
    # partitioning, if applicable. Global_rank and local_rank are set by
    # LightningModule in Lightning 1.2.0.
    # NOTE(review): with a trainer that reports `num_gpus == 0` this product
    # is 0 rather than 1 -- confirm that consumers of `world_size` cope.
    self.world_size = 1 if trainer is None else trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    # Either language may be absent from the config, in which case it is None.
    self.src_language = cfg.get("src_language", None)
    self.tgt_language = cfg.get("tgt_language", None)

    # Instantiates tokenizers and registers them to be saved with the NeMo
    # Model archive. After this call, there will be self.encoder_tokenizer
    # and self.decoder_tokenizer, which can convert between tokens and
    # token_ids for SRC and TGT languages correspondingly.
    src_tok_cfg = cfg.encoder_tokenizer
    tgt_tok_cfg = cfg.decoder_tokenizer
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_name=src_tok_cfg.tokenizer_name,
        encoder_tokenizer_model=src_tok_cfg.tokenizer_model,
        encoder_bpe_dropout=src_tok_cfg.get('bpe_dropout', 0.0),
        decoder_tokenizer_name=tgt_tok_cfg.tokenizer_name,
        decoder_tokenizer_model=tgt_tok_cfg.tokenizer_model,
        decoder_bpe_dropout=tgt_tok_cfg.get('bpe_dropout', 0.0),
    )

    # After this call, the model will have self.source_processor and
    # self.target_processor objects.
    self.setup_pre_and_post_processing_utils(
        source_lang=self.src_language,
        target_lang=self.tgt_language,
    )

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # TODO: use get_encoder function with support for HF and Megatron
    enc_cfg = cfg.encoder
    self.encoder = TransformerEncoderNM(
        vocab_size=self.encoder_vocab_size,
        hidden_size=enc_cfg.hidden_size,
        num_layers=enc_cfg.num_layers,
        inner_size=enc_cfg.inner_size,
        max_sequence_length=_optional(enc_cfg, 'max_sequence_length', 512),
        embedding_dropout=_optional(enc_cfg, 'embedding_dropout', 0.0),
        learn_positional_encodings=_optional(enc_cfg, 'learn_positional_encodings', False),
        num_attention_heads=enc_cfg.num_attention_heads,
        ffn_dropout=enc_cfg.ffn_dropout,
        attn_score_dropout=enc_cfg.attn_score_dropout,
        attn_layer_dropout=enc_cfg.attn_layer_dropout,
        hidden_act=enc_cfg.hidden_act,
        mask_future=enc_cfg.mask_future,
        pre_ln=enc_cfg.pre_ln,
    )

    # TODO: use get_decoder function with support for HF and Megatron
    dec_cfg = cfg.decoder
    self.decoder = TransformerDecoderNM(
        vocab_size=self.decoder_vocab_size,
        hidden_size=dec_cfg.hidden_size,
        num_layers=dec_cfg.num_layers,
        inner_size=dec_cfg.inner_size,
        max_sequence_length=_optional(dec_cfg, 'max_sequence_length', 512),
        embedding_dropout=_optional(dec_cfg, 'embedding_dropout', 0.0),
        learn_positional_encodings=_optional(dec_cfg, 'learn_positional_encodings', False),
        num_attention_heads=dec_cfg.num_attention_heads,
        ffn_dropout=dec_cfg.ffn_dropout,
        attn_score_dropout=dec_cfg.attn_score_dropout,
        attn_layer_dropout=dec_cfg.attn_layer_dropout,
        hidden_act=dec_cfg.hidden_act,
        pre_ln=dec_cfg.pre_ln,
    )

    # Output head that maps decoder states onto the target vocabulary.
    head_cfg = cfg.head
    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=head_cfg.activation,
        log_softmax=head_cfg.log_softmax,
        dropout=head_cfg.dropout,
        use_transformer_init=head_cfg.use_transformer_init,
    )

    # Beam-search generator wired to the decoder's embedding, its decoder
    # stack and the output head; special token ids come from the decoder
    # tokenizer.
    tgt_tokenizer = self.decoder_tokenizer
    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=tgt_tokenizer.bos_id,
        pad=tgt_tokenizer.pad_id,
        eos=tgt_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    def _init_weights(module):
        # Applied to every sub-module of the model via `self.apply`.
        return transformer_weights_init(module, std_init_range)

    self.apply(_init_weights)

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id,
        label_smoothing=cfg.label_smoothing,
    )
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Build the transformer language model described by ``cfg``.

    The tokenizer is set up before the base-class constructor runs; the
    encoder, output head, losses and evaluation metrics are created after it.

    Args:
        cfg: Model configuration; converted to a ``DictConfig`` and passed
            through the config-version updater.
        trainer: Optional trainer. When given, its ``num_nodes`` and
            ``num_gpus`` determine ``self.world_size``.
    """
    # Get global rank and total number of GPU workers for IterableDataset
    # partitioning, if applicable.
    # NOTE(review): with a trainer that reports `num_gpus == 0` this product
    # is 0 rather than 1 -- confirm that consumers of `world_size` cope.
    self.world_size = 1 if trainer is None else trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(
        model_utils.convert_model_config_to_dict_config(cfg)
    )

    # Instantiates the tokenizer and registers it to be saved with the NeMo
    # Model archive. After this call, there will be self.tokenizer, which can
    # convert between tokens and token_ids.
    tok_cfg = cfg.tokenizer
    self.setup_tokenizer(
        tokenizer_name=tok_cfg.get("tokenizer_name", "yttm"),
        tokenizer_model=tok_cfg.get("tokenizer_model", None),
        vocab_file=tok_cfg.get("vocab_file", None),
        bpe_dropout=tok_cfg.get("bpe_dropout", 0.0),
        special_tokens=tok_cfg.get("special_tokens", {}),
    )

    # init superclass
    super().__init__(cfg=cfg, trainer=trainer)

    # make vocabulary size divisible by 8 for fast fp16 training
    padded_vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    # The selector keys are popped so that only the remaining entries are
    # handed over as `config_dict`.
    encoder_kwargs = OmegaConf.to_container(cfg.get('encoder'))
    encoder_kwargs['vocab_size'] = padded_vocab_size
    library = encoder_kwargs.pop('library', 'nemo')
    model_name = encoder_kwargs.pop('model_name', None)
    pretrained = encoder_kwargs.pop('pretrained', False)
    # An explicit `pre_ln_final_layer_norm` wins; otherwise fall back to
    # `pre_ln`, and to True when neither key is present.
    final_layer_norm = encoder_kwargs.get(
        'pre_ln_final_layer_norm',
        encoder_kwargs.get('pre_ln', True),
    )
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_kwargs,
        encoder=True,
        pre_ln_final_layer_norm=final_layer_norm,
    )

    # Output head sized to the padded vocabulary.
    head_cfg = cfg.head
    self.log_softmax = TokenClassifier(
        hidden_size=self.encoder.hidden_size,
        num_classes=padded_vocab_size,
        activation=head_cfg.activation,
        log_softmax=head_cfg.log_softmax,
        dropout=head_cfg.dropout,
        use_transformer_init=head_cfg.use_transformer_init,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.encoder.embedding.token_embedding.weight

    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    def _init_weights(module):
        # Shared initialiser handed to `Module.apply` below.
        return transformer_weights_init(module, std_init_range)

    # initialize weights if not using pretrained encoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(_init_weights)

    self.log_softmax.apply(_init_weights)

    # Training loss uses label smoothing from the config; the evaluation
    # loss is built without passing a smoothing value.
    pad_id = self.tokenizer.pad_id
    self.loss_fn = SmoothedCrossEntropyLoss(pad_id=pad_id, label_smoothing=cfg.label_smoothing)
    self.eval_loss_fn = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
    self.eval_ppl = SequencePerplexity()
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Build the BPE-tokenized RNN-T model described by ``cfg``.

    The tokenizer is created first and its vocabulary is written into the
    ``labels``, ``decoder`` and ``joint`` sections of the config, so that the
    base-class constructor sees a fully populated configuration.

    Args:
        cfg: Model configuration; must contain a ``tokenizer`` section.
        trainer: Optional trainer object, forwarded as-is to the base class.

    Raises:
        ImportError: If the ``warprnnt_pytorch`` loss bindings are unavailable.
        ValueError: If ``cfg`` has no ``tokenizer`` entry.
    """
    # Required loss function
    if not WARP_RNNT_AVAILABLE:
        raise ImportError(
            "Could not import `warprnnt_pytorch`.\n"
            "Please visit https://github.com/HawkAaron/warp-transducer "
            "and follow the steps in the readme to build and install the "
            "pytorch bindings for RNNT Loss, or use the provided docker "
            "container that supports RNN-T loss."
        )

    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.maybe_update_config_version(
        model_utils.convert_model_config_to_dict_config(cfg)
    )

    # Tokenizer is necessary for this model
    if 'tokenizer' not in cfg:
        raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

    if not isinstance(cfg, DictConfig):
        cfg = OmegaConf.create(cfg)

    # Setup the tokenizer
    self._setup_tokenizer(cfg.tokenizer)

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()
    vocab_size = len(vocabulary)

    # Set the new vocabulary. Each section is opened for writing separately.
    with open_dict(cfg):
        cfg.labels = ListConfig(list(vocabulary))

    decoder_cfg = cfg.decoder
    with open_dict(decoder_cfg):
        decoder_cfg.vocab_size = vocab_size

    joint_cfg = cfg.joint
    with open_dict(joint_cfg):
        joint_cfg.num_classes = vocab_size
        joint_cfg.vocabulary = ListConfig(list(vocabulary))
        # The joint network takes its hidden sizes from the shared
        # `model_defaults` section.
        joint_cfg.jointnet.encoder_hidden = cfg.model_defaults.enc_hidden
        joint_cfg.jointnet.pred_hidden = cfg.model_defaults.pred_hidden

    super().__init__(cfg=cfg, trainer=trainer)

    # Setup decoding object
    self.decoding = RNNTBPEDecoding(
        decoding_cfg=self.cfg.decoding,
        decoder=self.decoder,
        joint=self.joint,
        tokenizer=self.tokenizer,
    )

    # Setup wer object
    self.wer = RNNTBPEWER(
        decoding=self.decoding,
        batch_dim_index=0,
        use_cer=False,
        log_prediction=True,
        dist_sync_on_step=True,
    )

    # Setup fused Joint step if flag is set
    if self.joint.fuse_loss_wer:
        self.joint.set_loss(self.loss)
        self.joint.set_wer(self.wer)