Example #1
class MTEncDecConfig(NemoConfig):
    name: Optional[str] = 'MTEncDec'
    do_training: bool = True
    do_testing: bool = False
    model: MTEncDecModelConfig = MTEncDecModelConfig()
    trainer: Optional[TrainerConfig] = TrainerConfig()
    exp_manager: Optional[ExpManagerConfig] = ExpManagerConfig(name='MTEncDec', files_to_copy=[])
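A minimal sketch of how this structured config might be used, assuming OmegaConf is installed; the import path and the YAML file name are assumptions, not taken from the snippet above:

from omegaconf import OmegaConf

from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecConfig  # assumed module path

# Build the schema from the dataclass defaults, then merge user overrides on top.
default_cfg = OmegaConf.structured(MTEncDecConfig())
user_cfg = OmegaConf.load('conf/aayn_base.yaml')  # hypothetical override file
cfg = OmegaConf.merge(default_cfg, user_cfg)      # overrides win, schema types are enforced
print(OmegaConf.to_yaml(cfg))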
Example #2
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None:
        self._cfg = cfg
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        if hasattr(cfg, 'train_ds'):
            supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
            supported_train_tokenizers = ['yttm', 'sentencepiece']

            if (
                cfg.encoder_tokenizer.get('library') not in supported_tokenizers
                or cfg.decoder_tokenizer.get('library') not in supported_tokenizers
            ):
                raise NotImplementedError(f"Currently we only support {supported_tokenizers}.")

            if cfg.get('shared_tokenizer') and cfg.encoder_tokenizer.get('library') != cfg.decoder_tokenizer.get(
                'library'
            ):
                raise ValueError("Shared tokenizers cannot be from different libraries.")

            # Prepare tokenizers
            if (
                cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
                or cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
            ):

                # Train tokenizer models if using yttm or sentencepiece and they don't exist
                if (
                    cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
                    and cfg.encoder_tokenizer.get('tokenizer_model') is None
                ) or (
                    cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
                    and cfg.decoder_tokenizer.get('tokenizer_model') is None
                ):
                    if cfg.get('preproc_out_dir') is None:
                        raise ValueError('Tokenizer model training required but cfg.preproc_out_dir is None.')
                    if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                        raise ValueError(
                            'src_file_name and tgt_file_name needed to train tokenizers but could not be found.'
                        )

                    src_fname = cfg.train_ds.get('src_file_name')
                    tgt_fname = cfg.train_ds.get('tgt_file_name')
                    src_language = cfg.get('src_language')
                    tgt_language = cfg.get('tgt_language')
                    spt_symbols = None
                    tempdir = tempfile.TemporaryDirectory()

                    if cfg.get('multilingual'):
                        spt_symbols = []
                        if isinstance(src_fname, ListConfig):
                            fnames = (" ").join(src_fname)
                            src_fname = os.path.join(tempdir.name, 'src.txt')
                            os.system('cat %s > %s' % (fnames, src_fname))

                        if isinstance(tgt_fname, ListConfig):
                            fnames = (" ").join(tgt_fname)
                            tgt_fname = os.path.join(tempdir.name, 'tgt.txt')
                            os.system('cat %s > %s' % (fnames, tgt_fname))

                        if isinstance(src_language, ListConfig):
                            for lng in src_language:
                                spt_symbols.append("<" + lng + ">")

                        if isinstance(tgt_language, ListConfig):
                            for lng in tgt_language:
                                spt_symbols.append("<" + lng + ">")

                    # train tokenizer model on training data
                    self.encoder_tokenizer_model, self.decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
                        out_dir=cfg.get('preproc_out_dir'),
                        src_fname=src_fname,
                        tgt_fname=tgt_fname,
                        shared_tokenizer=cfg.get('shared_tokenizer'),
                        encoder_tokenizer_vocab_size=cfg.encoder_tokenizer.get('vocab_size'),
                        decoder_tokenizer_vocab_size=cfg.decoder_tokenizer.get('vocab_size'),
                        encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                        decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                        encoder_tokenizer_coverage=cfg.encoder_tokenizer.get('coverage', 0.999),
                        decoder_tokenizer_coverage=cfg.decoder_tokenizer.get('coverage', 0.999),
                        global_rank=self.global_rank,
                        encoder_training_sample_size=cfg.encoder_tokenizer.get('training_sample_size', -1),
                        decoder_training_sample_size=cfg.decoder_tokenizer.get('training_sample_size', -1),
                        encoder_special_tokens=OmegaConf.to_container(cfg.encoder_tokenizer.special_tokens)
                        if cfg.encoder_tokenizer.special_tokens
                        else None,
                        decoder_special_tokens=OmegaConf.to_container(cfg.decoder_tokenizer.special_tokens)
                        if cfg.decoder_tokenizer.special_tokens
                        else None,
                        spt_symbols=spt_symbols,
                        multilingual=cfg.get('multilingual', False),
                    )
                    # update config
                    self._cfg.encoder_tokenizer.tokenizer_model = self.encoder_tokenizer_model
                    self._cfg.decoder_tokenizer.tokenizer_model = self.decoder_tokenizer_model

                    tempdir.cleanup()
                else:
                    self.encoder_tokenizer_model = cfg.encoder_tokenizer.get('tokenizer_model')
                    self.decoder_tokenizer_model = cfg.decoder_tokenizer.get('tokenizer_model')

            self.encoder_tokenizer, self.decoder_tokenizer = self.get_enc_dec_tokenizers(
                encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                encoder_model_name=cfg.encoder.get('model_name'),
                encoder_tokenizer_model=getattr(self, "encoder_tokenizer_model", None),
                encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
                encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
                decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                decoder_model_name=cfg.decoder.get('model_name'),
                decoder_tokenizer_model=getattr(self, "decoder_tokenizer_model", None),
                decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
                decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
            )

            # If using tarred dataset for training, automatically create it if needed
            if cfg.train_ds.get('use_tarred_dataset'):
                if cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is None:
                    if cfg.get('preproc_out_dir') is None:
                        raise ValueError('Data preprocessing required but cfg.preproc_out_dir is None.')
                    if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                        raise ValueError(
                            'src_file_name and tgt_file_name needed to create tarred dataset but could not be found.'
                        )
                    # Preprocess data and cache for use during training
                    if self.global_rank == 0:
                        logging.info(
                            f"Creating tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
                        )

                    if isinstance(cfg.train_ds.get('src_file_name'), str):
                        src_file_list = [cfg.train_ds.get('src_file_name')]
                        tgt_file_list = [cfg.train_ds.get('tgt_file_name')]
                        outdir_list = [cfg.get('preproc_out_dir')]
                    else:
                        src_file_list = cfg.train_ds.get('src_file_name')
                        tgt_file_list = cfg.train_ds.get('tgt_file_name')
                        if isinstance(cfg.get('src_language'), ListConfig):
                            langs = cfg.get('src_language')
                        elif isinstance(cfg.get('tgt_language'), ListConfig):
                            langs = cfg.get('tgt_language')
                        else:
                            # guard: `langs` would otherwise be undefined in the loop below
                            raise ValueError(
                                'Expected src_language or tgt_language to be a list when multiple training files are given.'
                            )
                        outdir_list = []
                        for lang in langs:
                            outdir_list.append(os.path.join(cfg.get('preproc_out_dir'), lang))

                    if len(src_file_list) != len(tgt_file_list) or len(src_file_list) != len(outdir_list):
                        raise ValueError(
                            "Number of source files, target files, and multilingual language pairs must be the same."
                        )

                    # TODO: have to get tokenizers inside .preprocess_parallel because they can't be pickled
                    metadata_file_list = []
                    for idx, src_file in enumerate(src_file_list):
                        self.train_tar_files, self.train_metadata_file = MTDataPreproc.preprocess_parallel_dataset(
                            clean=cfg.train_ds.clean,
                            src_fname=src_file,
                            tgt_fname=tgt_file_list[idx],
                            out_dir=outdir_list[idx],
                            encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                            encoder_model_name=cfg.encoder.get('model_name'),
                            encoder_tokenizer_model=self.encoder_tokenizer_model,
                            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
                            encoder_tokenizer_r2l=cfg.encoder_tokenizer.get('r2l', False),
                            decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                            decoder_model_name=cfg.decoder.get('model_name'),
                            decoder_tokenizer_model=self.decoder_tokenizer_model,
                            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
                            decoder_tokenizer_r2l=cfg.decoder_tokenizer.get('r2l', False),
                            max_seq_length=cfg.train_ds.get('max_seq_length', 512),
                            tokens_in_batch=cfg.train_ds.get('tokens_in_batch', 8192),
                            lines_per_dataset_fragment=cfg.train_ds.get('lines_per_dataset_fragment', 1000000),
                            num_batches_per_tarfile=cfg.train_ds.get('num_batches_per_tarfile', 1000),
                            min_seq_length=1,
                            global_rank=self.global_rank,
                            world_size=self.world_size,
                            n_jobs=cfg.train_ds.get('n_preproc_jobs', -2),
                            tar_file_prefix=cfg.train_ds.get('tar_file_prefix', 'parallel'),
                        )
                        metadata_file_list.append(self.train_metadata_file)
                    # update config
                    # self._cfg.train_ds.tar_files = self.tar_files_to_string(self.train_tar_files)
                    # self._cfg.train_ds.tar_files = self.train_tar_files
                    if isinstance(cfg.train_ds.get('metadata_file'), str):
                        self._cfg.train_ds.metadata_file = metadata_file_list[0]
                    else:
                        self._cfg.train_ds.metadata_file = metadata_file_list

                    logging.info(
                        f"Using tarred dataset created in folder(s) {outdir_list} and metadata created at {self._cfg.train_ds.metadata_file}"
                    )

                elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
                    raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
                elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:
                    if isinstance(cfg.train_ds.get('metadata_file'), str):
                        metadata_file_list = [cfg.train_ds.get('metadata_file')]
                    else:
                        metadata_file_list = cfg.train_ds.get('metadata_file')

                    for metadata_file in metadata_file_list:
                        with open(metadata_file) as metadata_reader:
                            metadata = json.load(metadata_reader)
                        if metadata['tar_files']:
                            logging.info(f"Using tarred dataset: {metadata['tar_files']}")
                        else:
                            raise ValueError('tar_files not provided and metadata does not have tar files')
                else:
                    self.train_tar_files = cfg.train_ds.get('tar_files')
                    self.train_metadata_file = cfg.train_ds.get('metadata_file')
                    logging.info(
                        f"Using tarred dataset from config at {self.train_tar_files} and metadata from {self.train_metadata_file}"
                    )
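A minimal usage sketch for this preprocessing class, driven by the constructor shown above; the import paths, YAML file name, and Trainer arguments are assumptions:

from omegaconf import OmegaConf
from pytorch_lightning import Trainer

from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc  # assumed module path

cfg = OmegaConf.load('conf/aayn_base.yaml')  # hypothetical experiment config with a `model` section
trainer = Trainer(gpus=1, num_nodes=1)       # older Lightning API, matching trainer.num_gpus above

# Trains tokenizers and builds the tarred dataset if the config asks for them.
preproc = MTDataPreproc(cfg=cfg.model, trainer=trainer)
model_cfg = preproc._cfg  # carries any tokenizer_model / metadata_file values filled in above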
Example #3
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        # global_rank and local_rank are set by the LightningModule in Lightning 1.2.0

        self.world_size = 1
        if trainer is not None:
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)

        self.src_language: str = cfg.get("src_language", None)
        self.tgt_language: str = cfg.get("tgt_language", None)

        # Instantiate tokenizers and register them to be saved with the NeMo model archive.
        # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
        # which can convert between tokens and token_ids for the SRC and TGT languages respectively.
        self.setup_enc_dec_tokenizers(
            encoder_tokenizer_name=cfg.encoder_tokenizer.tokenizer_name,
            encoder_tokenizer_model=cfg.encoder_tokenizer.tokenizer_model,
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_tokenizer_name=cfg.decoder_tokenizer.tokenizer_name,
            decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
        )

        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(source_lang=self.src_language,
                                                 target_lang=self.tgt_language)

        # TODO: Why is this base constructor call so late in the game?
        super().__init__(cfg=cfg, trainer=trainer)

        # TODO: use get_encoder function with support for HF and Megatron
        self.encoder = TransformerEncoderNM(
            vocab_size=self.encoder_vocab_size,
            hidden_size=cfg.encoder.hidden_size,
            num_layers=cfg.encoder.num_layers,
            inner_size=cfg.encoder.inner_size,
            max_sequence_length=cfg.encoder.max_sequence_length if hasattr(
                cfg.encoder, 'max_sequence_length') else 512,
            embedding_dropout=cfg.encoder.embedding_dropout if hasattr(
                cfg.encoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.encoder.learn_positional_encodings
            if hasattr(cfg.encoder, 'learn_positional_encodings') else False,
            num_attention_heads=cfg.encoder.num_attention_heads,
            ffn_dropout=cfg.encoder.ffn_dropout,
            attn_score_dropout=cfg.encoder.attn_score_dropout,
            attn_layer_dropout=cfg.encoder.attn_layer_dropout,
            hidden_act=cfg.encoder.hidden_act,
            mask_future=cfg.encoder.mask_future,
            pre_ln=cfg.encoder.pre_ln,
        )

        # TODO: use get_decoder function with support for HF and Megatron
        self.decoder = TransformerDecoderNM(
            vocab_size=self.decoder_vocab_size,
            hidden_size=cfg.decoder.hidden_size,
            num_layers=cfg.decoder.num_layers,
            inner_size=cfg.decoder.inner_size,
            max_sequence_length=cfg.decoder.max_sequence_length if hasattr(
                cfg.decoder, 'max_sequence_length') else 512,
            embedding_dropout=cfg.decoder.embedding_dropout if hasattr(
                cfg.decoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.decoder.learn_positional_encodings
            if hasattr(cfg.decoder, 'learn_positional_encodings') else False,
            num_attention_heads=cfg.decoder.num_attention_heads,
            ffn_dropout=cfg.decoder.ffn_dropout,
            attn_score_dropout=cfg.decoder.attn_score_dropout,
            attn_layer_dropout=cfg.decoder.attn_layer_dropout,
            hidden_act=cfg.decoder.hidden_act,
            pre_ln=cfg.decoder.pre_ln,
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size**0.5
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id,
            label_smoothing=cfg.label_smoothing)
        self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False,
                                                 take_avg_loss=True)
Example #4
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        # global_rank and local_rank are set by the LightningModule in Lightning 1.2.0

        self.world_size = 1
        if trainer is not None:
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)

        self.src_language = cfg.get("src_language", None)
        self.tgt_language = cfg.get("tgt_language", None)

        self.multilingual = cfg.get("multilingual", False)
        self.multilingual_ids = []

        self.encoder_tokenizer_library = cfg.encoder_tokenizer.get(
            'library', 'yttm')
        self.decoder_tokenizer_library = cfg.decoder_tokenizer.get(
            'library', 'yttm')

        # Instantiate tokenizers and register them to be saved with the NeMo model archive.
        # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
        # which can convert between tokens and token_ids for the SRC and TGT languages respectively.
        self.setup_enc_dec_tokenizers(
            encoder_tokenizer_library=self.encoder_tokenizer_library,
            encoder_tokenizer_model=cfg.encoder_tokenizer.get(
                'tokenizer_model'),
            encoder_bpe_dropout=cfg.encoder_tokenizer.get(
                'bpe_dropout', 0.0) if cfg.encoder_tokenizer.get(
                    'bpe_dropout', 0.0) is not None else 0.0,
            encoder_model_name=cfg.encoder.get('model_name') if hasattr(
                cfg.encoder, 'model_name') else None,
            encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
            decoder_tokenizer_library=self.decoder_tokenizer_library,
            encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get(
                'vocab_file', None),
            decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get(
                'bpe_dropout', 0.0) if cfg.decoder_tokenizer.get(
                    'bpe_dropout', 0.0) is not None else 0.0,
            decoder_model_name=cfg.decoder.get('model_name') if hasattr(
                cfg.decoder, 'model_name') else None,
            decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
        )

        if self.multilingual:
            if isinstance(self.src_language, ListConfig) and isinstance(
                    self.tgt_language, ListConfig):
                raise ValueError(
                    "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
                )
            elif isinstance(self.src_language, ListConfig):
                for lng in self.src_language:
                    self.multilingual_ids.append(
                        self.encoder_tokenizer.token_to_id("<" + lng + ">"))
            elif isinstance(self.tgt_language, ListConfig):
                for lng in self.tgt_language:
                    self.multilingual_ids.append(
                        self.encoder_tokenizer.token_to_id("<" + lng + ">"))
            else:
                raise ValueError(
                    "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
                )

            if isinstance(self.src_language, ListConfig):
                self.tgt_language = [self.tgt_language] * len(
                    self.src_language)
            else:
                self.src_language = [self.src_language] * len(
                    self.tgt_language)

            self.source_processor_list = []
            self.target_processor_list = []
            for src_lng, tgt_lng in zip(self.src_language, self.tgt_language):
                src_prcsr, tgt_prscr = self.setup_pre_and_post_processing_utils(
                    src_lng, tgt_lng)
                self.source_processor_list.append(src_prcsr)
                self.target_processor_list.append(tgt_prscr)

        else:
            # After this call, the model will have self.source_processor and self.target_processor objects
            self.setup_pre_and_post_processing_utils(self.src_language,
                                                     self.tgt_language)
            self.multilingual_ids = [None]

        # TODO: Why is this base constructor call so late in the game?
        super().__init__(cfg=cfg, trainer=trainer)

        # encoder from NeMo, Megatron-LM, or HuggingFace
        encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
        encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
        library = encoder_cfg_dict.pop('library', 'nemo')
        model_name = encoder_cfg_dict.pop('model_name', None)
        pretrained = encoder_cfg_dict.pop('pretrained', False)
        checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
        self.encoder = get_transformer(
            library=library,
            model_name=model_name,
            pretrained=pretrained,
            config_dict=encoder_cfg_dict,
            encoder=True,
            pre_ln_final_layer_norm=encoder_cfg_dict.get(
                'pre_ln_final_layer_norm', False),
            checkpoint_file=checkpoint_file,
        )

        # decoder from NeMo, Megatron-LM, or HuggingFace
        decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
        decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
        library = decoder_cfg_dict.pop('library', 'nemo')
        model_name = decoder_cfg_dict.pop('model_name', None)
        pretrained = decoder_cfg_dict.pop('pretrained', False)
        decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
        self.decoder = get_transformer(
            library=library,
            model_name=model_name,
            pretrained=pretrained,
            config_dict=decoder_cfg_dict,
            encoder=False,
            pre_ln_final_layer_norm=decoder_cfg_dict.get(
                'pre_ln_final_layer_norm', False),
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size**0.5

        # initialize weights if not using pretrained encoder/decoder
        if not self._cfg.encoder.get('pretrained', False):
            self.encoder.apply(lambda module: transformer_weights_init(
                module, std_init_range))

        if not self._cfg.decoder.get('pretrained', False):
            self.decoder.apply(lambda module: transformer_weights_init(
                module, std_init_range))

        self.log_softmax.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id,
            label_smoothing=cfg.label_smoothing)
        self.eval_loss_fn = NLLLoss(ignore_index=self.decoder_tokenizer.pad_id)
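A minimal sketch of constructing the model from a parsed config and translating with it; the import path, config file, language pair, and the translate() call are assumptions about the surrounding class rather than something shown above:

from omegaconf import OmegaConf
from pytorch_lightning import Trainer

from nemo.collections.nlp.models import MTEncDecModel  # assumed import path

trainer = Trainer(gpus=1)                    # older Lightning API, matching trainer.num_gpus above
cfg = OmegaConf.load('conf/aayn_base.yaml')  # hypothetical config with model/trainer sections
model = MTEncDecModel(cfg=cfg.model, trainer=trainer)

# translate() is assumed to accept a list of sentences plus language hints.
print(model.translate(['Machine translation with NeMo.'], source_lang='en', target_lang='de'))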
Example #5
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None:
        self._cfg = cfg
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        if hasattr(cfg, 'train_ds'):
            supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
            supported_train_tokenizers = ['yttm', 'sentencepiece']

            if (
                cfg.encoder_tokenizer.get('library') not in supported_tokenizers
                or cfg.decoder_tokenizer.get('library') not in supported_tokenizers
            ):
                raise NotImplementedError(f"Currently we only support {supported_tokenizers}.")

            if cfg.get('shared_tokenizer') and cfg.encoder_tokenizer.get('library') != cfg.decoder_tokenizer.get(
                'library'
            ):
                raise ValueError("Shared tokenizers cannot be from different libraries.")

            # Prepare tokenizers
            if (
                cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
                or cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
            ):

                # Train tokenizer models if using yttm or sentencepiece and they don't exist
                if (
                    cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
                    and cfg.encoder_tokenizer.get('tokenizer_model') is None
                ) or (
                    cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
                    and cfg.decoder_tokenizer.get('tokenizer_model') is None
                ):
                    if cfg.get('preproc_out_dir') is None:
                        raise ValueError('Tokenizer model training required but cfg.preproc_out_dir is None.')
                    if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                        raise ValueError(
                            'src_file_name and tgt_file_name needed to train tokenizers but could not be found.'
                        )
                    # train tokenizer model on training data
                    self.encoder_tokenizer_model, self.decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
                        out_dir=cfg.get('preproc_out_dir'),
                        src_fname=cfg.train_ds.get('src_file_name'),
                        tgt_fname=cfg.train_ds.get('tgt_file_name'),
                        shared_tokenizer=cfg.get('shared_tokenizer'),
                        encoder_tokenizer_vocab_size=cfg.encoder_tokenizer.get('vocab_size'),
                        decoder_tokenizer_vocab_size=cfg.decoder_tokenizer.get('vocab_size'),
                        encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                        decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                        encoder_tokenizer_coverage=cfg.encoder_tokenizer.get('coverage', 0.999),
                        decoder_tokenizer_coverage=cfg.decoder_tokenizer.get('coverage', 0.999),
                        global_rank=self.global_rank,
                        encoder_training_sample_size=cfg.encoder_tokenizer.get('training_sample_size', -1),
                        decoder_training_sample_size=cfg.decoder_tokenizer.get('training_sample_size', -1),
                        encoder_special_tokens=OmegaConf.to_container(cfg.encoder_tokenizer.special_tokens)
                        if cfg.encoder_tokenizer.special_tokens
                        else None,
                        decoder_special_tokens=OmegaConf.to_container(cfg.decoder_tokenizer.special_tokens)
                        if cfg.decoder_tokenizer.special_tokens
                        else None,
                    )
                    # update config
                    self._cfg.encoder_tokenizer.tokenizer_model = self.encoder_tokenizer_model
                    self._cfg.decoder_tokenizer.tokenizer_model = self.decoder_tokenizer_model
                else:
                    self.encoder_tokenizer_model = cfg.encoder_tokenizer.get('tokenizer_model')
                    self.decoder_tokenizer_model = cfg.decoder_tokenizer.get('tokenizer_model')

            self.encoder_tokenizer, self.decoder_tokenizer = self.get_enc_dec_tokenizers(
                encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                encoder_model_name=cfg.encoder.get('model_name'),
                encoder_tokenizer_model=self.encoder_tokenizer_model,
                encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
                decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                decoder_model_name=cfg.decoder.get('model_name'),
                decoder_tokenizer_model=self.decoder_tokenizer_model,
                decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
            )

            # If using tarred dataset for training, automatically create it if needed
            if cfg.train_ds.get('use_tarred_dataset'):
                if cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is None:
                    if cfg.get('preproc_out_dir') is None:
                        raise ValueError('Data preprocessing required but cfg.preproc_out_dir is None.')
                    if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                        raise ValueError(
                            'src_file_name and tgt_file_name needed to create tarred dataset but could not be found.'
                        )
                    # Preprocess data and cache for use during training
                    if self.global_rank == 0:
                        logging.info(
                            f"Using tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
                        )
                    # TODO: have to get tokenizers inside .preprocess_parallel because they can't be pickled
                    self.train_tar_files, self.train_metadata_file = MTDataPreproc.preprocess_parallel_dataset(
                        clean=cfg.train_ds.clean,
                        src_fname=cfg.train_ds.get('src_file_name'),
                        tgt_fname=cfg.train_ds.get('tgt_file_name'),
                        out_dir=cfg.get('preproc_out_dir'),
                        encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                        encoder_model_name=cfg.encoder.get('model_name'),
                        encoder_tokenizer_model=self.encoder_tokenizer_model,
                        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
                        decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                        decoder_model_name=cfg.decoder.get('model_name'),
                        decoder_tokenizer_model=self.decoder_tokenizer_model,
                        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
                        max_seq_length=cfg.train_ds.get('max_seq_length', 512),
                        tokens_in_batch=cfg.train_ds.get('tokens_in_batch', 8192),
                        lines_per_dataset_fragment=cfg.train_ds.get('lines_per_dataset_fragment', 1000000),
                        num_batches_per_tarfile=cfg.train_ds.get('num_batches_per_tarfile', 1000),
                        min_seq_length=1,
                        global_rank=self.global_rank,
                        world_size=self.world_size,
                        n_jobs=cfg.train_ds.get('n_preproc_jobs', -2),
                        tar_file_prefix=cfg.train_ds.get('tar_file_prefix', 'parallel'),
                    )
                    # update config
                    # self._cfg.train_ds.tar_files = self.tar_files_to_string(self.train_tar_files)
                    # self._cfg.train_ds.tar_files = self.train_tar_files
                    self._cfg.train_ds.metadata_file = self.train_metadata_file
                    logging.info(
                        f"Using tarred dataset created at {self.train_tar_files} and metadata created at {self._cfg.train_ds.metadata_file}"
                    )
                elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
                    raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
                elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:
                    # metadata_file is a path, so open it before handing it to json.load
                    with open(cfg.train_ds.get('metadata_file')) as metadata_reader:
                        metadata = json.load(metadata_reader)
                    if metadata['train_tar_files']:
                        logging.info(f"Using tarred dataset: {metadata['train_tar_files']}")
                    else:
                        raise ValueError('tar_files not provided and metadata does not have tar files')
                else:
                    self.train_tar_files = cfg.train_ds.get('tar_files')
                    self.train_metadata_file = cfg.train_ds.get('metadata_file')
                    logging.info(
                        f"Using tarred dataset from config at {self.train_tar_files} and metadata from {self.train_metadata_file}"
                    )
Example #6
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)
        self.setup_enc_dec_tokenizers(cfg)

        super().__init__(cfg=cfg, trainer=trainer)

        self.src_language: str = cfg.get("src_language", None)
        self.tgt_language: str = cfg.get("tgt_language", None)

        # TODO: use get_encoder function with support for HF and Megatron
        self.encoder = TransformerEncoderNM(
            vocab_size=self.encoder_vocab_size,
            hidden_size=cfg.encoder.hidden_size,
            num_layers=cfg.encoder.num_layers,
            inner_size=cfg.encoder.inner_size,
            max_sequence_length=cfg.encoder.max_sequence_length
            if hasattr(cfg.encoder, 'max_sequence_length')
            else 512,
            embedding_dropout=cfg.encoder.embedding_dropout if hasattr(cfg.encoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.encoder.learn_positional_encodings
            if hasattr(cfg.encoder, 'learn_positional_encodings')
            else False,
            num_attention_heads=cfg.encoder.num_attention_heads,
            ffn_dropout=cfg.encoder.ffn_dropout,
            attn_score_dropout=cfg.encoder.attn_score_dropout,
            attn_layer_dropout=cfg.encoder.attn_layer_dropout,
            hidden_act=cfg.encoder.hidden_act,
            mask_future=cfg.encoder.mask_future,
            pre_ln=cfg.encoder.pre_ln,
        )

        # TODO: use get_decoder function with support for HF and Megatron
        self.decoder = TransformerDecoderNM(
            vocab_size=self.decoder_vocab_size,
            hidden_size=cfg.decoder.hidden_size,
            num_layers=cfg.decoder.num_layers,
            inner_size=cfg.decoder.inner_size,
            max_sequence_length=cfg.decoder.max_sequence_length
            if hasattr(cfg.decoder, 'max_sequence_length')
            else 512,
            embedding_dropout=cfg.decoder.embedding_dropout if hasattr(cfg.decoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.decoder.learn_positional_encodings
            if hasattr(cfg.decoder, 'learn_positional_encodings')
            else False,
            num_attention_heads=cfg.decoder.num_attention_heads,
            ffn_dropout=cfg.decoder.ffn_dropout,
            attn_score_dropout=cfg.decoder.attn_score_dropout,
            attn_layer_dropout=cfg.decoder.attn_layer_dropout,
            hidden_act=cfg.decoder.hidden_act,
            pre_ln=cfg.decoder.pre_ln,
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size ** 0.5
        self.apply(lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
        )
        self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
Example #7
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        # global_rank and local_rank are set by the LightningModule in Lightning 1.2.0

        self.world_size = 1
        if trainer is not None:
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)

        self.src_language: str = cfg.get("src_language", None)
        self.tgt_language: str = cfg.get("tgt_language", None)

        # Instantiate tokenizers and register them to be saved with the NeMo model archive.
        # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
        # which can convert between tokens and token_ids for the SRC and TGT languages respectively.
        self.setup_enc_dec_tokenizers(
            encoder_tokenizer_library=cfg.encoder_tokenizer.get('library', 'yttm'),
            encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
            decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'),
            decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
        )

        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

        # TODO: Why is this base constructor call so late in the game?
        super().__init__(cfg=cfg, trainer=trainer)

        # encoder from NeMo, Megatron-LM, or HuggingFace
        encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
        encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
        library = encoder_cfg_dict.pop('library', 'nemo')
        model_name = encoder_cfg_dict.pop('model_name', None)
        pretrained = encoder_cfg_dict.pop('pretrained', False)
        self.encoder = get_transformer(
            library=library, model_name=model_name, pretrained=pretrained, config_dict=encoder_cfg_dict, encoder=True,
        )

        # decoder from NeMo, Megatron-LM, or HuggingFace
        decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
        decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
        library = decoder_cfg_dict.pop('library', 'nemo')
        model_name = decoder_cfg_dict.pop('model_name', None)
        pretrained = decoder_cfg_dict.pop('pretrained', False)
        decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
        self.decoder = get_transformer(
            library=library, model_name=model_name, pretrained=pretrained, config_dict=decoder_cfg_dict, encoder=False,
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size ** 0.5

        # initialize weights if not using pretrained encoder/decoder
        if not self._cfg.encoder.get('pretrained', False):
            self.encoder.apply(lambda module: transformer_weights_init(module, std_init_range))

        if not self._cfg.decoder.get('pretrained', False):
            self.decoder.apply(lambda module: transformer_weights_init(module, std_init_range))

        self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
        )
        self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
Example #8
    def __init__(self,
                 cfg: MTEncDecModelConfig,
                 trainer: Trainer = None) -> None:
        self._cfg = cfg
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank *
                                trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        if hasattr(cfg, 'train_ds'):
            if (cfg.encoder_tokenizer.get('tokenizer_name') != 'yttm'
                    or cfg.decoder_tokenizer.get('tokenizer_name') != 'yttm'):
                raise NotImplementedError(
                    "Currently we only support the yttm tokenizer.")

            # Train tokenizer models if they don't exist
            if (cfg.encoder_tokenizer.get('tokenizer_model') is None
                    or cfg.decoder_tokenizer.get('tokenizer_model') is None):
                if cfg.get('preproc_out_dir') is None:
                    raise ValueError(
                        'Tokenizer model training required but cfg.preproc_out_dir is None.'
                    )
                if cfg.train_ds.get(
                        'src_file_name') is None or cfg.train_ds.get(
                            'tgt_file_name') is None:
                    raise ValueError(
                        'src_file_name and tgt_file_name needed to train tokenizers but could not be found.'
                    )
                # train tokenizer model on training data
                self.encoder_tokenizer_model, self.decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
                    out_dir=cfg.get('preproc_out_dir'),
                    src_fname=cfg.train_ds.get('src_file_name'),
                    tgt_fname=cfg.train_ds.get('tgt_file_name'),
                    shared_tokenizer=cfg.get('shared_tokenizer'),
                    encoder_tokenizer_vocab_size=cfg.encoder_tokenizer.get(
                        'vocab_size'),
                    decoder_tokenizer_vocab_size=cfg.decoder_tokenizer.get(
                        'vocab_size'),
                    encoder_tokenizer_name=cfg.encoder_tokenizer.get(
                        'tokenizer_name'),
                    decoder_tokenizer_name=cfg.decoder_tokenizer.get(
                        'tokenizer_name'),
                    encoder_tokenizer_coverage=cfg.encoder_tokenizer.get(
                        'coverage', 0.999),
                    decoder_tokenizer_coverage=cfg.decoder_tokenizer.get(
                        'coverage', 0.999),
                    global_rank=self.global_rank,
                )
                # update config
                self._cfg.encoder_tokenizer.tokenizer_model = self.encoder_tokenizer_model
                self._cfg.decoder_tokenizer.tokenizer_model = self.decoder_tokenizer_model
            else:
                self.encoder_tokenizer_model = cfg.encoder_tokenizer.get(
                    'tokenizer_model')
                self.decoder_tokenizer_model = cfg.decoder_tokenizer.get(
                    'tokenizer_model')

            self.encoder_tokenizer, self.decoder_tokenizer = self.get_enc_dec_tokenizers(
                encoder_tokenizer_name=cfg.encoder_tokenizer.get(
                    'tokenizer_name'),
                encoder_tokenizer_model=self.encoder_tokenizer_model,
                encoder_bpe_dropout=cfg.encoder_tokenizer.get(
                    'bpe_dropout', 0.0),
                decoder_tokenizer_name=cfg.decoder_tokenizer.get(
                    'tokenizer_name'),
                decoder_tokenizer_model=self.decoder_tokenizer_model,
                decoder_bpe_dropout=cfg.decoder_tokenizer.get(
                    'bpe_dropout', 0.0),
            )

            # If using tarred dataset for training, automatically create it if needed
            if cfg.train_ds.get('use_tarred_dataset'):
                if cfg.train_ds.get('tar_files') is None or cfg.train_ds.get(
                        'metadata_file') is None:
                    if cfg.get('preproc_out_dir') is None:
                        raise ValueError(
                            'Data preprocessing required but cfg.preproc_out_dir is None.'
                        )
                    if cfg.train_ds.get(
                            'src_file_name') is None or cfg.train_ds.get(
                                'tgt_file_name') is None:
                        raise ValueError(
                            'src_file_name and tgt_file_name needed to create tarred dataset but could not be found.'
                        )
                    # Preprocess data and cache for use during training
                    if self.global_rank == 0:
                        logging.info(
                            f"Using tarred dataset for src {cfg.train_ds.get('src_file_name')} and tgt {cfg.train_ds.get('tgt_file_name')}"
                        )
                    self.train_tar_files, self.train_metadata_file = MTDataPreproc.preprocess_parallel_dataset(
                        clean=cfg.train_ds.clean,
                        src_fname=cfg.train_ds.get('src_file_name'),
                        tgt_fname=cfg.train_ds.get('tgt_file_name'),
                        out_dir=cfg.get('preproc_out_dir'),
                        encoder_tokenizer=self.encoder_tokenizer,
                        decoder_tokenizer=self.decoder_tokenizer,
                        max_seq_length=cfg.train_ds.get('max_seq_length', 512),
                        tokens_in_batch=cfg.train_ds.get(
                            'tokens_in_batch', 8192),
                        lines_per_dataset_fragment=cfg.train_ds.get(
                            'lines_per_dataset_fragment', 1000000),
                        num_batches_per_tarfile=cfg.train_ds.get(
                            'num_batches_per_tarfile', 1000),
                        min_seq_length=1,
                        pkl_file_prefix=cfg.train_ds.get(
                            'pkl_file_preifx', 'parallel'),
                        global_rank=self.global_rank,
                        world_size=self.world_size,
                    )
                    # update config
                    self._cfg.train_ds.tar_files = self.tar_files_to_string(
                        self.train_tar_files)
                    self._cfg.train_ds.metadata_file = self.train_metadata_file
                    logging.info(
                        f"Using tarred dataset created at {self._cfg.train_ds.tar_files} and metadata created at {self._cfg.train_ds.metadata_file}"
                    )
                else:
                    self.train_tar_files = cfg.train_ds.get('tar_files')
                    self.train_metadata_file = cfg.train_ds.get(
                        'metadata_file')
                    logging.info(
                        f"Using tarred dataset from config at {self.train_tar_files} and metadata from {self.train_metadata_file}"
                    )