Example 1
def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {cfg.pretty()}')

    # training is managed by PyTorch Lightning
    trainer = Trainer(**cfg.trainer)

    # tokenizers will be trained, and tarred training data will be created if needed
    # model config is then updated
    MTDataPreproc(cfg=cfg.model, trainer=trainer)

    if cfg.do_training:
        # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
        exp_manager(trainer, cfg.exp_manager)

    # everything needed to train translation models is encapsulated in the NeMo MTEncDecModel
        mt_model = MTEncDecModel(cfg.model, trainer=trainer)

        logging.info("\n\n************** Model parameters and their sizes ***********")
        for name, param in mt_model.named_parameters():
            print(name, param.size())
        logging.info("***********************************************************\n\n")

        trainer.fit(mt_model)
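
For context, a script like this is normally wired up as a Hydra entry point. Below is a minimal sketch of that wiring; the config path/name and import paths are illustrative assumptions, not taken from the snippet above.

# Minimal entry-point sketch; config path/name are illustrative assumptions.
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecConfig
from nemo.core.config import hydra_runner


@hydra_runner(config_path="conf", config_name="aayn_base")
def main(cfg: MTEncDecConfig) -> None:
    ...  # body as in the example above


if __name__ == '__main__':
    main()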
Example 2
 def build_train_valid_test_datasets(self):
     self._train_ds = MTEncDecModel._setup_dataset_from_config(
         cfg=self._cfg.train_ds,
         encoder_tokenizer=self.encoder_tokenizer,
         decoder_tokenizer=self.decoder_tokenizer,
         global_rank=parallel_state.get_data_parallel_rank(),
         world_size=parallel_state.get_data_parallel_world_size(),
         multilingual=self.multilingual,
         multilingual_ids=self.multilingual_ids,
     )
     self._validation_ds = MTEncDecModel._setup_eval_dataset_from_config(
         cfg=self._cfg.validation_ds,
         multilingual=self.multilingual,
         multilingual_ids=self.multilingual_ids,
         encoder_tokenizer=self.encoder_tokenizer,
         decoder_tokenizer=self.decoder_tokenizer,
     )
     # Test data config is optional.
     if hasattr(self._cfg, 'test_ds'):
         self._test_ds = MTEncDecModel._setup_eval_dataset_from_config(
             cfg=self._cfg.test_ds,
             multilingual=self.multilingual,
             multilingual_ids=self.multilingual_ids,
             encoder_tokenizer=self.encoder_tokenizer,
             decoder_tokenizer=self.decoder_tokenizer,
         )
Example 3
    def build_train_valid_test_datasets(self):
        """Builds the train, validation, and test datasets."""

        # Builds datasets if the type is tarred or from raw text without memmap.
        if self._cfg.train_ds.dataset_type in ['tarred', 'text']:
            self._train_ds = self.build_tarred_train_dataset()
        elif self._cfg.train_ds.dataset_type in ['bin_memmap', 'text_memmap']:
            self._train_ds = self.build_memmap_dataset_from_config(
                self._cfg.train_ds)

        if self._cfg.validation_ds.get("dataset_type", "text") != "text":
            raise ValueError(
                f"Validation dataset type must be 'text', found {self._cfg.validation_ds.dataset_type}"
            )

        self._validation_ds = MTEncDecModel._setup_eval_dataset_from_config(
            cfg=self._cfg.validation_ds,
            multilingual=self.multilingual,
            multilingual_ids=self.multilingual_ids,
            encoder_tokenizer=self.encoder_tokenizer,
            decoder_tokenizer=self.decoder_tokenizer,
        )
        # Test data config is optional.
        if hasattr(self._cfg, 'test_ds'):
            if self._cfg.test_ds.get("dataset_type", "text") != "text":
                raise ValueError(
                    f"Test dataset type must be 'text', found {self._cfg.test_ds.dataset_type}"
                )
            self._test_ds = MTEncDecModel._setup_eval_dataset_from_config(
                cfg=self._cfg.test_ds,
                multilingual=self.multilingual,
                multilingual_ids=self.multilingual_ids,
                encoder_tokenizer=self.encoder_tokenizer,
                decoder_tokenizer=self.decoder_tokenizer,
            )
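
For reference, a train_ds fragment that would route through the memmap branch above might look like the following. Only dataset_type is load-bearing for the branch selection; the file fields are illustrative assumptions.

# Hypothetical train_ds fragment; dataset_type selects the branch above.
from omegaconf import OmegaConf

train_ds = OmegaConf.create({
    "dataset_type": "bin_memmap",  # one of: tarred, text, bin_memmap, text_memmap
    "src_file_name": "train.src.bin",  # illustrative field names
    "tgt_file_name": "train.tgt.bin",
})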
Example 4
def translate(rank, world_size, args):
    if args.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = MTEncDecModel.restore_from(restore_path=args.model)
    elif args.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = MTEncDecModel.load_from_checkpoint(checkpoint_path=args.model)
    else:
        raise ValueError(f"Expected a .nemo or .ckpt model file, got: {args.model}")
    model.replace_beam_with_sampling(topk=args.topk)
    model.eval()
    model.to(rank)
    if args.twoside:
        dataset = TarredTranslationDataset(
            text_tar_filepaths=args.text2translate,
            metadata_path=args.metadata_path,
            encoder_tokenizer=model.encoder_tokenizer,
            decoder_tokenizer=model.decoder_tokenizer,
            shuffle_n=100,
            shard_strategy="scatter",
            world_size=world_size,
            global_rank=rank,
            reverse_lang_direction=args.reverse_lang_direction,
        )
    else:
        dataset = TarredOneSideTranslationDataset(
            text_tar_filepaths=args.text2translate,
            metadata_path=args.metadata_path,
            tokenizer=model.encoder_tokenizer,
            shuffle_n=100,
            shard_strategy="scatter",
            world_size=world_size,
            global_rank=rank,
        )
    loader = DataLoader(dataset, batch_size=1)
    result_dir = os.path.join(args.result_dir, f'rank{rank}')
    os.makedirs(result_dir, exist_ok=True)
    originals_file_name = os.path.join(result_dir, 'originals.txt')
    translations_file_name = os.path.join(result_dir, 'translations.txt')
    num_translated_sentences = 0

    with open(originals_file_name, 'w') as of, open(translations_file_name,
                                                    'w') as tf:
        for batch_idx, batch in enumerate(loader):
            for i in range(len(batch)):
                if batch[i].ndim == 3:
                    batch[i] = batch[i].squeeze(dim=0)
                batch[i] = batch[i].to(rank)
            if args.twoside:
                src_ids, src_mask, _, _, _ = batch
            else:
                src_ids, src_mask = batch
            if batch_idx % 100 == 0:
                logging.info(
                    f"{batch_idx} batches ({num_translated_sentences} sentences) were translated by process with "
                    f"rank {rank}")
            num_translated_sentences += len(src_ids)
            inputs, translations = model.batch_translate(src=src_ids,
                                                         src_mask=src_mask)
            for src, translation in zip(inputs, translations):
                of.write(src + '\n')
                tf.write(translation + '\n')
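
Because translate() takes a process rank as its first argument, it can be fanned out across GPUs with torch.multiprocessing. A minimal launch sketch follows, assuming one process per visible GPU; get_args() is a hypothetical argparse helper, not part of the snippet.

# Minimal multi-GPU launch sketch; mp.spawn passes the process rank as the
# first positional argument to translate().
import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    args = get_args()  # hypothetical: builds the argparse namespace used above
    world_size = torch.cuda.device_count()
    mp.spawn(translate, args=(world_size, args), nprocs=world_size)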
Example 5
    def _build_tokenizer(self):
        # Instantiates tokenizers and registers them to be saved with the NeMo model archive.
        # After this call, self.encoder_tokenizer and self.decoder_tokenizer exist and can
        # convert between tokens and token ids for the SRC and TGT languages respectively.
        encoder_tokenizer_model = self.register_artifact(
            "encoder_tokenizer.tokenizer_model",
            self._cfg.encoder_tokenizer.get('tokenizer_model'))
        decoder_tokenizer_model = self.register_artifact(
            "decoder_tokenizer.tokenizer_model",
            self._cfg.decoder_tokenizer.get('tokenizer_model'))

        self.encoder_tokenizer, self.decoder_tokenizer = MTEncDecModel.setup_enc_dec_tokenizers(
            encoder_tokenizer_library=self.encoder_tokenizer_library,
            encoder_tokenizer_model=encoder_tokenizer_model,
            encoder_bpe_dropout=self._cfg.encoder_tokenizer.get('bpe_dropout') or 0.0,
            encoder_model_name=None,
            encoder_r2l=self._cfg.encoder_tokenizer.get('r2l', False),
            decoder_tokenizer_library=self.decoder_tokenizer_library,
            encoder_tokenizer_vocab_file=self._cfg.encoder_tokenizer.get(
                'vocab_file', None),
            decoder_tokenizer_model=decoder_tokenizer_model,
            decoder_bpe_dropout=self._cfg.decoder_tokenizer.get('bpe_dropout') or 0.0,
            decoder_model_name=None,
            decoder_r2l=self._cfg.decoder_tokenizer.get('r2l', False),
            special_tokens=self.special_tokens,
            encoder_sentencepiece_legacy=self._cfg.encoder_tokenizer.get(
                'sentencepiece_legacy', False),
            decoder_sentencepiece_legacy=self._cfg.decoder_tokenizer.get(
                'sentencepiece_legacy', False),
        )

        # Set up pre and post processors as well.
        if self.multilingual:
            (
                self.source_processor_list,
                self.target_processor_list,
                self.multilingual_ids,
            ) = MTEncDecModel.setup_multilingual_ids_and_processors(
                src_language=self.src_language,
                tgt_language=self.tgt_language,
                tokenizer=self.encoder_tokenizer,  # Multilingual training requires a shared tokenizer.
                tokenizer_library=self.encoder_tokenizer_library,
            )
        else:
            # After this call, the model will have self.source_processor and self.target_processor objects.
            self.source_processor, self.target_processor = MTEncDecModel.setup_pre_and_post_processing_utils(
                self.src_language,
                self.tgt_language,
                self.encoder_tokenizer_library,
                self.decoder_tokenizer_library,
            )
            self.multilingual_ids = [None]
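
The .get() calls above imply a tokenizer config shaped roughly like the fragment below. This is a hypothetical sketch: the keys mirror the accessors in the snippet, while the values are illustrative; decoder_tokenizer has the same shape.

# Hypothetical encoder_tokenizer config fragment.
from omegaconf import OmegaConf

tokenizer_cfg = OmegaConf.create({
    "encoder_tokenizer": {
        "tokenizer_model": "tokenizer.encoder.32000.BPE.model",
        "bpe_dropout": 0.1,
        "r2l": False,
        "vocab_file": None,
        "sentencepiece_legacy": False,
    },
})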
Example 6
def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user-specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {cfg.pretty()}')

    trainer = Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)
    mt_model = MTEncDecModel(cfg.model, trainer=trainer)

    logging.info(
        "\n\n************** Model parameters and their sizes ***********")
    for name, param in mt_model.named_parameters():
        print(name, param.size())
    logging.info(
        "***********************************************************\n\n")
    trainer.fit(mt_model)
Example 7
 def build_tarred_train_dataset(self):
     return MTEncDecModel._setup_dataset_from_config(
         cfg=self._cfg.train_ds,
         encoder_tokenizer=self.encoder_tokenizer,
         decoder_tokenizer=self.decoder_tokenizer,
         global_rank=parallel_state.get_data_parallel_rank(),
         world_size=parallel_state.get_data_parallel_world_size(),
         multilingual=self.multilingual,
         multilingual_ids=self.multilingual_ids,
     )
Example 8
 def setup_training_data(self, train_data_config: Optional[DictConfig]):
     # TODO: Figure out how to set global rank and world size for model parallel.
     if hasattr(self, '_train_ds'):
         if train_data_config.dataset_type in ['tarred', 'text']:
             self._train_dl = MTEncDecModel._setup_dataloader_from_config(
                 cfg=train_data_config, dataset=self._train_ds)
         elif train_data_config.dataset_type in ['bin_memmap', 'text_memmap']:
             consumed_samples = self.compute_consumed_samples(0)
             self._train_dl = self._setup_megatron_dataloader_from_config(
                 cfg=train_data_config,
                 dataset=self._train_ds,
                 consumed_samples=consumed_samples)
Example 9
def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    # training is managed by PyTorch Lightning
    trainer_cfg = OmegaConf.to_container(cfg.trainer)
    trainer_cfg.pop('plugins', None)
    trainer = Trainer(plugins=[NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)],
                      **trainer_cfg)

    # tokenizers will be trained, and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
    exp_manager(trainer, cfg.exp_manager)

    # everything needed to train translation models is encapsulated in the NeMo MTEncDecModel
    mt_model = MTEncDecModel(cfg.model, trainer=trainer)

    logging.info(
        "\n\n************** Model parameters and their sizes ***********")
    for name, param in mt_model.named_parameters():
        print(name, param.size())
    logging.info(
        "***********************************************************\n\n")

    if cfg.do_training:
        trainer.fit(mt_model)

    if cfg.do_testing:
        trainer.test(mt_model)
Example 10
 def setup_training_data(self, train_data_config: Optional[DictConfig]):
     # TODO: Figure out how to set global rank and world size for model parallel.
     if hasattr(self, '_train_ds'):
         self._train_dl = MTEncDecModel._setup_dataloader_from_config(
             cfg=train_data_config, dataset=self._train_ds)
Example 11
    def translate(
        self,
        text: List[str],
        source_lang: str = None,
        target_lang: str = None,
        return_beam_scores: bool = False,
        log_timing: bool = False,
    ) -> List[str]:
        """
        Translates list of sentences from source language to target language.
        Should be regular text, this method performs its own tokenization/de-tokenization
        Args:
            text: list of strings to translate
            source_lang: if not "ignore", corresponding MosesTokenizer and MosesPunctNormalizer will be run
            target_lang: if not "ignore", corresponding MosesDecokenizer will be run
            return_beam_scores: if True, returns a list of translations and their corresponding beam scores.
            log_timing: if True, prints timing information.
        Returns:
            list of translated strings
        """
        # TODO: This will reset both source and target processors even if you want to reset just one.
        # NOTE: This will also set up appropriate source and target processors for a given src/tgt language for multilingual models instead of creating a list of them.
        if source_lang is not None or target_lang is not None:
            self.source_processor, self.target_processor = MTEncDecModel.setup_pre_and_post_processing_utils(
                source_lang, target_lang, self.encoder_tokenizer_library,
                self.decoder_tokenizer_library)

        mode = self.training
        prepend_ids = []
        if self.multilingual:
            if source_lang is None or target_lang is None:
                raise ValueError(
                    "Expect source_lang and target_lang to run inference for multilingual model."
                )
            src_symbol = self.encoder_tokenizer.token_to_id(f'<{source_lang}>')
            tgt_symbol = self.encoder_tokenizer.token_to_id(f'<{target_lang}>')
            if src_symbol in self.multilingual_ids:
                prepend_ids = [src_symbol]
            elif tgt_symbol in self.multilingual_ids:
                prepend_ids = [tgt_symbol]

        if log_timing:
            timer = timers.NamedTimer()
        else:
            timer = None

        cache = {
            "timer": timer,
        }

        try:
            self.eval()
            src, src_mask = MTEncDecModel.prepare_inference_batch(
                text=text,
                prepend_ids=prepend_ids,
                target=False,
                source_processor=self.source_processor,
                target_processor=self.target_processor,
                encoder_tokenizer=self.encoder_tokenizer,
                decoder_tokenizer=self.decoder_tokenizer,
                device=self.device,
            )
            predicted_tokens_ids, _ = self.decode(
                src,
                src_mask,
                # Generate up to src length + max_generation_delta tokens.
                # TODO: Implement better stopping when everything hits <EOS>.
                src.size(1) + self._cfg.max_generation_delta,
                tokenizer=self.decoder_tokenizer,
            )
            best_translations = self.postprocess_outputs(
                outputs=predicted_tokens_ids,
                tokenizer=self.decoder_tokenizer,
                processor=self.target_processor)
            return_val = best_translations
        finally:
            self.train(mode=mode)

        if log_timing:
            timing = timer.export()
            timing["mean_src_length"] = src_mask.sum().cpu().item(
            ) / src_mask.shape[0]
            tgt, tgt_mask = self.prepare_inference_batch(
                text=best_translations,
                prepend_ids=prepend_ids,
                target=True,
                source_processor=self.source_processor,
                target_processor=self.target_processor,
                encoder_tokenizer=self.encoder_tokenizer,
                decoder_tokenizer=self.decoder_tokenizer,
                device=self.device,
            )
            timing["mean_tgt_length"] = tgt_mask.sum().cpu().item(
            ) / tgt_mask.shape[0]

            if isinstance(return_val, tuple):
                return_val = return_val + (timing, )
            else:
                return_val = (return_val, timing)

        return return_val
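
End to end, the method above is what a user-facing call exercises. A minimal usage sketch follows; the pretrained model name is an illustrative assumption.

# Minimal usage sketch for translate().
from nemo.collections.nlp.models import MTEncDecModel

model = MTEncDecModel.from_pretrained("nmt_en_de_transformer12x2")
print(model.translate(["Hello, world!"], source_lang="en", target_lang="de"))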