def test_omegaconf(self):
    """Ensure omegaconf raises an error when an unexpected argument is passed"""
    with pytest.raises(OmegaConfBaseException):
        exp_manager(pl.Trainer(accelerator='cpu'), {"unused": 1})
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])

    # Tokenizers are trained and tarred training data is created if needed;
    # the model config is then updated.
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # Update resume-from-checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override the timer callback with a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Hydra interpolation does not work here, as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None:
        if not hasattr(cfg.model, 'pretrained_model_type'):
            raise ValueError("Pretrained model type must be in [T5, BART].")

        assert cfg.model.pretrained_model_type in ['T5', 'BART']
        if cfg.model.pretrained_model_type == 'T5':
            pretrained_cfg = MegatronT5Model.restore_from(
                cfg.model.pretrained_model_path, trainer=trainer, return_config=True
            )
        else:
            pretrained_cfg = MegatronBARTModel.restore_from(
                cfg.model.pretrained_model_path, trainer=trainer, return_config=True
            )
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.masked_softmax_fusion = False
            # Set source and target language/multilingual
            pretrained_cfg.src_language = cfg.model.src_language
            pretrained_cfg.tgt_language = cfg.model.tgt_language
            pretrained_cfg.multilingual = cfg.model.multilingual
            pretrained_cfg.shared_tokenizer = True
            # Max generation delta
            pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta
            # Set label smoothing
            pretrained_cfg.label_smoothing = cfg.model.label_smoothing
            # Set tokenizer paths
            pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer
            pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer
            # Pre-trained models (e.g. mT5) should use the legacy sentencepiece tokenizer
            pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True
            pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True
            # Override dropout
            pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout
            pretrained_cfg.attention_dropout = cfg.model.attention_dropout
            # Override precision (set above from trainer.precision)
            pretrained_cfg.precision = trainer.precision
            # Override micro/global batch size
            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size
            # O2 AMP
            pretrained_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False)
            # Override data configs and their global/micro batch sizes
            pretrained_cfg.train_ds = cfg.model.train_ds
            pretrained_cfg.train_ds.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.train_ds.global_batch_size = cfg.model.global_batch_size
            pretrained_cfg.validation_ds = cfg.model.validation_ds
            pretrained_cfg.test_ds = cfg.model.test_ds
            # Class target for the new class being restored
            pretrained_cfg.target = "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel"
            # Optimizer overrides
            pretrained_cfg.optim = cfg.model.optim
        model = MegatronNMTModel.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronNMTModel(cfg.model, trainer)

    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
def main(cfg: DictConfig) -> None:
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with a single GPU and '
        'no DDP to obtain accurate results'
    )

    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError('model.test_ds was not found in the config, skipping evaluation')

    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)

    if not cfg.pretrained_model:
        raise ValueError(
            'To run the evaluation and inference script, a pre-trained model or .nemo file must be provided. '
            f'Choose from {TokenClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"'
        )

    if os.path.exists(cfg.pretrained_model):
        model = TokenClassificationModel.restore_from(cfg.pretrained_model)
    elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names():
        model = TokenClassificationModel.from_pretrained(cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide a path to a pre-trained .nemo checkpoint or choose from {TokenClassificationModel.list_available_models()}'
        )

    data_dir = cfg.model.dataset.get('data_dir', None)
    if not data_dir:
        raise ValueError(
            'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file '
            'with the "model.dataset.data_dir" argument'
        )
    if not os.path.exists(data_dir):
        raise ValueError(f'{data_dir} is not found')

    model.update_data_dir(data_dir=data_dir)
    model._cfg.dataset.use_cache = False

    if model.prepare_test(trainer):
        model.setup_test_data()
        trainer.test(model)
    else:
        raise ValueError('Terminating evaluation')

    model.evaluate_from_file(
        text_file=os.path.join(data_dir, cfg.model.test_ds.text_file),
        labels_file=os.path.join(data_dir, cfg.model.test_ds.labels_file),
        output_dir=exp_dir,
        add_confusion_matrix=True,
        normalize_confusion_matrix=True,
    )

    # Run inference on a few examples
    queries = [
        'we bought four shirts from the nvidia gear store in santa clara.',
        'Nvidia is a company.',
    ]
    results = model.add_predictions(queries, output_file='predictions.txt')

    for query, result in zip(queries, results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')

    logging.info(f'Results are saved at {exp_dir}')
def main(cfg: DictConfig) -> None:
    logging.info(f'Config Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    # Initialize the model using the config file
    model = MultiLabelIntentSlotClassificationModel(cfg.model, trainer=trainer)

    # Training
    logging.info("================================================================================================")
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')

    # Stop further testing, as fast_dev_run does not save checkpoints
    if trainer.fast_dev_run:
        return

    # After model training is done, you can load the model from the saved checkpoint
    # and evaluate it on a data file or on given queries.
    logging.info("================================================================================================")
    logging.info("Starting the testing of the trained model on the test set...")
    logging.info("We will load the latest saved checkpoint from the training...")

    # For evaluation and inference, you can load the previously trained model saved in a .nemo file
    # like this in your code, but here we just reuse the trained model:
    # eval_model = MultiLabelIntentSlotClassificationModel.restore_from(restore_path=checkpoint_path)
    eval_model = model

    # Set up the test data, reusing the same config (test section)
    eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir)
    eval_model.setup_test_data(test_data_config=cfg.model.test_ds)

    trainer.test(model=eval_model, ckpt_path=None, verbose=False)
    logging.info("Testing finished!")

    # Optimize the threshold
    eval_model.optimize_threshold(cfg.model.test_ds, 'dev')

    # Run inference on a few examples
    logging.info("======================================================================================")
    logging.info("Evaluating the model on the given queries...")

    # These examples work well if you train the model on the ATIS dataset;
    # for your own dataset, change the examples accordingly.
    queries = [
        'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',
        'on april first i need a ticket from tacoma to san jose departing before 7 am',
        'how much is the limousine service in boston',
    ]

    # We use the optimized threshold for predictions
    pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, cfg.model.test_ds)
    logging.info('The prediction results of some sample queries with the trained model:')

    for query, intent, slots in zip(queries, pred_intents, pred_slots):
        logging.info(f'Query : {query}')
        logging.info(f'Predicted Intents: {intent}')
        logging.info(f'Predicted Slots: {slots}')

    logging.info("Inference finished!")
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(NativeMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Override the timer callback with a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Get the T5 Base configuration.
    t5_cfg = MegatronT5GLUEModel.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
    )

    # Override the T5 configuration with the one from the config file.
    # NOTE: Only data can be overridden here, since the file being restored should already
    # correspond to a GLUE/XNLI finetuned model.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.precision = cfg.trainer.precision
        # Overwrite data configs
        t5_cfg.data = cfg.model.data
        # XNLI has eval languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    if hasattr(t5_cfg.data.validation_ds, 'task_name'):
        model = MegatronT5GLUEModel.restore_from(
            restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
        )
    else:
        model = MegatronT5FinetuneModel.restore_from(
            restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
        )

    model.freeze()
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)
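For context, Hydra-driven entry points like the one above are typically wrapped with NeMo's hydra_runner decorator and invoked with config overrides on the command line. A minimal sketch of that wiring, assuming NeMo's hydra_runner; the config_path and config_name values, and the example override keys in the comment, are illustrative placeholders, not taken from the snippet above.

import pytorch_lightning as pl
from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils.exp_manager import exp_manager


# The decorator resolves conf/config.yaml plus any CLI overrides into `cfg`,
# e.g.: python train_ctc.py trainer.max_epochs=50 model.train_ds.manifest_filepath=train.json
@hydra_runner(config_path="conf", config_name="config")
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(asr_model)


if __name__ == '__main__':
    main()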
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
    trainer.fit(model)
def main(cfg: DictConfig) -> None:
    pl.seed_everything(42)
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    try:
        plugin = NLPDDPPlugin()
    except (ImportError, ModuleNotFoundError):
        plugin = None

    trainer = pl.Trainer(**cfg.trainer, plugins=plugin)
    exp_manager(trainer, cfg.get("exp_manager", None))

    app_state = AppState()
    if cfg.model.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    if 'bert' in cfg.model.language_model.pretrained_model_name:
        if cfg.model.dataset.task == 'sgd':
            if cfg.model.original_nemo_checkpoint is not None:
                model_class = DialogueZeroShotIntentModel
            else:
                model_class = SGDQAModel
        elif cfg.model.dataset.task in ['zero_shot', 'design']:
            model_class = DialogueZeroShotIntentModel
        else:
            model_class = IntentSlotClassificationModel
    elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
        if cfg.model.dataset.task in ['ms_marco', 'mellon_qa']:
            model_class = DialogueGPTGenerationModel
        else:
            model_class = DialogueGPTClassificationModel
    elif (
        'bart' in cfg.model.language_model.pretrained_model_name.lower()
        or 't5' in cfg.model.language_model.pretrained_model_name.lower()
    ):
        # Please use bf16/32 with t5-large and above;
        # see https://github.com/huggingface/transformers/pull/10956
        model_class = DialogueS2SGenerationModel
    elif 'sentence-transformers' in cfg.model.language_model.pretrained_model_name.lower():
        model_class = DialogueNearestNeighbourModel

    if cfg.pretrained_model or (cfg.model.nemo_path and os.path.exists(cfg.model.nemo_path)):
        if cfg.pretrained_model:
            logging.info(f'Loading pretrained model {cfg.pretrained_model}')
            model = model_class.from_pretrained(cfg.pretrained_model)
        else:
            logging.info(f'Restoring model from {cfg.model.nemo_path}')
            model = model_class.restore_from(cfg.model.nemo_path)
        if cfg.do_training:
            model.setup_training_data(train_data_config=cfg.model.train_ds)
            model.setup_multiple_validation_data(val_data_config=cfg.model.validation_ds)
    else:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = model_class(cfg.model, trainer=trainer)

    if cfg.do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    else:
        data_dir = cfg.model.dataset.get('data_dir', None)
        dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir', None)

        if data_dir is None or dialogues_example_dir is None:
            raise ValueError('No dataset directory provided. Skipping evaluation.')
        elif not os.path.exists(data_dir):
            raise ValueError(f'{data_dir} is not found, skipping evaluation on the test set.')
        else:
            if hasattr(model, "update_data_dirs"):
                model.update_data_dirs(data_dir=data_dir, dialogues_example_dir=dialogues_example_dir)
                model._cfg.dataset = cfg.model.dataset

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
        eval_device = [cfg.trainer.devices[0]] if isinstance(cfg.trainer.devices, list) else 1
        trainer = pl.Trainer(devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16)
        model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
        if model.prepare_test(trainer):
            trainer.test(model)
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.model.train_ds.file_path:
        raise ValueError("'train_ds.file_path' needs to be set for training!")

    model = TextClassificationModel(cfg.model, trainer=trainer)
    logging.info("===========================================================================================")
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')
    logging.info("===========================================================================================")

    if cfg.model.nemo_path:
        # The '.nemo' file contains the last checkpoint and the params needed to initialize the model
        model.save_to(cfg.model.nemo_path)
        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # We evaluate the trained model on the test set if test_ds is set in the config file
    if cfg.model.test_ds.file_path:
        logging.info("===========================================================================================")
        logging.info("Starting the testing of the trained model on the test set...")
        trainer.test(model=model, ckpt_path=None, verbose=False)
        logging.info("Testing finished!")
        logging.info("===========================================================================================")

    # Perform inference on a list of queries
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info("===========================================================================================")
        logging.info("Starting inference on some sample queries...")

        # max_seq_length=512 is the maximum length BERT supports
        results = model.classifytext(queries=cfg.model.infer_samples, batch_size=16, max_seq_length=512)

        logging.info('The prediction results of some sample queries with the trained model:')
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')

        logging.info("Inference finished!")
        logging.info("===========================================================================================")
def test_omegaconf(self):
    """Ensure omegaconf raises an error when an unexpected argument is passed"""
    with pytest.raises(OmegaConfBaseException):
        exp_manager(None, {"unused": 1})
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        type=str,
        default="speakerverification_speakernet",
        required=False,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--finetune_config_file",
        type=str,
        required=True,
        help="Path to the speakernet config yaml file used to load the train/validation datasets and the trainer parameters",
    )
    parser.add_argument(
        "--freeze_encoder",
        type=bool,
        required=False,
        default=True,
        help="True if the speakernet encoder parameters need to be frozen while finetuning",
    )
    args = parser.parse_args()

    if args.pretrained_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.pretrained_model)
    elif args.pretrained_model.endswith('.ckpt'):
        logging.info(f"Using local speaker model from checkpoint {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(checkpoint_path=args.pretrained_model)
    else:
        logging.info("Using pretrained speaker recognition model from NGC")
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name=args.pretrained_model)

    finetune_config = OmegaConf.load(args.finetune_config_file)
    if 'test_ds' in finetune_config.model and finetune_config.model.test_ds is not None:
        finetune_config.model.test_ds = None
        logging.warning("Removing test ds")

    speaker_model.setup_finetune_model(finetune_config.model)
    finetune_trainer = pl.Trainer(**finetune_config.trainer)
    speaker_model.set_trainer(finetune_trainer)

    _ = exp_manager(finetune_trainer, finetune_config.get('exp_manager', None))
    speaker_model.setup_optimization(finetune_config.optim)

    if args.freeze_encoder:
        for param in speaker_model.encoder.parameters():
            param.requires_grad = False

    finetune_trainer.fit(speaker_model)
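One caveat worth flagging in the script above: argparse's type=bool converts any non-empty string to True, so passing --freeze_encoder false on the command line would still freeze the encoder. A common workaround is an explicit string-to-bool converter; the sketch below is illustrative and not part of the original script.

from argparse import ArgumentParser, ArgumentTypeError


def str2bool(value: str) -> bool:
    """Parse common boolean spellings from the command line."""
    if value.lower() in ('true', 't', '1', 'yes'):
        return True
    if value.lower() in ('false', 'f', '0', 'no'):
        return False
    raise ArgumentTypeError(f'Expected a boolean, got {value!r}')


parser = ArgumentParser()
# Using str2bool instead of type=bool makes `--freeze_encoder false` behave as expected.
parser.add_argument("--freeze_encoder", type=str2bool, default=True)
print(parser.parse_args(["--freeze_encoder", "false"]).freeze_encoder)  # False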
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.model.train_ds.file_path:
        raise ValueError("'train_ds.file_path' needs to be set for training!")

    model = PTuneTextClassificationModel(cfg.model, trainer=trainer)
    logging.info("===========================================================================================")
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')
    logging.info("===========================================================================================")

    # We evaluate the trained model on the test set if test_ds is set in the config file
    if cfg.model.test_ds.file_path:
        logging.info("===========================================================================================")
        logging.info("Starting the testing of the trained model on the test set...")
        trainer.test(model=model, ckpt_path=None, verbose=False)
        logging.info("Testing finished!")
        logging.info("===========================================================================================")

    # Extract the path of the best checkpoint from the training; you may update it to any checkpoint
    checkpoint_path = trainer.checkpoint_callback.best_model_path
    tensor_parallel_size = cfg.model.tensor_model_parallel_size
    pathobj = pathlib.Path(checkpoint_path)
    checkpoint_folder = str(pathobj.parent)
    checkpoint_name = str(pathobj.name)

    rank = trainer.accelerator.training_type_plugin.local_rank
    if tensor_parallel_size > 1:
        # Inject the model parallel rank
        checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name)
    else:
        checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name)

    # Load the checkpoint
    best_eval_model = PTuneTextClassificationModel.load_from_checkpoint(
        checkpoint_path=checkpoint_path, strict=False, trainer=trainer
    )
    logging.info(f'Best checkpoint path: {checkpoint_path}')
    logging.info("Running test with the best EVAL checkpoint!")

    # Set up the test dataset
    best_eval_model.setup_test_data(test_data_config=cfg.model.test_ds)
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    trainer.test(model=best_eval_model, ckpt_path=None, verbose=False)
    logging.info("Best EVAL testing finished!")
    logging.info("===========================================================================================")

    if cfg.model.nemo_path:
        # The '.nemo' file contains the last checkpoint and the params needed to initialize the model
        best_eval_model.save_to(cfg.model.nemo_path)
        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # Perform inference on a list of queries
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info("===========================================================================================")
        logging.info("Starting inference on some sample queries...")
        results = best_eval_model.cuda().classifytext(
            queries=cfg.model.infer_samples, batch_size=1, prompt='Sentiment'
        )
        logging.info('The prediction results of some sample queries with the trained model:')
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')
        logging.info("Inference finished!")
        logging.info("===========================================================================================")
def main(cfg: DictConfig) -> None:
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    bert_joint_ir_model = BertJointIRModel(cfg.model, trainer=trainer)
    trainer.fit(bert_joint_ir_model)
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,  # we don't use DDP for async grad allreduce
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Update resume-from-checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override the timer callback with a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Hydra interpolation does not work here, as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel(cfg.model, trainer)

    trainer.fit(model)
def test_trainer_loggers(self, tmp_path):
    """ Test that a trainer with a logger errors out with a number of arguments.
    Test that it works with create_tensorboard_logger set to False.
    """
    test_trainer = pl.Trainer(accelerator='cpu')  # Should create logger and modelcheckpoint

    with pytest.raises(LoggerMisconfigurationError):  # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"exp_dir": str(tmp_path)})
    with pytest.raises(LoggerMisconfigurationError):  # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"explicit_log_dir": str(tmp_path)})
    with pytest.raises(LoggerMisconfigurationError):  # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"resume_if_exists": True})

    # Check that exp_manager uses trainer.logger and its exp_dir, name, and version
    log_dir = exp_manager(test_trainer, {"create_tensorboard_logger": False, "create_checkpoint_callback": False})
    assert log_dir.resolve() == Path("./lightning_logs/version_0").resolve()
    assert Path("./lightning_logs").exists()
    assert Path("./lightning_logs/version_0").exists()

    # Check that a trainer without a logger gets a logger attached to it
    test_trainer = pl.Trainer(accelerator='cpu', logger=False)
    log_dir = exp_manager(
        test_trainer,
        {"create_tensorboard_logger": True, "create_checkpoint_callback": False, "exp_dir": str(tmp_path)},
    )
    assert isinstance(test_trainer.logger, pl.loggers.TensorBoardLogger)

    test_trainer = pl.Trainer(accelerator='cpu', logger=False)
    # Check that create_wandb_logger=True errors out unless wandb_logger_kwargs is passed
    with pytest.raises(ValueError):
        log_dir = exp_manager(
            test_trainer,
            {
                "create_tensorboard_logger": False,
                "create_checkpoint_callback": False,
                "exp_dir": str(tmp_path),
                "create_wandb_logger": True,
            },
        )
    # Check that a WandbLogger is attached to the logger if create_wandb_logger=True and
    # wandb_logger_kwargs has name and project
    log_dir = exp_manager(
        test_trainer,
        {
            "create_tensorboard_logger": False,
            "create_checkpoint_callback": False,
            "exp_dir": str(tmp_path),
            "create_wandb_logger": True,
            "wandb_logger_kwargs": {"name": "", "project": ""},
        },
    )
    assert isinstance(test_trainer.logger, pl.loggers.WandbLogger)
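Taken together, the tests above imply the shape of a typical exp_manager configuration dict. A minimal sketch follows; the exp_dir and name values are illustrative placeholders, and only the keys also exercised in these tests (plus the standard exp_dir/name pair) are assumed to exist.

# A minimal exp_manager configuration sketch; values are illustrative placeholders.
exp_manager_cfg = {
    "exp_dir": "./nemo_experiments",       # root directory for logs and checkpoints
    "name": "my_experiment",               # experiment name (subdirectory under exp_dir)
    "create_tensorboard_logger": True,     # attach a TensorBoardLogger to the trainer
    "create_checkpoint_callback": True,    # save checkpoints during training
    "resume_if_exists": False,             # set True to resume from an existing run
    "resume_ignore_no_checkpoint": True,   # don't fail when resuming a fresh directory
}
# exp_manager(trainer, exp_manager_cfg) then returns the resolved log directory.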
config.trainer.gpus = 1 if torch.cuda.is_available() else 0
# For mixed precision training, use precision=16 and amp_level=O1
config.trainer.precision = 16 if torch.cuda.is_available() else 32
config.trainer.max_epochs = args.epochs
config.trainer.accelerator = None  # Remove distributed training flags

if args.output != '':
    config.exp_manager.exp_dir = args.output

print(OmegaConf.to_yaml(config))

# Create trainer + model
trainer = pl.Trainer(**config.trainer)
model = nemo_nlp.models.QAModel(cfg=config.model, trainer=trainer)
exp_dir = str(exp_manager(trainer, config.get("exp_manager", None)))
print('experiment directory:', exp_dir)

# Start the training
trainer.fit(model)

# Test the model
model.setup_test_data(test_data_config=config.model.test_ds)
trainer.test(model)

# Example inference
all_preds, all_nbests = model.inference(
    file=config.model.test_ds.file,
    output_nbest_file=os.path.join(exp_dir, 'output_nbest.json'),
    output_prediction_file=os.path.join(exp_dir, 'output_prediction.json'),
)
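After inference, the written prediction file can be inspected directly. A small sketch, assuming the usual SQuAD-style layout of a mapping from question id to predicted answer string (an assumption, not confirmed by the snippet above).

import json
import os

# Load the prediction file written by model.inference above and print a few entries.
# Assumes a SQuAD-style mapping of question id -> predicted answer string.
with open(os.path.join(exp_dir, 'output_prediction.json')) as f:
    predictions = json.load(f)

for qid, answer in list(predictions.items())[:3]:
    print(qid, '->', answer)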
def test_resume(self, tmp_path):
    """ Tests the resume capabilities of exp_manager"""
    test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)

    # Error because explicit_log_dir does not exist
    with pytest.raises(NotFoundError):
        exp_manager(
            test_trainer,
            {
                "exp_dir": str(tmp_path / "test_resume"),
                "resume_if_exists": True,
                "explicit_log_dir": "Does_not_exist",
            },
        )

    # Error because the checkpoints folder does not exist
    with pytest.raises(NotFoundError):
        exp_manager(test_trainer, {"resume_if_exists": True, "exp_dir": str(tmp_path / "test_resume")})

    # No error because we tell exp_manager to ignore the NotFoundError
    exp_manager(
        test_trainer,
        {
            "resume_if_exists": True,
            "exp_dir": str(tmp_path / "test_resume_2"),
            "resume_ignore_no_checkpoint": True,
        },
    )

    test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True)
    # Error because no checkpoints exist in the folder
    with pytest.raises(NotFoundError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end.ckpt").touch()
    # Error because a *end.ckpt in the folder indicates that training has already finished
    with pytest.raises(ValueError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end.ckpt").unlink()
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt").touch()
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last.ckpt").touch()
    # Error because multiple *last.ckpt files are in the folder; with more than one, we don't know which to restore
    with pytest.raises(ValueError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    # Finally succeed
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last.ckpt").unlink()
    log_dir = exp_manager(
        test_trainer,
        {"resume_if_exists": True, "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0")},
    )
    checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt")
    assert Path(test_trainer.checkpoint_connector.resume_from_checkpoint_fit_path).resolve() == checkpoint.resolve()

    # Succeed again and make sure that run_0 exists and previous log files were moved
    test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)
    exp_manager(test_trainer, {"resume_if_exists": True, "explicit_log_dir": str(log_dir)})
    checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt")
    assert Path(test_trainer.checkpoint_connector.resume_from_checkpoint_fit_path).resolve() == checkpoint.resolve()
    prev_run_dir = Path(tmp_path / "test_resume" / "default" / "version_0" / "run_0")
    assert prev_run_dir.exists()
    prev_log = Path(tmp_path / "test_resume" / "default" / "version_0" / "run_0" / "lightning_logs.txt")
    assert prev_log.exists()
def main(cfg: DictConfig) -> None:
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with a single GPU and '
        'no DDP to obtain accurate results'
    )

    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError('model.test_ds was not found in the config, skipping evaluation')

    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)

    if not cfg.pretrained_model:
        raise ValueError(
            'To run the evaluation and inference script, a pre-trained model or .nemo file must be provided. '
            f'Choose from {PunctuationCapitalizationModel.list_available_models()} or "pretrained_model"="your_model.nemo"'
        )

    if os.path.exists(cfg.pretrained_model):
        model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model)
    elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names():
        model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide a path to a pre-trained .nemo file or choose from {PunctuationCapitalizationModel.list_available_models()}'
        )

    data_dir = cfg.model.dataset.get('data_dir', None)
    if data_dir is None:
        logging.error(
            'No dataset directory provided. Skipping evaluation. '
            'To run evaluation on a file, specify the path to the directory that contains test_ds.text_file and '
            'test_ds.labels_file with the "model.dataset.data_dir" argument.'
        )
    elif not os.path.exists(data_dir):
        logging.error(f'{data_dir} is not found, skipping evaluation on the test set.')
    else:
        model.update_data_dir(data_dir=data_dir)
        model._cfg.dataset = cfg.model.dataset

        if not hasattr(cfg.model, 'test_ds'):
            logging.error('model.test_ds was not found in the config, skipping evaluation')
        elif model.prepare_test(trainer):
            model.setup_test_data(cfg.model.test_ds)
            trainer.test(model)
        else:
            logging.error('Skipping the evaluation. The trainer is not set up properly.')

    # Run inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries, batch_size=len(queries), max_seq_length=512)

    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')

    logging.info(f'Results are saved at {exp_dir}')
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_dir = exp_manager(trainer, cfg.get("exp_manager", None))

    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = TokenClassificationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        model = TokenClassificationModel.from_pretrained(cfg.pretrained_model)

        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            # We can also finetune the pretrained model, but that requires
            # setting up train and validation PyTorch DataLoaders.
            # Set up the data dir to get class-weight statistics
            model.update_data_dir(data_dir=data_dir)
            # Then set up the loss; use model.dataset.class_balancing
            # if you want to add class weights to the CrossEntropyLoss
            model.setup_loss(class_balancing=cfg.model.dataset.class_balancing)
            # Finally, set up train and validation PyTorch DataLoaders
            model.setup_training_data()
            model.setup_validation_data()
            logging.info('Using the config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                'A data dir should be specified for finetuning the pretrained model. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )

    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)

    """
    After model training is done, you can use the model for inference.
    You can either evaluate data from a text_file that follows the training data format,
    or provide a list of queries you want to add entities to.

    During evaluation/testing, it is currently advisable to construct a new Trainer
    with a single GPU and no DDP to obtain accurate results.
    """
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with a single GPU '
        'and no DDP to obtain accurate results'
    )
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    if do_training:
        # Run evaluation on a dataset from a file;
        # only possible if model.dataset.data_dir is specified.
        # Change the path to the file you want to use for the final evaluation.
        model.evaluate_from_file(
            text_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.text_file),
            labels_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.labels_file),
            output_dir=exp_dir,
            add_confusion_matrix=True,
            normalize_confusion_matrix=True,
        )

    # Run inference on a few examples
    queries = ['we bought four shirts from the nvidia gear store in santa clara.', 'Nvidia is a company.']
    results = model.add_predictions(queries)

    for query, result in zip(queries, results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(model)