def test_check_val_every_n_epoch_with_max_steps(tmpdir):
    data_samples_train = 2
    check_val_every_n_epoch = 3
    max_epochs = 4

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_called_at_step = set()

        def validation_step(self, *args):
            self.validation_called_at_step.add(self.global_step)
            return super().validation_step(*args)

        def train_dataloader(self):
            return DataLoader(RandomDataset(32, data_samples_train))

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=data_samples_train * max_epochs,
        check_val_every_n_epoch=check_val_every_n_epoch,
        num_sanity_val_steps=0,
    )

    trainer.fit(model)

    assert trainer.current_epoch == max_epochs
    assert trainer.global_step == max_epochs * data_samples_train
    assert list(model.validation_called_at_step) == [
        data_samples_train * check_val_every_n_epoch
    ]

# NOTE: this test expects a @pytest.mark.parametrize decorator supplying
# (max_epochs, expected_val_loop_calls, expected_val_batches); the decorator
# is missing from this snippet.
def test_check_val_every_n_epoch(tmpdir, max_epochs, expected_val_loop_calls,
                                 expected_val_batches):
    class TestModel(BoringModel):
        val_epoch_calls = 0
        val_batches = []

        def on_train_epoch_end(self, *args, **kwargs):
            self.val_batches.append(
                self.trainer.progress_bar_callback.total_val_batches)

        def on_validation_epoch_start(self) -> None:
            self.val_epoch_calls += 1

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=max_epochs,
        num_sanity_val_steps=0,
        limit_val_batches=2,
        check_val_every_n_epoch=2,
        logger=False,
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert model.val_epoch_calls == expected_val_loop_calls
    assert model.val_batches == expected_val_batches
Example #3
def test():
    saved_model_path = './model.pth'  # symlink to the best model under lightning_logs/version_N/checkpoints/
    model = BertClassifier.load_from_checkpoint(saved_model_path)
    model.eval()
    print(model)
    trainer = Trainer(gpus=1)
    result = trainer.test(model)
    print(result)
Example #4
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        default="",
                        required=True,
                        help="Pass path to model's .nemo file")
    parser.add_argument("--prompt",
                        type=str,
                        default="",
                        required=True,
                        help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate",
                        type=int,
                        default="16",
                        required=False,
                        help="How many tokens to add to prompt")
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        required=True,
    )

    args = parser.parse_args()

    torch.set_grad_enabled(False)

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(),
                      devices=args.tensor_model_parallel_size,
                      precision=16,
                      accelerator='gpu')

    app_state = AppState()
    if args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    model = MegatronT5Model.restore_from(restore_path=args.model_file,
                                         trainer=trainer)
    model.freeze()
    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
    }

    dataset = T5RequestDataset(request, model.tokenizer)

    request_dl = DataLoader(dataset)

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
Example #5
def convert(local_rank, rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node
    if args.bcp:
        trainer = Trainer(devices=args.gpus_per_node,
                          num_nodes=num_nodes,
                          accelerator='gpu',
                          plugins=[TorchElasticEnvironment()])
    else:
        trainer = Trainer(devices=args.gpus_per_node,
                          num_nodes=num_nodes,
                          accelerator='gpu')

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    # inject model parallel rank
    checkpoint_path = inject_model_parallel_rank(
        os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'nmt':
        model = MegatronNMTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    else:
        raise ValueError(f"Unsupported model_type: {args.model_type}")
    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #6
    def setup_method(self, test_method):
        trainer_config = {
            "devices": 1,
            "num_nodes": 1,
            "accelerator": "gpu",
            "logger": False,
            "precision": 16,
        }
        tensor_model_parallel_size = 1
        pipeline_model_parallel_size = 1
        model_file = '/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo'

        # trainer required for restoring model parallel models
        trainer = Trainer(plugins=NLPDDPPlugin(), **trainer_config)
        assert (
            trainer_config["devices"] * trainer_config["num_nodes"]
            == tensor_model_parallel_size * pipeline_model_parallel_size
        ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

        model = MegatronGPTModel.restore_from(restore_path=model_file,
                                              trainer=trainer)
        model.freeze()

        # has to turn off activations_checkpoint_method for inference
        try:
            model.model.language_model.encoder.activations_checkpoint_method = None
        except AttributeError:
            pass

        self.model = model
Example #7
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL For an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank

    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{rank:02d}',
                                       args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    else:
        raise ValueError(f"Unsupported model_type: {args.model_type}")

    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #8
    def setup_class(cls):
        if not torch.cuda.is_available():
            return
        GPUS = 1
        plugins = [NLPDDPPlugin()]
        TP_SIZE = GPUS
        PP_SIZE = 1
        MB_SIZE = 4
        GB_SIZE = 8
        SEED = 1234
        trainer = Trainer(
            plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None
        )

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=TP_SIZE,
            pipeline_model_parallel_size=PP_SIZE,
            micro_batch_size=MB_SIZE,
            global_batch_size=GB_SIZE,
            seed=SEED,
            apex_transformer_log_level=30,
        )

        def dummy():
            return

        if trainer.strategy.launcher is not None:
            trainer.strategy.launcher.launch(dummy, trainer=trainer)
        trainer.strategy.setup_environment()
        torch.distributed.barrier()

def test_loops_state_dict():
    trainer = Trainer()
    trainer.train_dataloader = Mock()

    fit_loop = FitLoop()
    with pytest.raises(MisconfigurationException, match="Loop FitLoop should be connected to a"):
        fit_loop.trainer = object()

    fit_loop.trainer = trainer
    fit_loop.connect(Mock())
    state_dict = fit_loop.state_dict()

    new_fit_loop = FitLoop()
    new_fit_loop.trainer = trainer

    new_fit_loop.load_state_dict(state_dict)
    assert fit_loop.state_dict() == new_fit_loop.state_dict()

def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=2000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_2000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        accumulate_grad_batches=4,
        checkpoint_callback=checkpoint,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=0.3,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        val_percent_check=1,
        # distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        precision=16,
        nb_sanity_val_steps=0,
        progress_bar_refresh_rate=1,
        resume_from_checkpoint='exp/lightning_logs/version_2000/checkpoints/epoch=114_v1.ckpt')
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
Example #11
def train():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser = BertClassifier.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    model = BertClassifier(batch_size=args.batch_size,
                           learning_rate=args.learning_rate,
                           early_stop=args.early_stop)
    print(model)
    early_stopping = EarlyStopping('val_loss')
    trainer = Trainer.from_argparse_args(args,
                                         callbacks=[early_stopping],
                                         precision=16,
                                         gpus=1,
                                         max_epochs=30)
    trainer.fit(model)
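
A hedged sketch of how this training entry point might be launched; the model flag names are assumptions inferred from the BertClassifier constructor call above, while Trainer flags come from Trainer.add_argparse_args:

if __name__ == "__main__":
    # e.g. python train_classifier.py --batch_size 32 --learning_rate 2e-5
    # (model flag names assumed to be registered by BertClassifier.add_model_specific_args)
    train()
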
Example #12
def main(cfg) -> None:

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    app_state = AppState()
    app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
        app_state.pipeline_model_parallel_split_rank,
    ) = fake_initialize_model_parallel(
        world_size=app_state.model_parallel_size,
        rank=trainer.global_rank,
        tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
        pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
        pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
    )

    if cfg.model_file is not None:
        if not os.path.exists(cfg.model_file):
            raise ValueError(f"Model file {cfg.model_file} does not exist")
        model = MegatronNMTModel.restore_from(
            restore_path=cfg.model_file, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.checkpoint_dir is not None:
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    logging.info(f"Translating: {cfg.srctext}")
    src_text = []
    translations = []
    with open(cfg.srctext, 'r') as src_f, open(cfg.tgtout, 'w') as tgt_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == cfg.batch_size:
                translations = model.translate(
                    text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,
                )
                for translation in translations:
                    tgt_f.write(translation + "\n")
                src_text = []
        if len(src_text) > 0:
            translations = model.translate(text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,)
            for translation in translations:
                tgt_f.write(translation + "\n")
Example #13
    def score(self, src: List[str], cand: List[str],
              ref: List[str]) -> COMETResult:
        data = {"src": src, "mt": cand, "ref": ref}
        data = [dict(zip(data, t)) for t in zip(*data.values())]
        dataloader = DataLoader(
            dataset=data,
            batch_size=16,
            collate_fn=lambda x: self.model.prepare_sample(x, inference=True),
            num_workers=4,
        )
        cuda = 1 if torch.cuda.is_available() else 0
        trainer = Trainer(gpus=cuda, deterministic=True, logger=False)
        predictions = trainer.predict(self.model,
                                      dataloaders=dataloader,
                                      return_predictions=True)
        scores = torch.cat(predictions, dim=0).tolist()
        return COMETResult(
            sum(scores) / len(scores), scores, src, cand, ref, self.name,
            self.modelname)
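
A hedged usage sketch for the score() method above; comet_scorer is a placeholder for an instance of the enclosing class, whose construction is not shown in this snippet:

src = ["Der Hund bellt laut."]
cand = ["The dog barks loudly."]
ref = ["The dog is barking loudly."]
result = comet_scorer.score(src, cand, ref)  # placeholder instance; returns a COMETResult
print(result)
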
Example #14
def main(hparams):
    data_path = os.environ['HOME'] + '/data/asr_data/'
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=1020)
    checkpoint = ModelCheckpoint(filepath=data_path + '/checkpoints/',
                                 monitor='val_mer',
                                 verbose=1,
                                 save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path=data_path,
        val_check_interval=1.0,
        log_save_interval=100,
        row_log_interval=10,
        gpus=1,
        precision=16,
        distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0,
        log_gpu_memory='all')
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)

def test_loops_state_dict():
    fit_loop = FitLoop()
    with pytest.raises(MisconfigurationException,
                       match="Loop FitLoop should be connected to a"):
        fit_loop.connect(object())  # noqa

    fit_loop.connect(Trainer())
    state_dict = fit_loop.state_dict()
    new_fit_loop = FitLoop()
    new_fit_loop.load_state_dict(state_dict)
    assert fit_loop.state_dict() == new_fit_loop.state_dict()

def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=4000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_4000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0)
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
Example #17
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=5005)
    checkpoint = ModelCheckpoint(filepath='exp/lightning_logs/version_5005/checkpoints/',
                                 monitor='val_loss', verbose=True, save_top_k=-1, mode='min')
    trainer = Trainer(
        logger=logger,
        nb_sanity_val_steps=5,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        progress_bar_refresh_rate=10,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        # gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        gpus=1,
        # num_nodes=1,
        # distributed_backend='dp',
        use_amp=False,
        precision=32,
        # amp_level='O1',
        resume_from_checkpoint='exp/lightning_logs/version_5005/checkpoints/epoch=108.ckpt'
    )
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
Example #18
def test_loops_state_dict():
    trainer = Trainer()

    fit_loop = FitLoop()

    fit_loop.trainer = trainer
    state_dict = fit_loop.state_dict()

    new_fit_loop = FitLoop()
    new_fit_loop.trainer = trainer

    new_fit_loop.load_state_dict(state_dict)
    assert fit_loop.state_dict() == new_fit_loop.state_dict()
Example #19
def tune_model(config, ptl_model, dset, train_inds, n_workers, n_val = None, 
               val_inds = None, tune_metrics = None, mode = 'tune', **trainer_kwargs):
    ''' A generic function for hyperparameter tuning and model training with Ray Tune and PyTorch Lightning '''
    
    model = ptl_model(config = config)
    
    if val_inds is None:
        shuffle(train_inds)

    train_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[n_val:] if val_inds is None else train_inds),
        batch_size = config['batch_size'],
        num_workers = n_workers,
        drop_last = True,
        shuffle = True
    )
    val_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[:n_val] if val_inds is None else val_inds),
        num_workers = n_workers,
        batch_size = config['batch_size'],
        drop_last = True,
        shuffle = False
    )
    
    callbacks = model.callbacks
    if mode == 'tune':
        callbacks += [
            TuneReportCallback(
                tune_metrics, 
                on = 'validation_end'
            )
        ]

    trainer = PLTrainer(callbacks = callbacks, **trainer_kwargs)
    trainer.fit(model, train_dl, val_dl)
    
    return trainer
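
A minimal, hedged sketch of how tune_model could be driven by Ray Tune's function API; the search space, metric name, and the MyLightningModule / my_dataset objects are illustrative assumptions, not taken from the example above:

from functools import partial

from ray import tune

search_space = {
    "batch_size": tune.choice([16, 32, 64]),   # consumed by tune_model's DataLoaders
    "lr": tune.loguniform(1e-4, 1e-2),         # assumed to be read by the LightningModule
}

analysis = tune.run(
    partial(
        tune_model,
        ptl_model=MyLightningModule,            # placeholder LightningModule class
        dset=my_dataset,                        # placeholder torch Dataset
        train_inds=list(range(len(my_dataset))),
        n_workers=4,
        n_val=500,
        tune_metrics={"val_loss": "val_loss"},  # Tune metric name -> metric logged by the model
        max_epochs=10,                          # forwarded to the PL Trainer via **trainer_kwargs
    ),
    config=search_space,
    metric="val_loss",
    mode="min",
    num_samples=20,
)
print(analysis.best_config)
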
def test_loops_state_dict_structure():
    trainer = Trainer()
    # structure saved by the checkpoint connector
    state_dict = {
        "fit_loop": trainer.fit_loop.state_dict(),
        "validate_loop": trainer.validate_loop.state_dict(),
        "test_loop": trainer.test_loop.state_dict(),
        "predict_loop": trainer.predict_loop.state_dict(),
    }
    expected = {
        "fit_loop": {
            'epoch_loop': {
                'batch_loop': {},
                'val_loop': {},
            }
        },
        "validate_loop": {},
        "test_loop": {},
        "predict_loop": {},
    }
    assert state_dict == expected