Example #1
0
def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    seed=1234,
):
    """Record model-parallel settings in AppState and the mpu globals, then seed the RNG.

    Args:
        world_size: stored on the application state as ``world_size``.
        global_rank: stored on the application state as ``global_rank``.
        local_rank: passed to ``compute_model_parallel_rank`` to derive the
            model parallel rank.
        tensor_model_parallel_size: tensor model parallel world size; also
            stored as ``model_parallel_size`` on the application state.
        seed: value handed to ``_set_random_seed``.
    """
    # NeMo-side globals live on the AppState object.
    state = AppState()
    for attr_name, attr_value in (
        ('global_rank', global_rank),
        ('world_size', world_size),
        ('model_parallel_size', tensor_model_parallel_size),
    ):
        setattr(state, attr_name, attr_value)
    mp_rank = compute_model_parallel_rank(local_rank,
                                          tensor_model_parallel_size)
    state.model_parallel_rank = mp_rank

    # Mirror the tensor-parallel settings into the apex.mpu globals.
    set_tensor_model_parallel_world_size(tensor_model_parallel_size)
    set_tensor_model_parallel_rank(state.model_parallel_rank)

    # Pipeline model parallelism is not implemented in NeMo yet, so the
    # pipeline dimension is pinned to a single stage.
    set_pipeline_model_parallel_rank(0)
    set_pipeline_model_parallel_world_size(1)

    _set_random_seed(seed)

    # Flag checked by callers to avoid initializing twice.
    state._is_megatron_initialized = True
Example #2
0
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        """Initialize megatron state if needed and compile the dataset helpers.

        Args:
            cfg: configuration object; ``tensor_model_parallel_size``
                (default 1) and ``seed`` (default 1234) are read from it.
            trainer: PyTorch Lightning trainer supplying ``global_rank``,
                ``world_size`` and ``local_rank``.

        Raises:
            ImportError: if the megatron dataset C++ helpers cannot be
                compiled or the ``helpers`` module cannot be imported.
        """
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                "Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            # Read both settings from the ``cfg`` argument: nothing in this
            # constructor has assigned ``self.cfg`` at this point.
            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=cfg.get('seed', 1234),
            )

        try:
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            compile_helper()
            logging.info('Megatron dataset helper compiled successfully.')
            # Imported only to verify that the compiled helpers are importable.
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError as err:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError(
                'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            ) from err
Example #3
0
def convert(local_rank, rank, world_size, args):
    """Load a model-parallel Megatron checkpoint and save it as a .nemo file.

    Args:
        local_rank: local rank of this process (only used in the log message).
        rank: global rank of this process (only used in the log message).
        world_size: total number of processes; divided by
            ``args.gpus_per_node`` to obtain the number of nodes.
        args: parsed arguments providing ``gpus_per_node``, ``bcp``,
            ``pipeline_model_parallel_size``, ``tensor_model_parallel_size``,
            ``checkpoint_folder``, ``checkpoint_name``, ``model_type``,
            ``hparams_file`` and ``nemo_file_path``.

    Raises:
        ValueError: if ``args.model_type`` is not one of
            ``'gpt'``, ``'bert'``, ``'t5'`` or ``'nmt'``.
    """
    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node

    # The two trainer variants differ only in the extra environment plugin.
    trainer_kwargs = {
        'devices': args.gpus_per_node,
        'num_nodes': num_nodes,
        'accelerator': 'gpu',
    }
    if args.bcp:
        trainer_kwargs['plugins'] = [TorchElasticEnvironment()]
    trainer = Trainer(**trainer_kwargs)

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank(
    )
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank(
    )

    # inject model parallel rank
    checkpoint_path = inject_model_parallel_rank(
        os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    # Pick the model class; every supported type is loaded the same way.
    if args.model_type == 'gpt':
        model_cls = MegatronGPTModel
    elif args.model_type == 'bert':
        model_cls = MegatronBertModel
    elif args.model_type == 't5':
        model_cls = MegatronT5Model
    elif args.model_type == 'nmt':
        model_cls = MegatronNMTModel
    else:
        # Without this branch an unknown type would surface later as an
        # unbound ``model`` variable instead of a clear error.
        raise ValueError(
            f"Unsupported model_type: {args.model_type!r}. "
            "Expected one of 'gpt', 'bert', 't5', 'nmt'.")

    model = model_cls.load_from_checkpoint(checkpoint_path,
                                           hparams_file=args.hparams_file,
                                           trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()

    # Wait for every rank to finish loading before saving.
    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #4
0
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        """Initialize megatron model-parallel state if it has not been set up yet.

        Args:
            cfg: configuration object; ``tensor_model_parallel_size``
                (default 1) and ``seed`` (default 1234) are read from it.
            trainer: PyTorch Lightning trainer supplying ``global_rank``,
                ``world_size`` and ``local_rank``.
        """
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                "Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            # Read both settings from the ``cfg`` argument: nothing in this
            # constructor has assigned ``self.cfg`` at this point.
            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=cfg.get('seed', 1234),
            )
Example #5
0
    def restore_from(
        cls,
        restore_path: str,
        override_config_path: Optional[Union[OmegaConf, str]] = None,
        map_location: Optional[torch.device] = None,
        strict: bool = True,
        return_config: bool = False,
        trainer: Trainer = None,
        save_restore_connector: SaveRestoreConnector = None,
    ):
        """
        Restores model instance (weights and configuration) from .nemo file.

        Args:
            restore_path: path to .nemo file from which model should be instantiated
            override_config_path: path to a yaml config that will override the internal
                config file or an OmegaConf / DictConfig object representing the model config.
            map_location: Optional torch.device() to map the instantiated model to a device.
                By default (None), it will select a GPU if available, falling back to CPU otherwise.
            strict: Passed to load_state_dict. Set to True by default.
            return_config: If set to true, will return just the underlying config of the restored
                model as an OmegaConf DictConfig object without instantiating the model.
            trainer: PyTorch Lightning trainer. Must be passed in order to use model parallel .nemo
            save_restore_connector: connector used to perform the restore. A default
                SaveRestoreConnector is created when None.

            Example:
                ```
                model = nemo.collections.nlp.models.TokenClassificationModel.restore_from('token_classification.nemo')
                assert isinstance(model, nemo.collections.nlp.models.TokenClassificationModel)
                ```

        Returns:
            An instance of type cls or its underlying config (if return_config is set).

        Raises:
            FileNotFoundError: if restore_path does not exist.
            ValueError: when restoring a model parallel .nemo file without a Trainer,
                without a megatron checkpoint version, or with trainer.local_rank set to None.
        """
        if save_restore_connector is None:
            save_restore_connector = SaveRestoreConnector()

        if not os.path.exists(restore_path):
            raise FileNotFoundError(f"Can't find {restore_path}")

        app_state = AppState()
        app_state.model_restore_path = os.path.abspath(
            os.path.expanduser(restore_path))

        # Bound up front: the model parallel branch below reads it whenever
        # app_state.model_parallel_size is not None, which may be the case even
        # if this archive contains no mp_rank entries.
        checkpoint_version = None

        # detect if we have a model parallel .nemo file
        # The context manager closes the archive even if inspection fails.
        with tarfile.open(app_state.model_restore_path, "r:gz") as tar:
            members = tar.getmembers()
            mp_ranks = [
                member.name for member in members if 'mp_rank' in member.name
            ]
            if mp_ranks:
                app_state.model_parallel_size = len(
                    mp_ranks
                ) // 2  # directory and file are both listed in the archive

                # get checkpoint version (the last matching member wins)
                checkpoint_version_member = None
                for member in members:
                    if 'megatron_checkpoint_version.json' in member.name:
                        checkpoint_version_member = member

                if checkpoint_version_member is not None:
                    # Read the JSON straight from the archive: no extraction
                    # to disk and no change of the process working directory.
                    version_file = tar.extractfile(checkpoint_version_member)
                    if version_file is not None:
                        with version_file:
                            checkpoint_version = json.load(version_file).get(
                                'checkpoint_version', None)
                logging.info(
                    (f'Detected model parallel .nemo file: {restore_path}. '
                     f'Assuming megatron model parallelism with '
                     f'model_parallel_size: {app_state.model_parallel_size} '
                     f'and checkpoint version: {checkpoint_version}'))

        if app_state.model_parallel_size is not None:
            if not isinstance(trainer, Trainer):
                raise ValueError(
                    "trainer must be a PyTorch Lightning Trainer to restore model parallel .nemo files."
                )

            if checkpoint_version is None:
                raise ValueError(
                    "Restoring from megatron model parallel .nemo but could not find megatron checkpoint version."
                )
            else:
                logging.info(
                    f"Setting megatron checkpoint version: {checkpoint_version}"
                )
                set_checkpoint_version(checkpoint_version)

            app_state.world_size = trainer.num_gpus * trainer.num_nodes

            if trainer.local_rank is not None:
                app_state.local_rank = trainer.local_rank
            else:
                raise ValueError(
                    "trainer.local_rank is None. local_rank needed to restore model parallel models."
                )

            model_parallel_rank = compute_model_parallel_rank(
                trainer.local_rank, app_state.model_parallel_size)
            app_state.model_parallel_rank = model_parallel_rank

            cls.update_save_restore_connector(save_restore_connector)
            restored_model = cls._save_restore_connector.restore_from(
                cls, app_state.model_restore_path, override_config_path,
                map_location, strict, return_config)
            # With return_config the documented result is the config, not a
            # model, so there is nothing to attach the trainer to.
            if not return_config:
                restored_model.set_trainer(trainer)
            return restored_model
        else:
            return super().restore_from(
                app_state.model_restore_path,
                override_config_path,
                map_location,
                strict,
                return_config,
                save_restore_connector=save_restore_connector,
            )
Example #6
0
def convert(local_rank, rank, world_size, args):
    """Convert a Megatron checkpoint into a NeMo checkpoint and/or .nemo file.

    Args:
        local_rank: local rank of this process (not used by this function).
        rank: global rank of this process; used to derive the pipeline rank.
        world_size: total number of processes; used to derive the number of
            nodes and the expected pipeline model parallel size.
        args: parsed arguments providing ``tensor_model_parallel_size``,
            ``pipeline_model_parallel_size``, ``gpus_per_node``,
            ``checkpoint_folder``, ``checkpoint_name``, ``model_type``,
            ``hparams_file``, ``output_ckpt_file_path`` and ``nemo_file_path``.

    Raises:
        NotImplementedError: if ``args.model_type`` is neither ``'gpt'`` nor
            ``'bert'``.
        AssertionError: if the parallel sizes or ranks derived here disagree
            with ``args`` or with the initialized parallel state.
    """
    app_state = AppState()
    app_state.data_parallel_rank = 0
    tensor_model_parallel_size = args.tensor_model_parallel_size
    num_nodes = world_size // args.gpus_per_node
    pipeline_model_parallel_size = world_size // args.tensor_model_parallel_size
    assert args.pipeline_model_parallel_size == pipeline_model_parallel_size

    trainer = Trainer(devices=args.gpus_per_node,
                      accelerator='gpu',
                      num_nodes=num_nodes)

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank(
    )
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank(
    )

    pipeline_rank = rank // tensor_model_parallel_size
    tensor_rank = app_state.tensor_model_parallel_rank
    assert pipeline_rank == app_state.pipeline_model_parallel_rank

    # The checkpoint sub-directory name depends on which kinds of model
    # parallelism are in use.
    if tensor_model_parallel_size is not None and tensor_model_parallel_size > 1 and pipeline_model_parallel_size == 1:
        # tensor parallel only: mp_rank_XX
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{tensor_rank:02d}',
                                       args.checkpoint_name)
    elif tensor_model_parallel_size is not None and pipeline_model_parallel_size > 1:
        # pipeline parallel: mp_rank_XX_YYY
        checkpoint_path = os.path.join(
            args.checkpoint_folder,
            f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}',
            args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)
    logging.info(f"loading checkpoint {checkpoint_path}")

    if args.model_type == 'gpt':
        model_cls = MegatronGPTModel
    elif args.model_type == 'bert':
        model_cls = MegatronBertModel
    else:
        # NotImplemented is a comparison sentinel, not an exception class;
        # NotImplementedError is the exception to raise here.
        raise NotImplementedError("{} is not supported".format(
            args.model_type))

    # This dictionary is used to rename the model parameters; the same
    # mapping applies to both supported model types.
    name_translate = {
        'transformer': 'encoder',
        '.attention.': '.self_attention.',
        # nemo megatron doesn't have _for_head key
        'word_embeddings_for_head': 'word_embeddings',
    }
    checkpoint, consumed, steps, _version = load_from_checkpoint(
        model_cls,
        checkpoint_path,
        hparams_file=args.hparams_file,
        trainer=trainer,
        translator=name_translate,
        strict=False,
    )

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if args.output_ckpt_file_path:
        output_path = pathlib.Path(args.output_ckpt_file_path)
        base_dir = output_path.parent
        # The file name may contain {consumed} / {steps} placeholders; missing
        # values are rendered as 0.
        content = {
            'consumed': consumed if consumed is not None else 0,
            'steps': steps if steps is not None else 0,
        }
        filename = output_path.name.format(**content) + '.ckpt'
        checkpoint_path_output = inject_model_parallel_rank(
            os.path.join(base_dir, filename))
        trainer.accelerator.training_type_plugin.checkpoint_io.save_checkpoint(
            checkpoint, checkpoint_path_output)
        logging.info(
            f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}'
        )

    if args.nemo_file_path:
        # model_cls was validated above, so no second type check is needed.
        model = load_model(model_cls,
                           checkpoint,
                           strict=False,
                           trainer=trainer)

        # verify tensor parallel rank id and pipeline parallel rank id matches
        assert app_state.data_parallel_size == 1
        assert app_state.tensor_model_parallel_size == tensor_model_parallel_size
        assert app_state.tensor_model_parallel_rank == tensor_rank
        assert app_state.pipeline_model_parallel_size == pipeline_model_parallel_size
        assert app_state.pipeline_model_parallel_rank == pipeline_rank
        model._save_restore_connector = NLPSaveRestoreConnector()
        model.save_to(args.nemo_file_path)
        logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #7
0
def get_megatron_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
) -> Tuple[MegatronBertEncoder, str]:
    """
    Returns MegatronBertEncoder and a default or user specified path to the checkpoint file

    Args:
        pretrained_model_name: model name from MEGATRON_CONFIG_MAP
            for example: megatron-bert-cased
        config_dict: model configuration parameters
        config_file: path to model configuration file. Takes precedence over config_dict if both supplied.
        checkpoint_file: path to checkpoint file or directory if using model parallel.

    Returns:
        model: MegatronBertEncoder
        checkpoint_file: path to checkpoint file or directory

    Raises:
        ValueError: if neither config_file nor config_dict is given and
            pretrained_model_name is not a known model, or if no config
            could be determined.
    """
    config = None
    # get default config and checkpoint
    if config_file:
        with open(config_file) as f:
            raw_config = json.load(f)
        # replace dashes with underscores in config keys
        fixed_config = {}
        for key, value in raw_config.items():
            fixed_key = key.replace("-", "_")
            if fixed_key == 'max_seq_length':
                fixed_key = 'max_position_embeddings'
            fixed_config[fixed_key] = value
        # 'vocab_size' is no longer used.
        fixed_config.pop('vocab_size', None)
        config = fixed_config
    elif config_dict:
        config = config_dict
    elif pretrained_model_name in get_megatron_lm_models_list():
        config = get_megatron_config(pretrained_model_name)
    else:
        raise ValueError(f"{pretrained_model_name} is not supported")

    if config is None:
        raise ValueError(
            f"config_file or config_dict is required for {pretrained_model_name}"
        )

    if not checkpoint_file:
        checkpoint_file = get_megatron_checkpoint(pretrained_model_name)

    vocab = get_megatron_vocab_file(pretrained_model_name)

    # if checkpoint path is a directory, then we automatically compute model parallel size,
    # and model parallel rank
    if os.path.isdir(checkpoint_file):
        app_state = AppState()
        # One directory entry is assumed per model parallel rank.
        model_parallel_size = len(os.listdir(checkpoint_file))
        app_state.model_parallel_size = model_parallel_size
        logging.info((f'restore_path: {checkpoint_file} is a directory. '
                      f'Assuming megatron model parallelism with '
                      f'model_parallel_size: {model_parallel_size}'))
        # try to get local rank from the environment
        local_rank = None
        try:
            local_rank = int(os.environ['LOCAL_RANK'])
        except (KeyError, ValueError):
            # Only a missing or non-integer LOCAL_RANK is tolerated; anything
            # else (e.g. KeyboardInterrupt) must propagate.
            logging.info('Global variable LOCAL_RANK not yet specified')
        if local_rank is not None:
            app_state.local_rank = local_rank
        else:
            # if local is None then we are on the main process
            local_rank = 0
        model_parallel_rank = compute_model_parallel_rank(
            local_rank, model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank
    else:
        model_parallel_size = None
        model_parallel_rank = None

    model = MegatronBertEncoder(
        model_name=pretrained_model_name,
        config=config,
        vocab_file=vocab,
        model_parallel_size=model_parallel_size,
        model_parallel_rank=model_parallel_rank,
    )

    return model, checkpoint_file