Exemple #1
0
    def init_model_parallel(self, global_rank: int, world_size: int) -> None:
        """ Initializes Megatron-LM model parallel if using model parallelism.

        Args:
            global_rank (int): the global process index.
            world_size (int): the total number of GPUs, num_nodes * num_gpus
            is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
        """
        app_state = AppState()

        # we initialize megatron-lm model parallel and data parallel groups
        # after initializing DDP with PTL.
        if app_state.model_parallel_size is not None:
            if torch.distributed.is_initialized():
                parallel_state.initialize_model_parallel(
                    app_state.model_parallel_size)
                app_state.model_parallel_group = parallel_state.get_tensor_model_parallel_group(
                )
                app_state.data_parallel_group = parallel_state.get_data_parallel_group(
                )
                app_state.model_parallel_rank = parallel_state.get_tensor_model_parallel_rank(
                )
                app_state.data_parallel_rank = parallel_state.get_data_parallel_rank(
                )
                app_state.data_parallel_size = parallel_state.get_data_parallel_world_size(
                )
                logging.info(f'mp_rank: {app_state.model_parallel_rank}')
                logging.info(f'dp_rank: {app_state.data_parallel_rank}')
Exemple #2
0
    def init_model_parallel(self, global_rank: int, world_size: int) -> None:
        """ Initializes Megatron-LM model parallel if using model parallelism.

        Args:
            global_rank (int): the global process index.
            world_size (int): the total number of GPUs, num_nodes * num_gpus
            is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
        """
        app_state = AppState()

        # we initialize megatron-lm model parallel and data parallel groups
        # after initializing DDP with PTL.
        if app_state.model_parallel_size is not None:
            if torch.distributed.is_initialized():
                mpu.initialize_model_parallel(app_state.model_parallel_size)
                app_state.model_parallel_group = mpu.get_model_parallel_group()
                app_state.data_parallel_group = mpu.get_data_parallel_group()
                app_state.model_parallel_rank = mpu.get_tensor_model_parallel_rank(
                )
                app_state.data_parallel_rank = mpu.get_data_parallel_rank()
                app_state.data_parallel_size = mpu.get_data_parallel_world_size(
                )
                logging.info(f'mp_rank: {app_state.model_parallel_rank}')
                logging.info(f'dp_rank: {app_state.data_parallel_rank}')
                # TODO: get random seed from PTL
                seed = os.environ.get("PL_GLOBAL_SEED", 1234)
                # random seed must be set for megatron model parallel init
                _set_random_seed(seed)
Exemple #3
0
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL For an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank

    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{rank:02d}',
                                       args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)

    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Exemple #4
0
    def init_ddp_connection(self,
                            global_rank: int,
                            world_size: int,
                            is_slurm_managing_tasks: bool = True) -> None:
        """ Override for LightningModule DDP initialization.
            Initializes Megatron-LM model parallel if using model parallelism.

        Args:
            global_rank (int): the global process index.
            world_size (int): the total number of GPUs, num_nodes * num_gpus
            is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
        """
        LightningModule.init_ddp_connection(self, global_rank, world_size,
                                            is_slurm_managing_tasks)

        app_state = AppState()

        # we initialize megatron-lm model parallel and data parallel groups
        # after initializing DDP with PTL.
        if app_state.model_parallel_size is not None:
            if app_state.model_parallel_group is None:
                mpu.initialize_model_parallel(app_state.model_parallel_size)
                app_state.model_parallel_group = mpu.get_model_parallel_group()
                app_state.data_parallel_group = mpu.get_data_parallel_group()
                app_state.model_parallel_rank = torch.distributed.get_rank(
                    group=app_state.model_parallel_group)
                app_state.data_parallel_rank = torch.distributed.get_rank(
                    group=app_state.data_parallel_group)
                logging.info(f'mp_rank: {app_state.model_parallel_rank}')
                logging.info(f'dp_rank: {app_state.data_parallel_rank}')
Exemple #5
0
def convert(local_rank, rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node
    if args.bcp:
        trainer = Trainer(devices=args.gpus_per_node,
                          num_nodes=num_nodes,
                          accelerator='gpu',
                          plugins=[TorchElasticEnvironment()])
    else:
        trainer = Trainer(devices=args.gpus_per_node,
                          num_nodes=num_nodes,
                          accelerator='gpu')

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank(
    )
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank(
    )

    # inject model parallel rank
    checkpoint_path = inject_model_parallel_rank(
        os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'nmt':
        model = MegatronNMTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL For an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank

    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{rank:02d}',
                                       args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)

    if args.model_type == 'gpt':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        model = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        model = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        raise NotImplemented("{} is not supported".format(args.model_type))

    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Exemple #7
0
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL For an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank
    checkpoint_path = os.path.join(args.checkpoint_folder, f'mp_rank_{rank:02d}', args.checkpoint_name)
    model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()
    model.save_to(args.nemo_file_path)
    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Exemple #8
0
    def init_model_parallel(self, global_rank: int, world_size: int) -> None:
        """ Initializes Megatron-LM model parallel if using model parallelism.

        Args:
            global_rank (int): the global process index.
            world_size (int): the total number of GPUs, num_nodes * num_devices
            is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
        """
        app_state = AppState()

        # we initialize megatron-lm model parallel and data parallel groups
        # after initializing DDP with PTL.
        if app_state.model_parallel_size is not None:
            # destroy groups in case they have already been created
            # this happens with multiple calls to trainer.test for example
            parallel_state.destroy_model_parallel()
            if torch.distributed.is_initialized():
                parallel_state.initialize_model_parallel(
                    tensor_model_parallel_size_=app_state.
                    tensor_model_parallel_size,
                    pipeline_model_parallel_size_=app_state.
                    pipeline_model_parallel_size,
                    pipeline_model_parallel_split_rank_=app_state.
                    pipeline_model_parallel_split_rank,
                )

                # assert that fake tp and pp rank match after model parallel init
                assert app_state.tensor_model_parallel_rank == parallel_state.get_tensor_model_parallel_rank(
                )
                assert app_state.pipeline_model_parallel_rank == parallel_state.get_pipeline_model_parallel_rank(
                )

                app_state.tensor_model_parallel_group = parallel_state.get_tensor_model_parallel_group(
                )
                app_state.data_parallel_group = parallel_state.get_data_parallel_group(
                )
                app_state.data_parallel_rank = parallel_state.get_data_parallel_rank(
                )
                app_state.data_parallel_size = parallel_state.get_data_parallel_world_size(
                )
                app_state.pipeline_model_parallel_group = parallel_state.get_pipeline_model_parallel_group(
                )
Exemple #9
0
def convert(local_rank, rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    tensor_model_parallel_size = args.tensor_model_parallel_size
    num_nodes = world_size // args.gpus_per_node
    pipeline_model_parallel_size = world_size // args.tensor_model_parallel_size
    assert args.pipeline_model_parallel_size == pipeline_model_parallel_size

    trainer = Trainer(devices=args.gpus_per_node,
                      accelerator='gpu',
                      num_nodes=num_nodes)

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank(
    )
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank(
    )

    pipeline_rank = rank // tensor_model_parallel_size
    tensor_rank = app_state.tensor_model_parallel_rank
    assert pipeline_rank == app_state.pipeline_model_parallel_rank

    if tensor_model_parallel_size is not None and tensor_model_parallel_size > 1 and pipeline_model_parallel_size == 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{tensor_rank:02d}',
                                       args.checkpoint_name)
    elif tensor_model_parallel_size is not None and pipeline_model_parallel_size > 1:
        checkpoint_path = os.path.join(
            args.checkpoint_folder,
            f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}',
            args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)
    logging.info(f"loading checkpoint {checkpoint_path}")

    if args.model_type == 'gpt':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        raise NotImplemented("{} is not supported".format(args.model_type))

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if args.output_ckpt_file_path:
        filepath = args.output_ckpt_file_path
        base_dir = pathlib.Path(filepath).parent
        filename_str = pathlib.Path(filepath).name
        suffix = '.ckpt'
        content = {}
        if consumed is not None:
            content['consumed'] = consumed
        else:
            content['consumed'] = 0
        if steps is not None:
            content['steps'] = steps
        else:
            content['steps'] = 0
        filename = filename_str.format(**content) + suffix
        checkpoint_path_output = inject_model_parallel_rank(
            os.path.join(base_dir, filename))
        trainer.accelerator.training_type_plugin.checkpoint_io.save_checkpoint(
            checkpoint, checkpoint_path_output)
        logging.info(
            f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}'
        )

    if args.nemo_file_path:
        if args.model_type == 'gpt':
            model = load_model(MegatronGPTModel,
                               checkpoint,
                               strict=False,
                               trainer=trainer)
        elif args.model_type == 'bert':
            model = load_model(MegatronBertModel,
                               checkpoint,
                               strict=False,
                               trainer=trainer)
        else:
            raise NotImplemented("{} is not supported".format(args.model_type))

        # verify tensor parallel rank id and pipeline parallel rank id matches
        assert app_state.data_parallel_size == 1
        assert app_state.tensor_model_parallel_size == tensor_model_parallel_size
        assert app_state.tensor_model_parallel_rank == tensor_rank
        assert app_state.pipeline_model_parallel_size == pipeline_model_parallel_size
        assert app_state.pipeline_model_parallel_rank == pipeline_rank
        model._save_restore_connector = NLPSaveRestoreConnector()
        model.save_to(args.nemo_file_path)
        logging.info(f'NeMo model saved to: {args.nemo_file_path}')