Code Example #1
# Assumed imports for this snippet; the module paths follow NeMo's layout but may vary by version.
import torch

from nemo.utils import logging
from nemo.utils.app_state import AppState


def split_partition(model, partitions, tp_size, write_path=None):
    if len(partitions) != 1:
        raise ValueError(
            "Can only split partitions of model with TP=1. For partitions of models with TP>1, merge first."
        )

    if tp_size < 1:
        raise ValueError("TP size must to be >= 1.")

    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.model_parallel_size = tp_size
    app_state.model_parallel_rank = tp_size - 1

    idx = 0
    splits = []
    for _, param in model.named_parameters():
        if param.shape == partitions[0][idx].shape:
            split = [partitions[0][idx].data] * tp_size
        elif param.shape[0] == partitions[0][idx].shape[0]:
            split = torch.split(partitions[0][idx].data,
                                param.shape[-1],
                                dim=-1)
        else:
            split = torch.split(partitions[0][idx].data, param.shape[0], dim=0)
        splits.append(split)
        idx += 1

    for i in range(tp_size - 1, -1, -1):
        app_state.model_parallel_rank = i

        idx = 0
        for name, param in model.named_parameters():
            split_val = splits[idx][i]

            if param.shape != split_val.shape:
                logging.warning(
                    f"Shape mismatch for parameter {name}: required shape {param.shape}, split shape {split_val.shape}. Padding to match the required size."
                )

                if split_val.shape[1:] == param.shape[1:]:
                    pad = [0, 0] * len(split_val.shape)
                    pad[-1] = param.shape[0] - split_val.shape[0]
                    split_val = torch.nn.functional.pad(
                        split_val, pad, 'constant')
                elif split_val.shape[:-1] == param.shape[:-1]:
                    pad = [0, param.shape[-1] - split_val.shape[-1]]
                    split_val = torch.nn.functional.pad(
                        split_val, pad, 'constant')
                else:
                    raise RuntimeError(
                        f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}."
                    )

            param.data = split_val
            idx += 1

        if write_path is not None:
            model.save_to(write_path)
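
For context, a minimal usage sketch of split_partition (the way partitions is built and the target path below are hypothetical, not from the source):

# Hypothetical driver: collect the single TP=1 partition from a loaded model,
# then write out tp_size tensor-parallel shards of it.
partitions = [[param.data for _, param in model.named_parameters()]]
split_partition(model, partitions, tp_size=2, write_path="model_tp2.nemo")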
Code Example #2
File: megatron_gpt_test.py, Project: quuhua911/NeMo
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    trainer = None
    if cfg.trainer.precision == 16:
        trainer = Trainer(
            plugins=[
                NLPDDPPlugin(),
                NLPNativeMixedPrecisionPlugin(
                    init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                    growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                ),
            ],
            **cfg.trainer,
        )
    elif cfg.trainer.precision == 'bf16':
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPNativeBfloat16PrecisionPlugin()], **cfg.trainer)
    else:
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPPrecisionPlugin()], **cfg.trainer)

    app_state = AppState()
    app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(
        cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
    )

    # Note: most NeMo models must have their data paths configured before the model is instantiated.
    # MegatronGPTModel sets up its data in the PTL .setup() method, which runs after DDP spawns.
    model.cfg.data.splits_string = cfg.model.data.splits_string

    trainer.test(model)
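
Scripts like this are normally launched through NeMo's Hydra wrapper. A plausible entry point, following common NeMo convention (the config path and config name here are assumptions, not taken from the source):

from nemo.core.config import hydra_runner

@hydra_runner(config_path="conf", config_name="megatron_gpt_config")  # hypothetical config name
def main(cfg) -> None:
    ...

if __name__ == '__main__':
    main()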
Code Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        default="",
                        required=True,
                        help="Pass path to model's .nemo file")
    parser.add_argument("--prompt",
                        type=str,
                        default="",
                        required=True,
                        help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate",
                        type=int,
                        default="16",
                        required=False,
                        help="How many tokens to add to prompt")
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        required=True,
    )

    args = parser.parse_args()

    torch.set_grad_enabled(False)

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(),
                      devices=args.tensor_model_parallel_size,
                      precision=16,
                      accelerator='gpu')

    app_state = AppState()
    if args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    model = MegatronT5Model.restore_from(restore_path=args.model_file,
                                         trainer=trainer)
    model.freeze()
    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
    }

    dataset = T5RequestDataset(request, model.tokenizer)

    request_dl = DataLoader(dataset)

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
Code Example #4
File: sgd_gen.py, Project: quuhua911/NeMo
def main(cfg: DictConfig) -> None:
    pl.seed_everything(42)
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    plugin = NLPDDPPlugin()
    trainer = pl.Trainer(**cfg.trainer, plugins=plugin)

    exp_manager(trainer, cfg.get("exp_manager", None))

    app_state = AppState()
    if cfg.model.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    if 'bert' in cfg.model.language_model.pretrained_model_name:
        if cfg.model.dataset.task == 'sgd':
            model_class = SGDQAModel
        else:
            model_class = IntentSlotClassificationModel
    elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
        model_class = DialogueGPTModel
    else:
        # Guard against falling through with model_class unbound.
        raise ValueError(
            f'Unsupported pretrained_model_name: {cfg.model.language_model.pretrained_model_name}')

    if cfg.pretrained_model or (cfg.model.nemo_path
                                and os.path.exists(cfg.model.nemo_path)):
        if cfg.pretrained_model:
            logging.info(f'Loading pretrained model {cfg.pretrained_model}')
            model = model_class.from_pretrained(cfg.pretrained_model)
        else:
            logging.info(f'Restoring model from {cfg.model.nemo_path}')
            model = model_class.restore_from(cfg.model.nemo_path)
        if cfg.do_training:
            model.setup_training_data(train_data_config=cfg.model.train_ds)
            model.setup_multiple_validation_data(
                val_data_config=cfg.model.validation_ds)
    else:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = model_class(cfg.model, trainer=trainer)

    if cfg.do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    else:
        data_dir = cfg.model.dataset.get('data_dir', None)
        dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir',
                                                      None)

        if data_dir is None or dialogues_example_dir is None:
            raise ValueError(
                'No dataset directory provided; cannot run evaluation.')
        elif not os.path.exists(data_dir):
            raise ValueError(
                f'{data_dir} not found; cannot run evaluation on the test set.')
        else:
            model.update_data_dirs(data_dir=data_dir,
                                   dialogues_example_dir=dialogues_example_dir)
            model._cfg.dataset = cfg.model.dataset

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
        trainer = pl.Trainer(devices=1,
                             accelerator=cfg.trainer.accelerator,
                             plugins=plugin,
                             precision=16)
        model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
        if model.prepare_test(trainer):
            trainer.test(model)
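
The cfg.model.dataset.get(...) calls above work because OmegaConf's DictConfig supports dict-style .get with a default. A minimal illustration:

from omegaconf import OmegaConf

cfg = OmegaConf.create({"model": {"dataset": {"data_dir": "/data/sgd"}}})
print(cfg.model.dataset.get("data_dir", None))               # /data/sgd
print(cfg.model.dataset.get("dialogues_example_dir", None))  # None: a missing key falls back to the default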
Code Example #5
def main():
    parser = ArgumentParser()
    parser.add_argument("--use_soft_prompts", action="store_true", help="Use model's existing soft prompts")
    parser.add_argument("--model_file", type=str, default="", required=True, help="Pass path to model's .nemo file")
    parser.add_argument(
        "--path_to_file", type=str, default="", required=False, help="Path to file with prompts (a text to complete)"
    )
    parser.add_argument(
        "--prompt", type=str, default="", required=False, help="Prompt for the model (a text to complete)"
    )
    parser.add_argument(
        "--prompt_tag", type=str, default="", required=False, help="Prompt tag string for task specific soft prompt"
    )
    parser.add_argument(
        "--tokens_to_generate", type=int, default="1", required=False, help="How many tokens to add to prompt"
    )
    parser.add_argument(
        "--stop_after_sentence",
        type=bool,
        default=True,  # note: argparse's type=bool treats any non-empty string as True
        required=False,
        help="True/False: whether to stop after a full sentence has been generated.",
    )
    parser.add_argument(
        "--tensor_model_parallel_size", type=int, default=1, required=False,
    )
    parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag")
    parser.add_argument("--batch_size", default=1, required=False, help="Evaluation batch_size")
    parser.add_argument(
        "--compute_logprobs", type=bool, default=False, required=False, help="Whether to compute log probabilities"
    )

    args = parser.parse_args()

    # cast precision to int if 32 or 16
    if args.precision in ["32", "16"]:
        args.precision = int(float(args.precision))

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), gpus=args.tensor_model_parallel_size, precision=args.precision)

    app_state = AppState()
    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(restore_path=args.model_file, trainer=trainer)
    model.freeze()

    def pad_collate(batch):
        tokens, tokens_to_generate = batch[0]['data'], batch[0]['tokens_to_generate']
        compute_logprobs = batch[0]['compute_logprobs']
        lens = [len(token) for token in tokens]

        tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=50256)
        data = []

        if 'prompt_tags' in batch[0]:
            # Keep track of soft prompt tags
            prompt_tags = batch[0]['prompt_tags']

            for token, lenn, prompt_tag in zip(tokens_pad.T, lens, prompt_tags):
                data.append((token, lenn, tokens_to_generate, compute_logprobs, prompt_tag))
        else:
            for token, lenn in zip(tokens_pad.T, lens):
                data.append((token, lenn, tokens_to_generate, compute_logprobs))

        return data

    # defining the type of request
    if args.path_to_file != "":
        request = []

        with open(args.path_to_file, 'r') as prompts:
            for prompt in prompts.readlines():
                prompt = prompt.split('\n')[0]

                if args.use_soft_prompts and model.use_soft_prompts:
                    prompt = json.loads(prompt)

                request.append(prompt)

        dataset = GPTRequestDataset(request, model.tokenizer, args.tokens_to_generate, args.compute_logprobs)
        request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=int(args.batch_size))

    else:
        if args.use_soft_prompts and model.use_soft_prompts:
            request = [{'prompt_tag': args.prompt_tag, 'text': args.prompt}]
        else:
            request = [args.prompt]

        dataset = GPTRequestDataset(request, model.tokenizer, args.tokens_to_generate, args.compute_logprobs)
        request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=1)

    # For GPT models that have had soft prompt tuning but you don't want to use any soft prompts
    if not args.use_soft_prompts and model.use_soft_prompts:
        model.use_soft_prompts = False

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
Code Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        default="",
                        required=True,
                        help="Pass path to model's .nemo file")
    parser.add_argument("--prompt",
                        type=str,
                        default="",
                        required=True,
                        help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate",
                        type=int,
                        default="64",
                        required=False,
                        help="How many tokens to add to prompt")
    parser.add_argument(
        "--stop_after_sentence",
        type=bool,
        default=True,  # note: argparse's type=bool treats any non-empty string as True
        required=False,
        help="True/False: whether to stop after a full sentence has been generated.",
    )
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        required=True,
    )
    parser.add_argument("--precision",
                        default=32,
                        help="PyTorch Lightning Trainer precision flag")

    args = parser.parse_args()

    # cast precision to int if 32 or 16
    if args.precision in ["32", "16"]:
        args.precision = int(float(args.precision))

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(),
                      gpus=args.tensor_model_parallel_size,
                      precision=args.precision)

    app_state = AppState()
    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(restore_path=args.model_file,
                                          trainer=trainer)

    model.freeze()

    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
        "stop_after_sentence": args.stop_after_sentence,
    }

    dataset = GPTRequestDataset(request, model.tokenizer)

    request_dl = DataLoader(dataset)

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response[0]['completion']['text'])
    print("***************************")
    logging.info(
        f"Generation stopped because: {response[0]['completion']['stop reason']}"
    )
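
As the comments on the --stop_after_sentence arguments above note, argparse's type=bool is a known pitfall: bool() is applied to the raw string, and every non-empty string is truthy. A short demonstration:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--flag", type=bool, default=True)
print(parser.parse_args(["--flag", "False"]).flag)  # True, because bool("False") is truthy
print(parser.parse_args(["--flag", ""]).flag)       # False, the empty string is the only falsy input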
Code Example #7
def main(cfg: DictConfig) -> None:
    pl.seed_everything(42)
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    try:
        plugin = NLPDDPPlugin()
    except (ImportError, ModuleNotFoundError):
        plugin = None

    trainer = pl.Trainer(**cfg.trainer, plugins=plugin)

    exp_manager(trainer, cfg.get("exp_manager", None))

    app_state = AppState()
    if cfg.model.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    if 'bert' in cfg.model.language_model.pretrained_model_name:
        if cfg.model.dataset.task == 'sgd':
            if cfg.model.original_nemo_checkpoint is not None:
                model_class = DialogueZeroShotIntentModel
            else:
                model_class = SGDQAModel
        elif cfg.model.dataset.task in ['zero_shot', 'design']:
            model_class = DialogueZeroShotIntentModel
        else:
            model_class = IntentSlotClassificationModel
    elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
        if cfg.model.dataset.task in ['ms_marco', 'mellon_qa']:
            model_class = DialogueGPTGenerationModel
        else:
            model_class = DialogueGPTClassificationModel
    elif ('bart' in cfg.model.language_model.pretrained_model_name.lower()
          or 't5' in cfg.model.language_model.pretrained_model_name.lower()):
        # please use bf16/32 with t5-large and above
        # see https://github.com/huggingface/transformers/pull/10956
        model_class = DialogueS2SGenerationModel
    elif 'sentence-transformers' in cfg.model.language_model.pretrained_model_name.lower():
        model_class = DialogueNearestNeighbourModel
    else:
        # Guard against falling through with model_class unbound.
        raise ValueError(
            f'Unsupported pretrained_model_name: {cfg.model.language_model.pretrained_model_name}')

    if cfg.pretrained_model or (cfg.model.nemo_path
                                and os.path.exists(cfg.model.nemo_path)):
        if cfg.pretrained_model:
            logging.info(f'Loading pretrained model {cfg.pretrained_model}')
            model = model_class.from_pretrained(cfg.pretrained_model)
        else:
            logging.info(f'Restoring model from {cfg.model.nemo_path}')
            model = model_class.restore_from(cfg.model.nemo_path)

        if cfg.do_training:
            model.setup_training_data(train_data_config=cfg.model.train_ds)
            model.setup_multiple_validation_data(
                val_data_config=cfg.model.validation_ds)
    else:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = model_class(cfg.model, trainer=trainer)

    if cfg.do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    else:
        data_dir = cfg.model.dataset.get('data_dir', None)
        dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir',
                                                      None)

        if data_dir is None or dialogues_example_dir is None:
            raise ValueError(
                'No dataset directory provided; cannot run evaluation.')
        elif not os.path.exists(data_dir):
            raise ValueError(
                f'{data_dir} not found; cannot run evaluation on the test set.')
        else:
            if hasattr(model, "update_data_dirs"):
                model.update_data_dirs(
                    data_dir=data_dir,
                    dialogues_example_dir=dialogues_example_dir)
                model._cfg.dataset = cfg.model.dataset

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
        eval_device = [cfg.trainer.devices[0]] if isinstance(cfg.trainer.devices, list) else 1
        trainer = pl.Trainer(devices=eval_device,
                             accelerator=cfg.trainer.accelerator,
                             precision=16)
        model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
        if model.prepare_test(trainer):
            trainer.test(model)