Example #1
def get_gpt2_model(args_others, mp_size=1):
    import sys
    import torch

    from megatron.model import GPT2Model
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }

    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 so the word-embedding size does not change during the resizing test.
    sys.argv.extend([
        '--model-parallel-size',
        str(mp_size), '--make-vocab-size-divisible-by',
        str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)
    model = GPT2Model(num_tokentypes=0, parallel_output=False)
    model.cuda()
    from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
    from megatron import mpu
    i = torch.cuda.current_device()
    model = torchDDP(model,
                     device_ids=[i],
                     output_device=i,
                     process_group=mpu.get_data_parallel_group())

    return model
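A minimal sketch of how this helper might be called from a test. The keys in args_others mirror ordinary Megatron argument names, and the values are placeholders rather than anything taken from the original test suite:

# Hypothetical caller; keys mirror standard Megatron arguments and values are illustrative.
args_others = {
    'num_layers': 2,
    'hidden_size': 128,
    'num_attention_heads': 8,
    'max_position_embeddings': 128,
    'seq_length': 128,
}
model = get_gpt2_model(args_others, mp_size=1)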
Example #2
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={
                            'tokenizer_type': 'GPT2BPETokenizer',
                            'no_load_rng': True,
                            'no_load_optim': True
                        })

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print(
            "Interleaved pipeline schedule is not yet supported for text generation."
        )
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Generate samples.
    if args.num_samples == 0:
        args.micro_batch_size = 1
        if args.sample_input_file is not None:
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
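For reference, a hedged sketch of driving this entry point programmatically, in the same sys.argv style as Example #1. Only the flags read above (args.num_samples, args.sample_input_file) plus checkpoint and tokenizer paths are shown; a real launch also needs the usual model-size arguments, and the paths are placeholders:

import sys

# Hypothetical invocation; paths are placeholders.
sys.argv.extend([
    '--num-samples', '0',                  # 0 selects prompt-driven generation
    '--sample-input-file', 'prompts.txt',  # omit to fall back to interactive mode
    '--load', 'checkpoints/gpt2',
    '--vocab-file', 'gpt2-vocab.json',
    '--merge-file', 'gpt2-merges.txt',
])
main()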
Example #3
def setup_for_inference_or_eval(inference=True,
                                get_key_value=True,
                                overwrite_values=None):

    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args, inference=inference, get_key_value=get_key_value
    )  # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    print_rank_0('Finished loading model')
    return model, neox_args
Example #4
def setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values=None,
):
    """
    Initializes the model for evaluation or inference (doesn't load optimizer states, etc.) from command line args.

    use_cache: bool
        Whether to use key value caching in inference.
    overwrite_values: dict
        Optional Values to overwrite in the model config.
    """

    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
        "zero_optimization":
        None,  # disable zero optimization (won't be used in inference, and loading zero optimizer can cause errors)
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args,
        use_cache=use_cache,
        iteration=neox_args.iteration,
    )  # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    print_rank_0("Finished loading model")

    model.module.inference_mode(use_cache=use_cache)
    return model, neox_args
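A minimal caller sketch for this setup function; the overwrite_values key follows the NeoX config schema and the value shown is purely illustrative:

# Hypothetical usage: load the model from the configured checkpoint for inference.
model, neox_args = setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values={"temperature": 0.7},  # illustrative config override
)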
Example #5
def main():
    """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset
    - Include all args needed for initial model specification

    Other key args:
        --block-data-path: path to write to
        --ict-load or --realm-load: path to checkpoint with which to embed
        --data-path and --titles-data-path: paths for dataset
        --indexer-log-interval: reporting interval
        --indexer-batch-size: size specific for indexer jobs

    Check README.md for example script
    """

    initialize_megatron(
        extra_args_provider=None,
        args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
    index_builder = IndexBuilder()
    index_builder.build_and_save_index()
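A hedged sketch of invoking this indexer entry point, built only from the flags listed in the docstring; all paths and values are placeholders:

import sys

# Hypothetical invocation using the docstring's argument list.
sys.argv.extend([
    '--block-data-path', 'block_data.pkl',
    '--ict-load', 'checkpoints/ict',
    '--data-path', 'wikipedia_text_document',
    '--titles-data-path', 'wikipedia_titles_document',
    '--indexer-log-interval', '1000',
    '--indexer-batch-size', '128',
])
main()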
Example #6
    def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
        from megatron.initialize import initialize_megatron

        args_defaults = {
            'vocab_file': get_test_path('gpt2-vocab.json'),
            'merge_file': get_test_path('gpt2-merges.txt'),
            'tokenizer_type': 'GPT2BPETokenizer',
        }

        args_defaults.update(args_others)

        # Set --make-vocab-size-divisible-by to 1 so the word-embedding size does not change during the resizing test.
        sys.argv.extend([
            '--model-parallel-size',
            str(mp_size), '--make-vocab-size-divisible-by',
            str(1)
        ])

        initialize_megatron(args_defaults=args_defaults,
                            ignore_unknown_args=True)

        from megatron.model.transformer import ParallelTransformerLayer

        class ParallelTransformerLayerPipe(ParallelTransformerLayer):
            def forward(self, args):
                # hardcode attn mask for testing, PP requires the attn_mask to be stashed
                attention_mask = torch.tensor(
                    [[True]], device=torch.cuda.current_device())
                return super().forward(args, attention_mask)

        layers = []
        for x in range(num_layers):
            layers.append(
                LayerSpec(ParallelTransformerLayerPipe,
                          self.gpt2_attention_mask_func,
                          self.init_method_normal(0.02),
                          self.scaled_init_method_normal(0.02, num_layers), x))
        super().__init__(layers=layers,
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         topology=topo,
                         **kwargs)
Example #7
    def __init__(
        self,
        model_name,
        vocab_file,
        hidden_size=1024,
        num_attention_heads=16,
        num_layers=24,
        max_seq_length=512,
        tokenizer_type='BertWordPieceLowerCase',
        init_method_std=0.02,
        num_tokentypes=2,
    ):

        super().__init__()

        if not os.path.exists(vocab_file):
            raise ValueError(f'Vocab file not found at {vocab_file}')

        megatron_args = {
            "num_layers": num_layers,
            "hidden_size": hidden_size,
            "num_attention_heads": num_attention_heads,
            "max_position_embeddings": max_seq_length,
            "tokenizer_type": tokenizer_type,
            "vocab_file": vocab_file,
        }

        initialize_megatron(None, megatron_args, ignore_unknown_args=True)
        init_method = init_method_normal(init_method_std)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=False,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(
                init_method_std, num_layers),
        )

        self.language_model.to(self._device)
        self._hidden_size = self.language_model.hidden_size
Example #8
    def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
        from megatron.initialize import initialize_megatron

        args_defaults = {
            'vocab_file': 'tests/unit/gpt2-vocab.json',
            'merge_file': 'tests/unit/gpt2-merges.txt',
            'tokenizer_type': 'GPT2BPETokenizer',
        }

        args_defaults.update(args_others)

        # Set --make-vocab-size-divisible-by to 1 so the word-embedding size does not change during the resizing test.
        sys.argv.extend([
            '--model-parallel-size',
            str(mp_size),
            '--make-vocab-size-divisible-by',
            str(1)
        ])

        initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

        from megatron.model.transformer import ParallelTransformerLayer

        class ParallelTransformerLayerPipe(ParallelTransformerLayer):
            def forward(self, args):
                hidden_states, attention_mask = args[0], args[1]
                return super().forward(*args), attention_mask

        layers = []
        for x in range(num_layers):
            layers.append(
                LayerSpec(ParallelTransformerLayerPipe,
                          self.gpt2_attention_mask_func,
                          self.init_method_normal(0.02),
                          self.scaled_init_method_normal(0.02,
                                                         num_layers),
                          x))
        super().__init__(layers=layers,
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         topology=topo,
                         **kwargs)
Example #9
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    if args.num_samples == 0:
        args.batch_size = 1
        if args.sample_input_file != "":
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
Example #10
    def __init__(self,
                 max_seq_len=DEFAULT_MAX_SEQ_LEN,
                 vocab_path=DEFAULT_VOCAB_PATH,
                 regex=REGEX,
                 default_chem_token_start=DEFAULT_CHEM_TOKEN_START,
                 checkpoints_dir=CHECKPOINTS_DIR,
                 num_layers=DEFAULT_NUM_LAYERS,
                 hidden_size=DEFAULT_D_MODEL,
                 num_attention_heads=DEFAULT_NUM_HEADS,
                 decoder_max_seq_len=None) -> None:
        super().__init__()

        # Disable gradients globally rather than using a `with torch.no_grad():`
        # block, which would exit (re-enabling gradients) at the end of this scope.
        torch.set_grad_enabled(False)

        self.device = 'cuda'  # Megatron arg loading seems to only work with GPU
        self.min_jitter_radius = 1.0
        self.max_model_position_embeddings = max_seq_len

        args = {
            'num_layers': num_layers,
            'hidden_size': hidden_size,
            'num_attention_heads': num_attention_heads,
            'max_position_embeddings': self.max_model_position_embeddings,
            'tokenizer_type': 'GPT2BPETokenizer',
            'vocab_file': vocab_path,
            'load': checkpoints_dir
        }

        with torch.no_grad():
            initialize_megatron(args_defaults=args, ignore_unknown_args=True)
            args = get_args()
            self.tokenizer = self.load_tokenizer(args.vocab_file, regex,
                                                 default_chem_token_start)
            self.model = self.load_model(args, self.tokenizer,
                                         decoder_max_seq_len)
Example #11
    group.add_argument('--train-data',
                       nargs='*',
                       default=None,
                       help='Whitespace separated paths or corpora names '
                       'for training.')
    group.add_argument('--valid-data',
                       nargs='*',
                       default=None,
                       help='path(s) to the validation data.')
    group.add_argument('--overlapping-eval',
                       type=int,
                       default=32,
                       help='Sliding window for overlapping evaluation.')
    group.add_argument('--strict-lambada',
                       action='store_true',
                       help='Use more difficult formulation of lambada.')

    return parser


if __name__ == '__main__':

    initialize_megatron(extra_args_provider=get_tasks_args)

    args = get_args()
    if args.task in ['LAMBADA', 'WIKITEXT103']:
        from zeroshot_gpt2.evaluate import main
    else:
        raise NotImplementedError('Task {} is not implemented.'.format(
            args.task))

    main()
Example #12
def pretrain(train_valid_test_dataset_provider, model_provider,
             forward_step_func, extra_args_provider=None, args_defaults={}):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule using the model_provider.
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model using the forward_step_func.

    Arguments:
        train_valid_test_dataset_provider: a function that takes the size of
            train/valid/test dataset and returns `train, valid, test` datasets.
        model_provider: a function that returns a vanilla version of the
            model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
        forward_step_func: a function that takes a `data iterator` and `model`,
            and returns a `loss` scalar with a dictionary with key:values being
            the info we would like to monitor during training, for example
            `lm-loss: value`. We also require that this function add
            `batch generator` to the timers class.
        extra_args_provider: a function that takes a parser and adds arguments
            to it. It is used for programs to add their own arguments.
        args_defaults: a dictionary from argument-name to argument-value. It is
            used to set default values for the parsed arguments.
    """

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)

    args = get_args()
    timers = get_timers()

    # Model, optimizer, and learning rate.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # Data stuff.
    timers('train/valid/test data iterators').start()
    train_data_iterator, valid_data_iterator, test_data_iterator \
        = build_train_valid_test_data_iterators(
            train_valid_test_dataset_provider)
    timers('train/valid/test data iterators').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['model and optimizer', 'train/valid/test data iterators'])
    print_rank_0('training ...')

    iteration = 0
    if args.do_train and args.train_iters > 0:
        iteration = train(forward_step_func,
                          model, optimizer, lr_scheduler,
                          train_data_iterator, valid_data_iterator)

    if args.do_valid:
        prefix = 'the end of training for val data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   valid_data_iterator, model,
                                   iteration, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   test_data_iterator, model,
                                   0, True)
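The docstring above spells out the contract for the three callables. Below is a minimal caller sketch under those assumptions; the provider bodies and the build_my_datasets helper are placeholders, and the GPT2Model construction mirrors Example #1:

def train_valid_test_dataset_provider(train_val_test_num_samples):
    # Return (train, valid, test) datasets sized according to the request.
    return build_my_datasets(train_val_test_num_samples)  # hypothetical helper

def model_provider():
    # A plain model; pretrain() handles fp16/DDP wrapping.
    from megatron.model import GPT2Model
    return GPT2Model(num_tokentypes=0, parallel_output=True)

def forward_step(data_iterator, model):
    # Per the docstring: return a loss scalar plus a dict of values to monitor.
    batch = next(data_iterator)          # placeholder batch handling
    loss = model(batch)                  # placeholder forward pass
    return loss, {'lm-loss': loss}

if __name__ == '__main__':
    pretrain(train_valid_test_dataset_provider, model_provider, forward_step,
             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})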
Example #13
def pretrain(train_valid_test_dataset_provider,
             model_provider,
             forward_step_func,
             extra_args_provider=None,
             args_defaults={}):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule using the model_provider.
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model using the forward_step_func.

    Arguments:
        train_valid_test_dataset_provider: a function that takes the size of
            train/valid/test dataset and returns `train, valid, test` datasets.
        model_provider: a function that returns a vanilla version of the
            model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
        forward_step_func: a function that takes a `data iterator` and `model`,
            and returns a `loss` scalar with a dictionary with key:values being
            the info we would like to monitor during training, for example
            `lm-loss: value`. We also require that this function add
            `batch generator` to the timers class.
        extra_args_provider: a function that takes a parser and adds arguments
            to it. It is used for programs to add their own arguments.
        args_defaults: a dictionary from argument-name to argument-value. It is
            used to set default values for the parsed arguments.
    """

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)

    # Adjust the startup time so it reflects the largest value.
    # This will be closer to what the scheduler will see (outside of
    # image ... launches).
    global _TRAIN_START_TIME
    start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME])
    torch.distributed.all_reduce(start_time_tensor,
                                 op=torch.distributed.ReduceOp.MIN)
    _TRAIN_START_TIME = start_time_tensor.item()
    print_rank_0('time to initialize megatron (seconds): {:.3f}'.format(
        time.time() - _TRAIN_START_TIME))
    print_datetime('after megatron is initialized')

    args = get_args()
    timers = get_timers()

    # Model, optimizer, and learning rate.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()
    print_datetime('after model, optimizer, and learning rate '
                   'scheduler are built')

    # Data stuff.
    timers('train/valid/test data iterators').start()
    train_data_iterator, valid_data_iterator, test_data_iterator \
        = build_train_valid_test_data_iterators(
            train_valid_test_dataset_provider)
    timers('train/valid/test data iterators').stop()
    print_datetime('after dataloaders are built')

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['model and optimizer', 'train/valid/test data iterators'])
    print_rank_0('training ...')

    iteration = 0
    if args.do_train and args.train_iters > 0:
        iteration = train(forward_step_func, model, optimizer, lr_scheduler,
                          train_data_iterator, valid_data_iterator)
    print_datetime('after training is done')

    if args.do_valid:
        prefix = 'the end of training for val data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   valid_data_iterator, model, iteration,
                                   False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   test_data_iterator, model, 0, True)
Example #14
def pretrain(neox_args):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model.

    Arguments:
        neox_args: an instance of NeoXArgs containing the configuration for pretrain

    """
    # setup logging and timers
    init_wandb(neox_args=neox_args)
    timers = Timers(use_wandb=neox_args.use_wandb,
                    tensorboard_writer=neox_args.tensorboard_writer)

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(neox_args=neox_args)

    # Model, optimizer, and learning rate.
    timers("model and optimizer").start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        neox_args=neox_args, use_cache=False)
    timers("model and optimizer").stop()

    # Data stuff.
    timers("train/valid/test data iterators").start()
    (
        train_data_iterator,
        valid_data_iterator,
        test_data_iterator,
    ) = build_train_valid_test_data_iterators(neox_args=neox_args)
    timers("train/valid/test data iterators").stop()

    # Print setup timing.
    print_rank_0("done with setups ...")
    timers.log(["model and optimizer", "train/valid/test data iterators"])
    print_rank_0("training ...")

    iteration = 0
    if neox_args.do_train and neox_args.train_iters > 0:
        iteration = train(
            neox_args=neox_args,
            timers=timers,
            model=model,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            train_data_iterator=train_data_iterator,
            valid_data_iterator=valid_data_iterator,
        )

    if neox_args.do_valid:
        prefix = "the end of training for val data"
        evaluate_and_print_results(
            neox_args=neox_args,
            prefix=prefix,
            forward_step_func=forward_step,
            data_iterator=valid_data_iterator,
            model=model,
            iteration=iteration,
            verbose=False,
            timers=timers,
        )

    if neox_args.save and iteration != 0:
        save_checkpoint(
            neox_args=neox_args,
            iteration=iteration,
            model=model,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
        )

    if neox_args.do_test:
        # Run on test data.
        prefix = "the end of training for test data"
        evaluate_and_print_results(
            neox_args=neox_args,
            prefix=prefix,
            forward_step_func=forward_step,
            data_iterator=test_data_iterator,
            model=model,
            iteration=0,  # iteration 0 in order to always use full test data
            verbose=True,
            timers=timers,
        )
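A minimal entry-point sketch for this NeoX variant, mirroring how the arguments are consumed in Examples #3 and #4; it assumes the script is launched through the usual NeoX launcher so that consume_neox_args can read the serialized config:

from megatron.neox_arguments import NeoXArgs

if __name__ == "__main__":
    neox_args = NeoXArgs.consume_neox_args()
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()
    pretrain(neox_args=neox_args)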
Example #15
def main():
    """Main program."""
    drmode = 0  # 0: read prompts from para.txt; 1: read prompts from questions.txt
    mode = 0  # 0: single generation; 1: scored generation, keep the best-scoring output
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    tokenizer = get_tokenizer()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    import random

    import jsonlines

    # Pick the prompt file and output directory based on the two mode flags.
    if drmode == 1:
        input_file = "questions.txt"
        out_dir = "qa_345M" if mode == 0 else "qa_345M_ip"
    else:
        input_file = "para.txt"
        out_dir = "pa_345M" if mode == 0 else "pa_345M_ip"

    with open(input_file, 'r') as f:
        question_list = f.readlines()

    if out_dir not in os.listdir():
        os.mkdir(out_dir)

    while True:
        question = random.choice(question_list)
        out_name = question[:20] + '.jsonl'
        # Skip prompts that already have an output file.
        if out_name in os.listdir(out_dir):
            continue

        if mode == 0:
            output_string = generate_one_text(model, tokenizer, args, question)
            print(question, output_string)
            answer = output_string
        else:
            output_string, output_scores = generate_string(
                model, tokenizer, args, question)
            # argsort is ascending; index 0 is the lowest score, which this
            # script treats as the best candidate.
            ranklist = np.argsort(output_scores)
            answer = output_string[ranklist[0]]

        with jsonlines.open(os.path.join(out_dir, out_name), mode='w') as writer:
            writer.write({'question': question, 'answer': answer})
Example #16
    group.add_argument("--top_p",
                       type=float,
                       default=0.0,
                       help='Top p sampling.')
    group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
    group.add_argument("--out-seq-length",
                       type=int,
                       default=1024,
                       help='Size of the output generated text.')
    return parser


if __name__ == "__main__":
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={
                            'tokenizer_type': 'GPT2BPETokenizer',
                            'no_load_rng': True,
                            'no_load_optim': True
                        })

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print(
            "Interleaved pipeline schedule is not yet supported for text generation."
        )
        exit()
    # Set up model and load checkpoint
    model = get_model(model_provider, wrap_with_ddp=False)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)