def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model
Example 2
def save_checkpoint(iteration,
                    model,
                    optimizer,
                    lr_scheduler,
                    args,
                    deepspeed=False):
    """Save a model checkpoint."""
    if deepspeed:
        save_ds_checkpoint(iteration, model, args)
    else:
        # Only rank zero of the data parallel group writes to disk.
        if isinstance(model, torchDDP):
            model = model.module

        if mpu.get_data_parallel_rank() == 0:
            checkpoint_name = get_checkpoint_name(args.save, iteration)
            print(
                'global rank {} is saving checkpoint at iteration {:7d} to {}'.
                format(torch.distributed.get_rank(), iteration,
                       checkpoint_name))

            sd = {}
            sd['iteration'] = iteration
            sd['model'] = model.state_dict()

            # Optimizer stuff.
            if not args.no_save_optim:
                if optimizer is not None:
                    sd['optimizer'] = optimizer.state_dict()
                if lr_scheduler is not None:
                    sd['lr_scheduler'] = lr_scheduler.state_dict()

            # rng states.
            if not args.no_save_rng:
                sd['random_rng_state'] = random.getstate()
                sd['np_rng_state'] = np.random.get_state()
                sd['torch_rng_state'] = torch.get_rng_state()
                sd['cuda_rng_state'] = torch.cuda.get_rng_state()
                sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker(
                ).get_states()

            ensure_directory_exists(checkpoint_name)
            torch.save(sd, checkpoint_name)
            print('  successfully saved {}'.format(checkpoint_name))

    # Wait so everyone is done (necessary)
    torch.distributed.barrier()
    # And update the latest iteration
    if torch.distributed.get_rank() == 0:
        tracker_filename = get_checkpoint_tracker_filename(args.save)
        with open(tracker_filename, 'w') as f:
            f.write(str(iteration))
    # Wait so everyone is done (not necessary)
    torch.distributed.barrier()
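
# A hedged usage sketch (not part of the original example): how save_checkpoint
# is typically driven from a training loop. `args.save_interval` is an assumed
# argument name here; only `args.save` and `args.deepspeed` appear in the
# surrounding examples.
def maybe_save_checkpoint(iteration, model, optimizer, lr_scheduler, args):
    """Periodically persist training state via save_checkpoint above."""
    if args.save and iteration % args.save_interval == 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args,
                        deepspeed=args.deepspeed)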
Example 3
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Use sparse attention with mode {args.sparse_mode}")
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        model = load_huggingface_model(model, args.load_huggingface, args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
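
# For illustration only (the example numbers below are assumptions, not from
# the source): with num_attention_heads=16 and model_parallel_size=4, the
# assert near the top of get_model passes and each model-parallel rank owns
# num_local_heads = 16 // 4 = 4 attention heads; with model_parallel_size=3
# the divisibility assert would fail.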
Example 4
def get_checkpoint_name(checkpoints_path,
                        iteration,
                        release=False,
                        zero=False):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
                        'model_optim_rng.pt')
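
# Illustrative paths produced by get_checkpoint_name (assumed values: model
# parallel rank 0, data parallel rank 0, and a hypothetical '/ckpts' save dir):
#   get_checkpoint_name('/ckpts', 1000)
#       -> /ckpts/iter_0001000/mp_rank_00/model_optim_rng.pt
#   get_checkpoint_name('/ckpts', 1000, release=True)
#       -> /ckpts/release/mp_rank_00/model_optim_rng.pt
#   get_checkpoint_name('/ckpts', 1000, zero=True)
#       -> /ckpts/iter_0001000_zero_dp_rank_0/mp_rank_00/model_optim_rng.pt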
Example 5
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    print ("Calling GPT3Model constructor...")  
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    print (f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        rint ("we have NOT halfed the model before, and now we're wrapping it into a fp16_module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print ("Setting up distributed training...")
    print ("No classic pytorch DDP this time; \nUsing sberbank magic DDP")
    model = DDP(model)

    input ("ready to return model")
    return model
Example 6
def make_gpt3_dataloaders(args):
    # Data parallel arguments
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    # global_batch_size = args.batch_size * world_size
    num_workers = args.num_workers

    # data_dir = args.train_data_path if args.train_data_path else os.path.dirname(args.test_data_path)
    tokenizer_path = args.load_huggingface if args.load_huggingface else \
        (args.tokenizer_path if args.tokenizer_path else os.path.join(os.path.dirname(args.train_data_path),
                                                                      '_tokenizer/'))
    print_rank_0('Load tokenizer from ' + tokenizer_path)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    tokenizer.add_special_tokens({"bos_token": "<s>"})
    tokenizer.add_special_tokens({"eos_token": "</s>"})

    print("Add answer_sep:", args.answer_sep)
    tokenizer.add_tokens(args.answer_sep)

    print("Add start_sep", args.start_sep)
    tokenizer.add_tokens(args.start_sep)

    print("Add start_sep", args.end_sep)
    tokenizer.add_tokens(args.end_sep)

    eod_token = tokenizer.encoder['<pad>']
    num_tokens = len(tokenizer)

    train_dataset_args = RuGpt3DatasetArguments(
        block_size=args.seq_length,
        max_files_load=args.max_files_per_process,
        overwrite_cache=args.overwrite_cache,
        tqdm=False)
    eval_dataset_args = RuGpt3DatasetArguments(
        block_size=args.seq_length,
        max_files_load=args.max_files_per_process,
        overwrite_cache=args.overwrite_cache,
        tqdm=True)

    def make_data_loader_(data_path, dataset_args):
        print_rank_0(
            f'Load RuGPT3 Dataset from {data_path}, {dataset_args.max_files_load} files per process'
        )
        dataset = RuGpt3TextDataset(
            tokenizer=tokenizer,
            args=dataset_args,
            rank=rank,
            world_size=world_size,
            file_path=data_path,
            # cache_prefix=args.cache_prefix
            all_args=args)
        # Use a simple sampler with distributed batch sampler.
        sampler = torch.utils.data.SequentialSampler(dataset)
        batch_sampler = ResumableBatchSampler(sampler=sampler,
                                              batch_size=args.batch_size,
                                              drop_last=True)

        return InfiniteDataLoader(dataset,
                                  batch_sampler=batch_sampler,
                                  num_workers=num_workers,
                                  pin_memory=True)

    train = make_data_loader_(
        args.train_data_path,
        train_dataset_args) if args.train_data_path else None
    valid = make_data_loader_(
        args.val_data_path, eval_dataset_args) if args.val_data_path else None
    test = make_data_loader_(
        args.test_data_path,
        eval_dataset_args) if args.test_data_path else None

    args.do_train = train is not None
    args.do_valid = valid is not None
    args.do_test = test is not None

    return (train, valid, test), num_tokens, eod_token, tokenizer
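
# A minimal caller sketch (an assumption about the surrounding training script,
# not taken from the original listing): unpack the loaders together with the
# vocabulary size, the padding/end-of-document token id and the tokenizer.
def build_gpt3_data(args):
    (train_data, val_data, test_data), num_tokens, eod_token, tokenizer = \
        make_gpt3_dataloaders(args)
    # args.do_train / args.do_valid / args.do_test are set inside the call.
    return train_data, val_data, test_data, num_tokens, eod_token, tokenizer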
Example 7
def load_checkpoint(model, optimizer, lr_scheduler, args, deepspeed=False):
    """Load a model checkpoint."""

    print(f"load_checkpoint has deepspeed {deepspeed}")
    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if deepspeed:
        print("loading a deepspeed checkpoint...")

        import torch
        thingy = torch.cuda.memory_summary(device=torch.cuda.current_device(),
                                           abbreviated=False)
        print(f"reporting memory usage:{thingy}")

        print("Attempting to free some memory...")
        try:
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"Something went wrong:{e}")
        print("Trying to collect garbage:")
        try:
            import gc
            gc.collect()
        except Exception as e:
            print(f"Something went wrong:{e}")
        thingy = torch.cuda.memory_summary(device=torch.cuda.current_device(),
                                           abbreviated=False)
        print(
            f"We've tried to free some memory, \nand now are reporting memory usage:{thingy}"
        )

        #         load_optim = not args.no_load_optim
        print("Oh, screw it, let's not load an optimizer...")
        load_optim = False
        checkpoint_name, sd = model.load_checkpoint(
            args.load,
            iteration,
            load_optimizer_states=load_optim,
            load_lr_scheduler_states=load_optim)

        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:

        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load iteration '
                    'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load RNG states from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the RNG '
                'state.'.format(checkpoint_name))
            exit()

    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration
Example 8
def load_checkpoint(model, optimizer, lr_scheduler, args, deepspeed=False):
    """Load a model checkpoint."""

    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if deepspeed:
        load_optim = not args.no_load_optim
        checkpoint_name, sd = model.load_checkpoint(
            args.load,
            iteration,
            load_optimizer_states=load_optim,
            load_lr_scheduler_states=load_optim)

        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:

        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load iteration '
                    'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load RNG states from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the RNG '
                'state.'.format(checkpoint_name))
            exit()

    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration
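
# A hedged resume sketch (assumed driver code, not from the original source):
# restore state from `args.load` when it is set and continue counting
# iterations from the value load_checkpoint returns.
def maybe_resume(model, optimizer, lr_scheduler, args):
    iteration = 0
    if args.load:
        iteration = load_checkpoint(model, optimizer, lr_scheduler, args,
                                    deepspeed=args.deepspeed)
    return iteration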
Example 9
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    print ("asserting we have a correct number of attention heads...")
    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        print ("we're using deepspeed, and so we're getting a sparse attention config")
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Using sparse attention with mode {args.sparse_mode}")
    print ("Calling GPT3Model constructor...")    
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        print ("Loading huggingface model...")
        model = load_huggingface_model(model, args.load_huggingface, args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        print ("We've had deepspeed AND fp16, so we're halfing the model...")
        model.half()

    # GPU allocation.
    print (f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print ("we've halfed the model before, but now we're wrapping it into a fp16_module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print ("Setting up distributed training...")
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        print (f"Using classic pytorch DDP with device {i}")
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        print ("Using sberbank magic DDP")
        model = DDP(model)

#     input ("ready to return model")
    print ("ready to return model")
    return model