"""GPT-NeoX training script: builds the tokenizer and decoder model from config.

NOTE(review): this module does real work at import time (tokenizer and model
construction); the ``__main__`` block further down repeats a similar setup.
Left as-is here to preserve behavior.
"""
import random

import deepspeed
import torch
import torch.distributed as distributed
from torch.utils.data import DataLoader
from tqdm.auto import trange
# Fix: RepeatingLoader is used by build_eval_data_iter below but was never
# imported; DeepSpeed ships one in deepspeed.utils.
from deepspeed.utils import RepeatingLoader

from gpt_neox import (GPTNeoX, AutoregressiveWrapper, GPT2Dataset,
                      extract_tarfile, prepare_optimizer_parameters,
                      get_tokenizer, is_main, prepare_data)
from gpt_neox.utils import get_args, get_params

train_args = get_args()
params = get_params(train_args.model)

# tokenizer — type/pretrained/padding behavior all come from the model config,
# each with a safe default when the key is absent.
tokenizer = get_tokenizer(
    tokenizer_type=params["tokenizer"].get("type", None),
    from_pretrained=params["tokenizer"].get("from_pretrained", True),
    add_padding_token=params["tokenizer"].get("add_padding_token", False))
# Fall back to the tokenizer's own vocabulary size when the config leaves
# "vocab_size" unset.
vocab_size = len(
    tokenizer) if params["vocab_size"] is None else params["vocab_size"]

# instantiate GPT-like decoder model
model = GPTNeoX(num_tokens=vocab_size,
                dim=params["hidden_dim"],
                seq_len=params["seq_len"],
                depth=params["n_layers"],
                heads=params["n_heads"],
                dim_head=params["dim_head"])
def build_eval_data_iter(dataset, model): sampler = torch.utils.data.distributed.DistributedSampler( dataset, num_replicas=model.dp_world_size, rank=model.mpu.get_data_parallel_rank(), shuffle=False) # Build a loader and make it repeating. pipe_dataloader = model.deepspeed_io(dataset, data_sampler=sampler) pipe_dataloader = RepeatingLoader(pipe_dataloader) return pipe_dataloader if __name__ == '__main__': # arguments args = get_args() IS_MAIN = is_main(args) deepspeed.init_distributed(dist_backend='nccl') # tokenizer tokenizer = get_tokenizer( tokenizer_type=args["tokenizer"].get("type", None), from_pretrained=args["tokenizer"].get("from_pretrained", True), add_padding_token=args["tokenizer"].get("add_padding_token", False)) vocab_size = len( tokenizer) if args["vocab_size"] is None else args["vocab_size"] # model model = GPTNeoX_Pipe(num_tokens=vocab_size,