import deepspeed
import torch

# The project-specific helpers used below (get_args, get_params, get_tokenizer, GPTNeoX,
# AutoregressiveWrapper, is_main, prepare_data) are assumed to be imported from the
# gpt_neox package.

train_args = get_args()
params = get_params(train_args.model)

# tokenizer
tokenizer = get_tokenizer(
    tokenizer_type=params["tokenizer"].get("type", None),
    from_pretrained=params["tokenizer"].get("from_pretrained", True),
    add_padding_token=params["tokenizer"].get("add_padding_token", False))
vocab_size = (len(tokenizer)
              if params["vocab_size"] is None else params["vocab_size"])

# instantiate GPT-like decoder model
model = GPTNeoX(num_tokens=vocab_size,
                dim=params["hidden_dim"],
                seq_len=params["seq_len"],
                depth=params["n_layers"],
                heads=params["n_heads"],
                dim_head=params["dim_head"])

model = AutoregressiveWrapper(model)
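# The autoregressive wrapper typically shifts the token sequence by one position to
# build next-token targets, so calling the wrapped model returns the language-modelling
# loss directly; wrappers of this kind usually also expose a generate() method for sampling.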

# prepare data
dset_params = params["dataset"]
assert dset_params is not None

deepspeed.init_distributed(dist_backend='nccl')
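# init_distributed sets up torch.distributed (here with the NCCL backend), which must
# happen before the collective calls (the barriers below) can be used.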
torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
if is_main(train_args):
    prepare_data(dset_params["name"])
    torch.distributed.barrier()  # signal the other ranks once data preparation is finished
else:
    torch.distributed.barrier()  # non-main ranks wait here while the main rank prepares the data
Example #2

train_args = get_args()
params = get_params(train_args.model)

# tokenizer
tokenizer = get_tokenizer(
    tokenizer_type=params["tokenizer"].get("type", None),
    from_pretrained=params["tokenizer"].get("from_pretrained", True),
    add_padding_token=params["tokenizer"].get("add_padding_token", False))
vocab_size = (len(tokenizer)
              if params["vocab_size"] is None else params["vocab_size"])

# instantiate GPT-like decoder model
model = GPTNeoX(num_tokens=vocab_size,
                dim=params["hidden_dim"],
                seq_len=params["seq_len"],
                depth=params["n_layers"],
                heads=params["n_heads"],
                dim_head=params["dim_head"],
                gradient_checkpointing=params.get("gradient_checkpointing",
                                                  True))
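# Gradient checkpointing trades compute for memory: activations are recomputed during
# the backward pass instead of being cached, lowering peak memory at the cost of an
# extra forward computation per checkpointed block.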

model = AutoregressiveWrapper(model)

# prepare data
dset_params = params["dataset"]
assert dset_params is not None

deepspeed.init_distributed(dist_backend='nccl')
torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
if is_main(train_args):
    prepare_data(dset_params["name"])
    torch.distributed.barrier()  # signal the other ranks once data preparation is finished
else:
    torch.distributed.barrier()  # non-main ranks wait here while the main rank prepares the data
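
# --- Hypothetical continuation (not part of the original snippet) ---
# A minimal sketch of how the wrapped model is typically handed to DeepSpeed once the
# data barrier has been passed. `train_dataset` and the DeepSpeed config reachable via
# `train_args` are assumptions, not taken from the original example.
model_engine, _, train_loader, _ = deepspeed.initialize(
    args=train_args,                      # expected to carry the DeepSpeed config path
    model=model,
    model_parameters=model.parameters(),
    training_data=train_dataset)          # assumed dataset of token-id sequences

for batch in train_loader:
    loss = model_engine(batch.to(model_engine.local_rank))  # AutoregressiveWrapper returns the LM loss
    model_engine.backward(loss)           # DeepSpeed handles gradient scaling/accumulation
    model_engine.step()                   # optimizer step + learning-rate schedule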