train_args = get_args()
params = get_params(train_args.model)

# tokenizer
tokenizer = get_tokenizer(
    tokenizer_type=params["tokenizer"].get("type", None),
    from_pretrained=params["tokenizer"].get("from_pretrained", True),
    add_padding_token=params["tokenizer"].get("add_padding_token", False))
vocab_size = len(tokenizer) if params["vocab_size"] is None else params["vocab_size"]
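# (vocab_size falls back to the tokenizer's own vocabulary size when the config leaves it unset)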

# instantiate GPT-like decoder model
params["seq_len"] = 2049
model = GPTNeoX(num_tokens=vocab_size,
                dim=params["hidden_dim"],
                seq_len=params["seq_len"],
                depth=params["n_layers"],
                heads=params["n_heads"],
                dim_head=params["dim_head"])

model = AutoregressiveWrapper(model)
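# A minimal usage sketch (assumption, not shown in this snippet): the wrapped model is
# typically called on a batch of token ids and returns the next-token prediction loss, e.g.
#   loss = model(batch)   # batch: LongTensor of shape (batch_size, seq_len)
#   loss.backward()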

# prepare data
dset_params = params["dataset"]
assert dset_params is not None

deepspeed.init_distributed(dist_backend='nccl')
torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
# if is_main(train_args):
#     prepare_data(dset_params["name"])
#     torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
if is_main(train_args):
    prepare_data(dset_params["name"])
    torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
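    # (presumably the non-main ranks hit a matching torch.distributed.barrier() in a branch
    # that is cut off in this excerpt, so every rank waits for data preparation to finish)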

if __name__ == '__main__':
    # arguments
    IS_MAIN = is_main(train_args)
    deepspeed.init_distributed(dist_backend='nccl')

    # only display system stats from one worker per machine
    wandb_settings = wandb.Settings() if is_main(train_args) else wandb.Settings(_disable_stats=True)
    name = f'{socket.gethostname()}-{train_args.local_rank}' if train_args.group_name else None
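    # (runs are grouped under train_args.group_name when one is given; each worker gets a
    # per-host, per-rank run name so the workers can be told apart in wandb)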

    if train_args.mode == 'no_pipeline':
        model = GPTNeoX(
            num_tokens=vocab_size,
            dim=params["hidden_dim"],
            seq_len=params["seq_len"],
            depth=params["n_layers"],
            heads=params["n_heads"],
            dim_head=params["dim_head"]
        )
        use_wandb = True
        try:
            wandb.init(project="neox_train_enwik8", group=train_args.group_name, name=name, save_code=True, force=False,
                       entity=params.get('wandb', {}).get('team'), settings=wandb_settings)
        except UsageError as e:
            use_wandb = False
            print(e)
            print('Skipping wandb. Execute `wandb login` on local machine to enable.')
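        # note: a failed wandb.init() (e.g. no `wandb login` on this machine) only disables
        # logging via use_wandb = False; training itself still proceeds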

        model = AutoregressiveWrapper(model)
        # prepare data
        dset_params = params["dataset"]