def read_context(tokenizer, args, output):
    terminate_runs = 0
    if mpu.get_model_parallel_rank() == 0:
        while True:
            raw_text = input("\nContext prompt (stop to exit) >>> ")
            if not raw_text:
                print('Prompt should not be empty!')
                continue
            if raw_text == "stop":
                terminate_runs = 1
                break
            generation_mask = '[gMASK]' if args.task_mask else '[MASK]'
            if args.block_lm and 'MASK]' not in raw_text:
                raw_text += ' ' + generation_mask
            output.write(raw_text)
            context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization
            if args.block_lm:
                context_tokens = [tokenizer.get_command('ENC').Id] + context_tokens
                if not raw_text.endswith('MASK]'):
                    context_tokens = context_tokens + [tokenizer.get_command('eos').Id]
            context_length = len(context_tokens)
            if context_length >= args.seq_length:
                print("\nContext length", context_length,
                      "\nPlease give smaller context than the window length!")
                continue
            break
    else:
        context_length = 0

    terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
    torch.distributed.broadcast(terminate_runs_tensor,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    terminate_runs = terminate_runs_tensor[0].item()
    if terminate_runs == 1:
        return terminate_runs, None, None, None

    context_length_tensor = torch.cuda.LongTensor([context_length])
    torch.distributed.broadcast(context_length_tensor,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    context_length = context_length_tensor[0].item()
    if mpu.get_model_parallel_rank() == 0:
        context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
    else:
        context_tokens_tensor = torch.cuda.LongTensor([0] * context_length)
    torch.distributed.broadcast(context_tokens_tensor,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    if mpu.get_model_parallel_rank() != 0:
        raw_text = tokenizer.DecodeIds(context_tokens_tensor.tolist())
    return terminate_runs, raw_text, context_tokens_tensor, context_length

def slice_state_dict(config, loaded_state_dict):
    sliced_state_dict = OrderedDict()
    start_layer_id = (config.n_total_layers // mpu.get_pipeline_parallel_world_size() *
                      mpu.get_pipeline_parallel_group_rank() +
                      min(mpu.get_pipeline_parallel_group_rank(),
                          config.n_total_layers % mpu.get_pipeline_parallel_world_size()))
    end_layer_id = start_layer_id + config.n_layers
    for key, value in loaded_state_dict.items():
        keys = key.split('.')
        global_layer_id = int(keys[2])
        if start_layer_id <= global_layer_id < end_layer_id:
            local_layer_id = global_layer_id - start_layer_id
            new_key = '.'.join(keys[:2] + [str(local_layer_id)] + keys[3:])
            if keys[3] == 'attn' and keys[4] == 'in_proj':
                in_size = mpu.divide(value.size(0), mpu.get_model_parallel_world_size())
                if keys[5] in ('weight', 'bias'):
                    new_value = value[mpu.get_model_parallel_rank() * in_size:
                                      (mpu.get_model_parallel_rank() + 1) * in_size]
                else:
                    raise NotImplementedError(f"Unknown key {key}")
            elif keys[3] == 'attn' and keys[4] == 'out_proj':
                if keys[5] == 'weight':
                    out_size = mpu.divide(value.size(1), mpu.get_model_parallel_world_size())
                    new_value = value[:, mpu.get_model_parallel_rank() * out_size:
                                      (mpu.get_model_parallel_rank() + 1) * out_size]
                elif keys[5] == 'bias':
                    new_value = value
                else:
                    raise NotImplementedError(f"Unknown key {key}")
            elif keys[3] == 'fc1':
                in_size = mpu.divide(value.size(0), mpu.get_model_parallel_world_size())
                if keys[4] in ('weight', 'bias'):
                    new_value = value[mpu.get_model_parallel_rank() * in_size:
                                      (mpu.get_model_parallel_rank() + 1) * in_size]
                else:
                    raise NotImplementedError(f"Unknown key {key}")
            elif keys[3] == 'fc2':
                if keys[4] == 'weight':
                    out_size = mpu.divide(value.size(1), mpu.get_model_parallel_world_size())
                    new_value = value[:, mpu.get_model_parallel_rank() * out_size:
                                      (mpu.get_model_parallel_rank() + 1) * out_size]
                elif keys[4] == 'bias':
                    new_value = value
                else:
                    raise NotImplementedError(f"Unknown key {key}")
            else:
                new_value = value
            sliced_state_dict[new_key] = new_value
    return sliced_state_dict

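# A minimal standalone sketch (hypothetical helper, not part of this repo) of
# the layer-partitioning arithmetic used by slice_state_dict above: with
# n_total_layers = 10 and a pipeline world size of 4, the stages own layers
# [0,3), [3,6), [6,8), [8,10) -- the first (n_total_layers % world_size)
# stages each get one extra layer.
def stage_layer_range(n_total_layers, world_size, stage_rank):
    per_stage = n_total_layers // world_size
    remainder = n_total_layers % world_size
    start = per_stage * stage_rank + min(stage_rank, remainder)
    n_layers = per_stage + (1 if stage_rank < remainder else 0)
    return start, start + n_layers

assert [stage_layer_range(10, 4, r) for r in range(4)] == \
    [(0, 3), (3, 6), (6, 8), (8, 10)]
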
def prepare_tokenizer(args):
    add_sentinel_token = 0
    if args.sentinel_token:
        add_sentinel_token = args.max_position_embeddings
    tokenizer = make_tokenizer(args.tokenizer_type, None, args.tokenizer_path,
                               args.vocab_size, args.tokenizer_model_type,
                               add_block_symbols=args.block_lm,
                               cache_dir=args.cache_dir,
                               add_sentinel_token=add_sentinel_token,
                               add_task_mask=args.task_mask,
                               add_decoder_mask=args.block_mask_prob > 0.0
                               or args.context_mask_ratio > 0.0,
                               fix_command_token=args.fix_command_token)
    if mpu.get_model_parallel_rank() == 0:
        num_tokens = tokenizer.num_tokens
        eod_token = tokenizer.get_command('eos').Id
        assert eod_token == tokenizer.get_command('pad').Id
        before = num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by
        while (after % multiple) != 0:
            after += 1
        print_rank_0('> padded vocab (size: {}) with {} dummy '
                     'tokens (new size: {})'.format(before, after - before, after))
        print_rank_0('> found end-of-document token: {}'.format(eod_token))
        token_counts = torch.cuda.LongTensor([after, eod_token])
    else:
        token_counts = torch.cuda.LongTensor([0, 0])
    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    eod_token = token_counts[1].item()
    args.vocab_size, args.eod_token = num_tokens, eod_token
    return tokenizer

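# The padding loop above just rounds the vocab size up to the next multiple;
# an equivalent closed form (sketch, hypothetical values):
#     after = before + (-before) % multiple
# e.g. before = 50257, multiple = 128 -> after = 50304.
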
def init_model(self):
    # Backbone init.
    torch.cuda.set_device(self.gpu)
    self.net = self.net(**self.backbone_kwargs).cuda(self.gpu)
    self.worker_rank = mpu.get_model_parallel_rank()
    print('DistributedDataParallel worker rank', self.worker_rank)
    if self.use_fp16:
        self.net = apex.parallel.convert_syncbn_model(self.net)
    else:
        self.net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.net)
    self.header = mpu.ArcfaceColumnParallelLinear(
        embedding_size=self.embedding_dim,
        output_classs_size=self.num_classes,
        bias=False).cuda(self.device)
    print('model parallel head generated, classes:', self.num_classes)
    self.header.tag = "ArcfacePallelheader_" + str(self.worker_rank)
    # Optimizer init.
    self.optim_fac = OptimFactory(params=self.net.parameters(),
                                  rigid_lr=self.rigid_lr,
                                  milestones=self.milestones,
                                  **self.backbone_hypers)
    # IO factory init.
    self.io_fac = IOFactory(save_path=self.save_path,
                            worker_rank=self.worker_rank,
                            tag=self.header.tag,
                            vis=self.visualize,
                            log_name="train")

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    self.data_parallel_group = mpu.get_data_parallel_group()
    src_rank = mpu.get_model_parallel_rank()
    for p in self.module.parameters():
        if torch.is_tensor(p):
            dist.broadcast(p, src_rank, group=self.data_parallel_group)

    def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for name, param in self.module.named_parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow."
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                if fp32_allreduce:
                    coalesced = coalesced.float()
                if not no_scale and not reduce_after:
                    coalesced /= dist.get_world_size(group=self.data_parallel_group)
                dist.all_reduce(coalesced, group=self.data_parallel_group)
                torch.cuda.synchronize()
                if not no_scale and reduce_after:
                    coalesced /= dist.get_world_size(group=self.data_parallel_group)
                for buf, synced in zip(grads,
                                       _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    self.hook_handles = []
    self.hooks = []
    for param in list(self.module.parameters()):

        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)

        # handle = param.register_hook(allreduce_hook)
        # self.hooks.append(allreduce_hook)
        # self.hook_handles.append(handle)
    self.allreduce_params = allreduce_params

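# A minimal standalone sketch of the bucketed-allreduce pattern used above:
# gradients of the same dtype are flattened into one contiguous buffer,
# reduced in a single collective, then scattered back in place. Single-process
# demo under assumed conditions (no process group, so the division stands in
# for allreduce-plus-averaging across 2 ranks).
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.ones(3), torch.full((2, 2), 2.0)]
coalesced = _flatten_dense_tensors(grads)  # one flat buffer of 7 elements
coalesced /= 2                             # stand-in for all_reduce + scaling
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)                      # write reduced values back in place
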
def get_model(args, version=None):
    """Build the model."""
    print_rank_0('building Bert model ...')
    if version is None:
        model = BertMixtureModel(num_layers=args.num_layers,
                                 vocab_size=args.vocab_size,
                                 hidden_size=args.hidden_size,
                                 num_attention_heads=args.num_attention_heads,
                                 embedding_dropout_prob=args.hidden_dropout,
                                 attention_dropout_prob=args.attention_dropout,
                                 output_dropout_prob=args.hidden_dropout,
                                 layernorm_epsilon=args.layernorm_epsilon,
                                 max_sequence_length=args.max_position_embeddings,
                                 checkpoint_activations=args.checkpoint_activations,
                                 checkpoint_num_layers=args.checkpoint_num_layers,
                                 parallel_output=True,
                                 num_experts=args.num_experts,
                                 type_vocab_size=2)
    elif version == "v0":
        model = BertMixtureModel_v0(num_layers=args.num_layers,
                                    vocab_size=args.vocab_size,
                                    hidden_size=args.hidden_size,
                                    num_attention_heads=args.num_attention_heads,
                                    embedding_dropout_prob=args.hidden_dropout,
                                    attention_dropout_prob=args.attention_dropout,
                                    output_dropout_prob=args.hidden_dropout,
                                    layernorm_epsilon=args.layernorm_epsilon,
                                    max_sequence_length=args.max_position_embeddings,
                                    checkpoint_activations=args.checkpoint_activations,
                                    checkpoint_num_layers=args.checkpoint_num_layers,
                                    parallel_output=True,
                                    num_experts=args.num_experts,
                                    type_vocab_size=2)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model

def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
                            hidden_size_per_att_head, dropout_prob, batch_size,
                            sequence_length):
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
                                                    dropout_prob).cuda()
    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()

    # Forward
    input_ = identity_layer()
    output = attention_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()

    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer

def get_model(args):
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model

def build_multi_task_dataset(args, tokenizer):
    task_dirs = {"mnli": "MNLI", "cola": "CoLA", "mrpc": "MRPC",
                 "qnli": "QNLI", "qqp": "QQP", "sst2": "SST-2",
                 "agnews": "Agnews", "yelp-polarity": "yelp_review_polarity_csv",
                 "yelp-full": "yelp_review_full_csv", "yahoo": "Yahoo",
                 "squad": "SQuAD", "race": "RACE"}
    train, valid = None, None
    if mpu.get_model_parallel_rank() == 0:
        multi_seq_length = args.seq_length
        if args.multi_seq_length is not None:
            multi_seq_length = args.multi_seq_length
        train_datasets, valid_datasets = [], []
        for task in args.multi_task_data:
            task = task.lower()
            data_dir = os.path.join(args.data_dir, task_dirs[task])
            train_datasets.append(
                SuperGlueDataset(args, task, data_dir, multi_seq_length, "train",
                                 tokenizer, pattern_ensemble=True))
            valid_datasets.append(
                SuperGlueDataset(args, task, data_dir, multi_seq_length, "dev",
                                 tokenizer, pattern_ensemble=True))
        train = MultiTaskDataset(args.multi_task_data, train_datasets)
        valid = MultiTaskDataset(args.multi_task_data, valid_datasets)
        world_size = torch.distributed.get_world_size(
            group=mpu.get_data_parallel_group())
        multi_batch_size = args.batch_size * world_size
        if args.multi_batch_size is not None:
            multi_batch_size = args.multi_batch_size * world_size
        train = make_data_loader(train, tokenizer, multi_batch_size,
                                 args.train_iters, args, shuffle=True)
        valid = make_data_loader(valid, tokenizer, multi_batch_size,
                                 args.train_iters, args, shuffle=True)
    return train, valid

def get_train_val_test_data(args, tokenizer):
    """Load the data on rank zero and broadcast the data flags to all GPUs."""
    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        data_config = configure_data()
        if args.block_lm:
            data_set_type = "Block"
        elif args.transformer_xl:
            data_set_type = "GPT-XL"
        else:
            data_set_type = "GPT2"
        data_config.set_defaults(data_set_type=data_set_type, transpose=False)
        train_data, val_data, test_data = data_config.apply(args, tokenizer)
        data_counts = torch.cuda.LongTensor(
            [int(args.do_train), int(args.do_valid), int(args.do_test)])
    else:
        data_counts = torch.cuda.LongTensor([0, 0, 0])

    # Broadcast the do_train / do_valid / do_test flags.
    torch.distributed.broadcast(data_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    args.do_train = data_counts[0].item()
    args.do_valid = data_counts[1].item()
    args.do_test = data_counts[2].item()

    return train_data, val_data, test_data

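# Note on the pattern above (and in several functions below): only model
# parallel rank 0 touches the data loader; the resulting scalar flags are
# packed into a cuda LongTensor and broadcast from
# mpu.get_model_parallel_src_rank(), so every rank in the model parallel
# group ends up with identical args without duplicating the data pipeline.
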
def get_checkpoint_name(checkpoints_path, iteration, release=False, mp_rank=None):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(
                            mpu.get_model_parallel_rank()
                            if mp_rank is None else mp_rank),
                        'model_optim_rng.pt')

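# Example of the resulting layout (hypothetical values): iteration=5000 on
# model parallel rank 1 with release=False gives
#   <checkpoints_path>/iter_0005000/mp_rank_01/model_optim_rng.pt
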
def read_context(tokenizer, args, output):
    terminate_runs = 0
    raw_text = None
    if mpu.get_model_parallel_rank() == 0:
        while True:
            raw_text = input("\nContext prompt (stop to exit) >>> ")
            if not raw_text:
                print('Prompt should not be empty!')
                continue
            if raw_text == "stop":
                terminate_runs = 1
                break
            if args.hierarchical:
                raw_text = "Summary: " + raw_text
            output.write(raw_text)
            context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization
            context_length = len(context_tokens)
            if context_length >= args.seq_length:
                print("\nContext length", context_length,
                      "\nPlease give smaller context than the window length!")
                continue
            break
    else:
        context_length = 0

    terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
    torch.distributed.broadcast(terminate_runs_tensor,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    terminate_runs = terminate_runs_tensor[0].item()
    if terminate_runs == 1:
        return terminate_runs, raw_text, None, None

    context_length_tensor = torch.cuda.LongTensor([context_length])
    torch.distributed.broadcast(context_length_tensor,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    context_length = context_length_tensor[0].item()
    if mpu.get_model_parallel_rank() == 0:
        context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
    else:
        context_tokens_tensor = torch.cuda.LongTensor([0] * context_length)
    torch.distributed.broadcast(context_tokens_tensor,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    return terminate_runs, raw_text, context_tokens_tensor, context_length

def evaluate(data_loader, model, args, timers, num_iterations=None):
    """Evaluation."""
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_lm_loss = 0
    if num_iterations is not None:
        max_iters = num_iterations
    else:
        if mpu.get_model_parallel_rank() == 0:
            max_iters_gpu = torch.cuda.LongTensor([len(data_loader)])
        else:
            max_iters_gpu = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(max_iters_gpu,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        max_iters = max_iters_gpu[0].item()
    print_rank_0('global rank: {} | max iters: {}'.format(
        torch.distributed.get_rank(), max_iters))

    if data_loader is not None:
        data_iterator = iter(data_loader)
    else:
        data_iterator = None

    with torch.no_grad():
        iteration = 0
        while iteration < max_iters:
            if iteration % args.log_interval == 0:
                print_rank_0('global rank: {} | iteration: {}'.format(
                    torch.distributed.get_rank(), iteration))
            # Forward evaluation.
            lm_loss = forward_step(data_iterator, model, args, timers)
            if lm_loss is None:
                break
            # Reduce across processes.
            if isinstance(model, DDP):
                torch.distributed.all_reduce(lm_loss.data)
                if args.cloze_eval:
                    lm_loss.data = lm_loss.data / args.world_size
                else:
                    lm_loss.data = lm_loss.data / args.model_parallel_size
            if not args.cloze_eval:
                total_lm_loss += lm_loss.data.detach().float().item() / \
                    (args.num_tokenized_tokens - 1)
            else:
                total_lm_loss += lm_loss.data.detach().float().item()
            iteration += 1

    # Move model back to the train mode.
    model.train()
    return total_lm_loss

def get_checkpoint_name(checkpoints_path, iteration, release=False, zero=False):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
                        'model_optim_rng.pt')

def test_boradcast_data(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing broadcast_data with model parallel size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    key_size_t = {'key1': [7, 11],
                  'key2': [8, 2, 1],
                  'key3': [13],
                  'key4': [5, 1, 2],
                  'key5': [5, 12]}
    keys = list(key_size_t.keys())

    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    data['keyX'] = torch.FloatTensor(size=(5,)).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, total_numel = \
        data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups.
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

def sample_sequence(model, tokenizer, context_tokens_tensor, context_length,
                    args, device, mems=None, end_token=None):
    tokens, attention_mask, position_ids = get_batch(context_tokens_tensor,
                                                     device, args)

    counter = 0
    if mems is None:
        mems = []
    if end_token is None:
        end_token = args.eod_token
    org_context_length = context_length
    while counter < (args.out_seq_length - org_context_length):
        if counter == 0:
            logits, *mems = model(tokens, position_ids, attention_mask, *mems)
        else:
            index = org_context_length + counter
            logits, *mems = model(tokens[:, index - 1: index],
                                  tokens.new_ones((1, 1)) * (index - 1),
                                  tokens.new_ones(1, 1, 1, args.mem_length + 1,
                                                  device=tokens.device,
                                                  dtype=torch.float), *mems)
        logits = logits[:, -1]
        logits /= args.temperature
        logits = top_k_logits(logits, top_k=args.top_k, top_p=args.top_p)
        log_probs = F.softmax(logits, dim=-1)
        prev = torch.multinomial(log_probs, num_samples=1)[0]
        is_end = prev == end_token
        if is_end:
            break
        tokens = torch.cat((tokens, prev.view(1, 1)), dim=1)
        context_length += 1
        counter += 1
        if not args.hierarchical and mpu.get_model_parallel_rank() == 0 \
                and counter % 16 == 0:
            output_tokens_list = tokens.view(-1).contiguous()
            decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist())
            if mpu.get_model_parallel_rank() == 0 and (counter % 128 == 0 or is_end):
                os.system('clear')
                trim_decode_tokens = decode_tokens
                print(trim_decode_tokens, flush=True)

    output_tokens_list = tokens.view(-1).contiguous()
    return output_tokens_list, mems

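# top_k_logits is called by sample_sequence above but is not defined in this
# file; the following is a minimal sketch of the standard top-k / nucleus
# (top-p) filtering it is assumed to implement. The name
# top_k_top_p_filtering_sketch is hypothetical, not this repo's API.
import torch
import torch.nn.functional as F

def top_k_top_p_filtering_sketch(logits, top_k=0, top_p=0.0,
                                 filter_value=-float('inf')):
    if top_k > 0:
        # Mask every logit below the k-th largest one.
        threshold = torch.topk(logits, top_k)[0][..., -1, None]
        logits = torch.where(logits < threshold,
                             torch.full_like(logits, filter_value), logits)
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Drop tokens once cumulative probability exceeds top_p, shifting the
        # mask right by one so the first token over the threshold is kept.
        sorted_mask = cumulative_probs > top_p
        sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
        sorted_mask[..., 0] = False
        mask = sorted_mask.scatter(-1, sorted_indices, sorted_mask)
        logits = logits.masked_fill(mask, filter_value)
    return logits
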
def get_model(args):
    """Build the model."""
    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            if args.ds_type == 'BERT':
                model.module.model.bert.embeddings.position_embeddings.float()
            else:
                model.module.model.bert.embeddings.token_position_embeddings.float()
                model.module.model.bert.embeddings.para_position_embeddings.float()
                model.module.model.bert.embeddings.sent_position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model, device_ids=[i], output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model

def generate_samples(model, tokenizer, args, device):
    model.eval()
    output_path = "./samples"
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    output_path = os.path.join(
        output_path, f"sample-{datetime.now().strftime('%m-%d-%H-%M')}.txt")
    with torch.no_grad(), open(output_path, "w") as output:
        while True:
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
            terminate_runs, raw_text, context_tokens_tensor, context_length = \
                read_context(tokenizer, args, output)
            if terminate_runs == 1:
                return
            start_time = time.time()
            if args.block_lm:
                mems = []
                tokens, attention_mask, position_ids = get_batch(
                    context_tokens_tensor, device, args)
                mask_tokens = ['MASK', 'sMASK', 'gMASK'] if args.task_mask else ['MASK']
                mask_tokens = [tokenizer.get_command(token).Id for token in mask_tokens]
                end_tokens = [tokenizer.get_command('eop').Id, args.eod_token]
                mask_positions = []
                for token in mask_tokens:
                    mask_positions += (context_tokens_tensor == token).nonzero(
                        as_tuple=True)[0].tolist()
                mask_positions.sort()
                if args.no_block_position:
                    for mask_position in mask_positions:
                        position_ids[0, mask_position + 1:] += args.out_seq_length
                _, *mems = model(tokens, position_ids, attention_mask, *mems)
                for mask_position in mask_positions:
                    if args.no_block_position:
                        position = position_ids[0, mask_position].item()
                    else:
                        position = mask_position
                    tokens, mems = sample_sequence(model, tokenizer, tokens,
                                                   position, args, device,
                                                   mems=mems, end_tokens=end_tokens)
            else:
                tokens, _ = sample_sequence(model, tokenizer, context_tokens_tensor,
                                            context_length, args, device)
            output_tokens_list = tokens.view(-1).contiguous()
            if mpu.get_model_parallel_rank() == 0:
                os.system('clear')
                print("\nTime taken: {:.2f}s\n".format(time.time() - start_time),
                      flush=True)
                print("\nContext:", raw_text, flush=True)
                decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist())
                trim_decode_tokens = decode_tokens
                print("\nGLM:", trim_decode_tokens, flush=True)
                output.write(trim_decode_tokens + "\n")
            torch.distributed.barrier(group=mpu.get_model_parallel_group())

def get_train_val_test_data(args):
    """Load the data on rank zero and broadcast number of tokens to all GPUs."""
    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        if args.use_npy_data_loader:
            (train_data, val_data, test_data), num_tokens, \
                eod_token = make_gpt2_dataloaders(args)
        else:
            data_config = configure_data()
            data_config.set_defaults(data_set_type='GPT2', transpose=False)
            (train_data, val_data, test_data), tokenizer = data_config.apply(args)
            num_tokens = tokenizer.num_tokens
            eod_token = tokenizer.get_command('eos').Id
            assert eod_token == tokenizer.get_command('pad').Id
        before = num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by * \
            mpu.get_model_parallel_world_size()
        while (after % multiple) != 0:
            after += 1
        print_rank_0('> padded vocab (size: {}) with {} dummy '
                     'tokens (new size: {})'.format(before, after - before, after))
        print_rank_0('> found end-of-document token: {}'.format(eod_token))
        token_counts = torch.cuda.LongTensor([after, eod_token,
                                              int(args.do_train),
                                              int(args.do_valid),
                                              int(args.do_test)])
    else:
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    eod_token = token_counts[1].item()
    args.do_train = token_counts[2].item()
    args.do_valid = token_counts[3].item()
    args.do_test = token_counts[4].item()

    return train_data, val_data, test_data, num_tokens, eod_token

def get_model(args):
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      max_memory_length=args.mem_length,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      relative_encoding=args.transformer_xl)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if hasattr(args, "deepspeed") and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed:
        if USE_TORCH_DDP:
            i = torch.cuda.current_device()
            model = DDP(model, device_ids=[i], output_device=i,
                        process_group=mpu.get_data_parallel_group())
        else:
            model = DDP(model)

    return model

def generate_samples(model, tokenizer, args, device):
    model.eval()
    output_path = "./samples"
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    output_path = os.path.join(
        output_path, f"sample-{datetime.now().strftime('%m-%d-%H-%M')}.txt")
    with torch.no_grad(), open(output_path, "w") as output:
        while True:
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
            terminate_runs, raw_text, context_tokens_tensor, context_length = \
                read_context(tokenizer, args, output)
            if terminate_runs == 1:
                return
            start_time = time.time()
            output_tokens_list, _ = sample_sequence(model, tokenizer,
                                                    context_tokens_tensor,
                                                    context_length, args, device)
            if args.hierarchical:
                eop_token = tokenizer.get_command('eop').Id
                if output_tokens_list[-1] == eop_token:
                    output_tokens_list = output_tokens_list[:-1]
                decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist())
                trim_decode_tokens = decode_tokens[9:]  # strip the "Summary: " prefix
                print("Summary:", trim_decode_tokens)
                keys = nltk.tokenize.sent_tokenize(trim_decode_tokens)
                context, mems = "", []
                for i, key in enumerate(keys):
                    if i > 0 and not context.endswith(" "):
                        key = " " + key
                    context_tokens = tokenizer.EncodeAsIds(key).tokenization
                    context_length = len(context_tokens)
                    context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
                    output_tokens_list, mems = sample_sequence(
                        model, tokenizer, context_tokens_tensor, context_length,
                        args, device, end_token=eop_token, mems=mems)
                    decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist())
                    context += decode_tokens
                    print(context)
            else:
                if mpu.get_model_parallel_rank() == 0:
                    os.system('clear')
                    print("\nTime taken: {:.2f}s\n".format(time.time() - start_time),
                          flush=True)
                    print("\nContext:", raw_text, flush=True)
                    decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist())
                    trim_decode_tokens = decode_tokens[len(raw_text):]
                    print("\nGPT2:", trim_decode_tokens, flush=True)
                    output.write(trim_decode_tokens + "\n")
            torch.distributed.barrier(group=mpu.get_model_parallel_group())

def mix_forward_step(batch_and_dataloader, model, args, times, mems):
    use_blocklm = 0
    if args.block_lm_ratio > 0.0:
        if mpu.get_model_parallel_rank() == 0:
            if random.random() > 1 / (1 + args.block_lm_ratio):
                use_blocklm = 1
        use_blocklm = torch.cuda.LongTensor([use_blocklm])
        torch.distributed.broadcast(use_blocklm,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        use_blocklm = use_blocklm.item()
    if use_blocklm:
        return lm_forward_step((batch_and_dataloader[1], None), model, args,
                               times, mems)
    else:
        return finetune_forward_step(batch_and_dataloader[0], model, args,
                                     times, mems)

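# Note on the broadcast above: the random draw happens only on model parallel
# rank 0 and is then broadcast, so every rank in the group takes the same
# branch. With block_lm_ratio = 1.0, P(use_blocklm) = 1 - 1/(1+1) = 0.5,
# i.e. an expected 50/50 mix of block-LM and finetuning batches.
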
def get_model(args):
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model, device_ids=[i], output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model

def get_train_val_test_data(args):
    """Load the data on rank zero and broadcast number of tokens to all GPUs."""
    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        data_config = configure_data()
        ds_type = 'BERT'
        data_config.set_defaults(data_set_type=ds_type, transpose=False)
        (train_data, val_data, test_data), tokenizer = data_config.apply(args)
        before = tokenizer.num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by * \
            mpu.get_model_parallel_world_size()
        while (after % multiple) != 0:
            after += 1
        print_rank_0('> padded vocab (size: {}) with {} dummy '
                     'tokens (new size: {})'.format(before, after - before, after))
        # Need to broadcast num_tokens and num_type_tokens.
        token_counts = torch.cuda.LongTensor([after,
                                              tokenizer.num_type_tokens,
                                              int(args.do_train),
                                              int(args.do_valid),
                                              int(args.do_test)])
    else:
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    num_type_tokens = token_counts[1].item()
    args.do_train = token_counts[2].item()
    args.do_valid = token_counts[3].item()
    args.do_test = token_counts[4].item()

    return train_data, val_data, test_data, num_tokens, num_type_tokens

def test_get_model_parallel_src_rank(model_parallel_size_):

    if torch.distributed.get_rank() == 0:
        print('> testing get_model_parallel_src_rank with size {} ...'.format(
            model_parallel_size_))
    model_parallel_size = min(model_parallel_size_,
                              torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank()
    assert mpu.get_model_parallel_src_rank() == src_rank

    # Reset groups.
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

def get_model(args):
    """Build the model."""
    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            model.module.model.bert.embeddings.position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model

def test_initialize_model_parallel(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            model_parallel_size))
    model_parallel_size_ = min(model_parallel_size,
                               torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = model_parallel_size_
    rank = torch.distributed.get_rank() % model_parallel_size_
    assert world_size == mpu.get_model_parallel_world_size()
    assert rank == mpu.get_model_parallel_rank()
    check(mpu.get_model_parallel_group(), world_size, rank)

    # Data parallel.
    world_size = torch.distributed.get_world_size() // model_parallel_size_
    rank = torch.distributed.get_rank() // model_parallel_size_
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)

    # Reset groups.
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

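# Rank layout assumed by the checks above (hypothetical 8 GPUs, model parallel
# size 2): model parallel groups are consecutive ranks [0,1] [2,3] [4,5] [6,7],
# data parallel groups are strided ranks [0,2,4,6] [1,3,5,7]; hence
# model_parallel_rank = rank % 2 and data_parallel_rank = rank // 2.
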
def get_model(args, config, do_fp16=False):
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(**config,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if args.deepspeed and do_fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if do_fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model

def test_model_parallel_cuda_manual_seed(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        assert torch.cuda.initial_seed() == (12345 + 2718 +
                                             mpu.get_model_parallel_rank())

    # Reset the tracker.
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups.
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

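# The constant 2718 checked above is the model-parallel seed offset: the
# default CUDA rng keeps the user-provided seed, while the rng tracker's
# model-parallel state forks from seed + 2718 + model_parallel_rank, so that
# e.g. dropout masks differ across model parallel ranks while the default
# stream stays identical.
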
def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
                         hidden_size_per_att_head, batch_size, sequence_length):

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads
    intermediate_size = 4 * hidden_size

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    transformer_layer = mpu.BertParallelTransformerLayer(
        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
        torch.nn.functional.relu, 1.0e-5).cuda()

    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()

    # Forward
    input_ = identity_layer()
    output = transformer_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()

    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer