def evaluate(model, dataloader, eval_metric, args):
    """Evaluation."""
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_output, total_count = 0.0, 0
    total_tokens = 0
    with torch.no_grad():
        # For all the batches in the dataset.
        for iteration, batch in enumerate(dataloader):
            if (iteration + 1) % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(iteration))
            # Forward evaluation.
            output, _, _ = lm_forward_step(batch, model, args, None, [],
                                           eval_metric=eval_metric)
            count = batch['text'].size(0)
            count = torch.cuda.LongTensor([count])
            # Reduce across processes.
            torch.distributed.all_reduce(output,
                                         group=mpu.get_data_parallel_group())
            torch.distributed.all_reduce(count,
                                         group=mpu.get_data_parallel_group())
            total_output += output.item()
            total_count += count.item()
            total_tokens += batch['loss_mask'].sum().item()
    print(total_tokens)
    return {eval_metric: total_output}, total_count
def make_data_loader(dataset, batch_size, args):
    shuffle = args.shuffle
    if shuffle:
        sampler = data_utils.samplers.RandomSampler(
            dataset, replacement=True,
            num_samples=batch_size * args.train_iters)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    drop_last = distributed

    if distributed:
        batch_sampler = data_utils.samplers.DistributedBatchSampler(
            sampler, batch_size, drop_last, rank, world_size)
    else:
        batch_sampler = torch.utils.data.BatchSampler(
            sampler, batch_size, drop_last)

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=args.num_workers,
                                              pin_memory=True)
    return data_loader
def make_data_loader(dataset, tokenizer, batch_size, num_iters, args):
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    if args.transformer_xl:
        batch_sampler = data_utils.samplers.DistributedSequentialSampler(
            len(dataset), num_iters, batch_size, rank, world_size)
    else:
        shuffle = args.shuffle
        if shuffle:
            sampler = data_utils.samplers.RandomSampler(
                dataset, replacement=True,
                num_samples=batch_size * args.train_iters)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)
        drop_last = distributed
        # the GPUs in the same model parallel group receive the same data
        if distributed:
            batch_sampler = data_utils.samplers.DistributedBatchSampler(
                sampler, batch_size, drop_last, rank, world_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps)
        else:
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last)
    use_block = args.block_lm or args.encoder_decoder
    if use_block:
        strategy = ConstructBlockStrategy(
            args, tokenizer, args.max_position_embeddings,
            bert_prob=args.bert_prob,
            gap_sentence_prob=args.gap_sentence_prob,
            gpt_infill_prob=args.gpt_infill_prob,
            average_block_length=args.avg_block_length,
            gpt_min_ratio=args.gpt_min_ratio,
            block_mask_prob=args.block_mask_prob,
            context_mask_ratio=args.context_mask_ratio,
            shuffle_blocks=not args.no_shuffle_block,
            block_position_encoding=not args.no_block_position,
            sentinel_token=args.sentinel_token,
            encoder_decoder=args.encoder_decoder,
            task_mask=args.task_mask,
            random_position=args.random_position,
            masked_lm=args.masked_lm)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        collate_fn=strategy.construct_blocks if use_block else None)
    return data_loader
def evaluate_ocnli(model, dev_dataloader, device, args):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(dev_dataloader):
            tokens_1, masks_1, tokens_2, masks_2, tokens_3, masks_3, labels = [
                x.to(device) for x in batch]

            tokens, attention_mask, position_ids = get_batch(tokens_1, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(
                output[:, :-1, :].contiguous().float(), tokens[:, 1:])
            output_1 = torch.sum(losses * masks_1, 1) / torch.sum(masks_1, -1)
            tensor_list = [torch.zeros_like(output_1)
                           for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output_1,
                                         mpu.get_data_parallel_group())
            output_1 = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()
            # --------------
            tokens, attention_mask, position_ids = get_batch(tokens_2, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(
                output[:, :-1, :].contiguous().float(), tokens[:, 1:])
            output_2 = torch.sum(losses * masks_2, 1) / torch.sum(masks_2, -1)
            tensor_list = [torch.zeros_like(output_2)
                           for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output_2,
                                         mpu.get_data_parallel_group())
            output_2 = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()
            # ---------------
            tokens, attention_mask, position_ids = get_batch(tokens_3, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(
                output[:, :-1, :].contiguous().float(), tokens[:, 1:])
            output_3 = torch.sum(losses * masks_3, 1) / torch.sum(masks_3, -1)
            tensor_list = [torch.zeros_like(output_3)
                           for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output_3,
                                         mpu.get_data_parallel_group())
            output_3 = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()
            # --------------
            tensor_list_labels = [torch.zeros_like(labels)
                                  for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list_labels, labels,
                                         mpu.get_data_parallel_group())

            if torch.distributed.get_rank() == 0:
                labels = torch.stack(tensor_list_labels, 0)
                labels = labels.view(-1).cpu().detach().numpy()
                res = [np.argmin(np.array(x))
                       for x in zip(output_1, output_2, output_3)]
                res = [x == y for x, y in zip(res, labels)]
                correct += sum(res)
                total += len(res)

    if torch.distributed.get_rank() == 0:
        print("EVAL", correct, total)
def evaluate(model, dev_dataloader, all_labels, device, args):
    model.eval()
    if torch.distributed.get_rank() == 0:
        res = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dev_dataloader):
            tokens, masks = [x.to(device) for x in batch]
            tokens, attention_mask, position_ids = get_batch(tokens, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(
                output[:, :-1, :].contiguous().float(), tokens[:, 1:])
            output = torch.sum(losses * masks, 1) / torch.sum(masks, -1)
            tensor_list = [torch.zeros_like(output)
                           for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output,
                                         mpu.get_data_parallel_group())
            output = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()
            if torch.distributed.get_rank() == 0:
                for v in output:
                    res.append(v)

    if torch.distributed.get_rank() == 0:
        cnt = 0
        label_size = max(all_labels) + 1
        num_inst = len(res) // label_size
        for x in range(num_inst):
            label = all_labels[x]
            cur_res = res[x * label_size:(x + 1) * label_size]
            pos = np.argmin(cur_res)
            if pos == label:
                cnt += 1
        print("EVAL", cnt, num_inst)
def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    self.data_parallel_group = mpu.get_data_parallel_group()
    src_rank = mpu.get_model_parallel_rank()
    for p in self.module.parameters():
        if torch.is_tensor(p):
            dist.broadcast(p, src_rank, group=self.data_parallel_group)

    def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for name, param in self.module.named_parameters():
                if param.requires_grad and param.grad is not None:
                    tp = (param.data.type())
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                if fp32_allreduce:
                    coalesced = coalesced.float()
                if not no_scale and not reduce_after:
                    coalesced /= dist.get_world_size(group=self.data_parallel_group)
                dist.all_reduce(coalesced, group=self.data_parallel_group)
                torch.cuda.synchronize()
                if not no_scale and reduce_after:
                    coalesced /= dist.get_world_size(group=self.data_parallel_group)
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    self.hook_handles = []
    self.hooks = []
    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        # handle = param.register_hook(allreduce_hook)
        # self.hooks.append(allreduce_hook)
        # self.hook_handles.append(handle)
    self.allreduce_params = allreduce_params
def build_multi_task_dataset(args, tokenizer):
    task_dirs = {"mnli": "MNLI", "cola": "CoLA", "mrpc": "MRPC", "qnli": "QNLI", "qqp": "QQP", "sst2": "SST-2",
                 "agnews": "Agnews", "yelp-polarity": "yelp_review_polarity_csv",
                 "yelp-full": "yelp_review_full_csv",
                 "yahoo": "Yahoo", "squad": "SQuAD", "race": "RACE"}
    train, valid = None, None
    if mpu.get_model_parallel_rank() == 0:
        multi_seq_length = args.seq_length
        if args.multi_seq_length is not None:
            multi_seq_length = args.multi_seq_length
        train_datasets, valid_datasets = [], []
        for task in args.multi_task_data:
            task = task.lower()
            data_dir = os.path.join(args.data_dir, task_dirs[task])
            train_datasets.append(
                SuperGlueDataset(args, task, data_dir, multi_seq_length, "train", tokenizer,
                                 pattern_ensemble=True))
            valid_datasets.append(
                SuperGlueDataset(args, task, data_dir, multi_seq_length, "dev", tokenizer,
                                 pattern_ensemble=True))
        train = MultiTaskDataset(args.multi_task_data, train_datasets)
        valid = MultiTaskDataset(args.multi_task_data, valid_datasets)
    world_size = torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
    multi_batch_size = args.batch_size * world_size
    if args.multi_batch_size is not None:
        multi_batch_size = args.multi_batch_size * world_size
    train = make_data_loader(train, tokenizer, multi_batch_size, args.train_iters, args, shuffle=True)
    valid = make_data_loader(valid, tokenizer, multi_batch_size, args.train_iters, args, shuffle=True)
    return train, valid
def get_model(args, version=None):
    """Build the model."""

    print_rank_0('building Bert model ...')
    if version is None:
        model = BertMixtureModel(num_layers=args.num_layers,
                                 vocab_size=args.vocab_size,
                                 hidden_size=args.hidden_size,
                                 num_attention_heads=args.num_attention_heads,
                                 embedding_dropout_prob=args.hidden_dropout,
                                 attention_dropout_prob=args.attention_dropout,
                                 output_dropout_prob=args.hidden_dropout,
                                 layernorm_epsilon=args.layernorm_epsilon,
                                 max_sequence_length=args.max_position_embeddings,
                                 checkpoint_activations=args.checkpoint_activations,
                                 checkpoint_num_layers=args.checkpoint_num_layers,
                                 parallel_output=True,
                                 num_experts=args.num_experts,
                                 type_vocab_size=2)
    elif version == "v0":
        model = BertMixtureModel_v0(num_layers=args.num_layers,
                                    vocab_size=args.vocab_size,
                                    hidden_size=args.hidden_size,
                                    num_attention_heads=args.num_attention_heads,
                                    embedding_dropout_prob=args.hidden_dropout,
                                    attention_dropout_prob=args.attention_dropout,
                                    output_dropout_prob=args.hidden_dropout,
                                    layernorm_epsilon=args.layernorm_epsilon,
                                    max_sequence_length=args.max_position_embeddings,
                                    checkpoint_activations=args.checkpoint_activations,
                                    checkpoint_num_layers=args.checkpoint_num_layers,
                                    parallel_output=True,
                                    num_experts=args.num_experts,
                                    type_vocab_size=2)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def evaluate_tnews(args, model, dataloader, device, mode="dev"):
    model.eval()
    all_truth, all_preds = [], []
    with torch.no_grad():
        for batch, no_model_batch in tqdm(dataloader, desc="Evaluating {}".format(mode),
                                          disable=(torch.distributed.get_rank() != 0)):
            for k in batch:
                batch[k] = batch[k].to(device)
            for k in no_model_batch:
                no_model_batch[k] = no_model_batch[k].to(device)

            output = model(**batch)
            output = torch.sum(output * no_model_batch["loss_mask"].unsqueeze(-1), 1) / torch.sum(
                no_model_batch["loss_mask"], -1).unsqueeze(-1)

            # gather the output logits from other gpus
            tensor_list = [torch.zeros_like(output) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output, mpu.get_data_parallel_group())

            # gather the truth labels from other gpus
            tensor_list_truth = [torch.zeros_like(no_model_batch["truth"], dtype=torch.long)
                                 for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list_truth, no_model_batch["truth"],
                                         mpu.get_data_parallel_group())

            if args.model_parallel_size == 1:
                scores = torch.stack(tensor_list, 0).view(-1, 30000)
            else:
                assert args.model_parallel_size == 2, "Now, we only support model parallel <= 2"
                # For convenience of implementation. Note that the truth labels only appear in the
                # first 15000 entries of the logits, e.g. on rank 0, 2, 4, ...
                scores = torch.stack(tensor_list, 0).view(-1, 15000)
            truth = torch.stack(tensor_list_truth, 0)
            truth = truth.view(-1)
            # scores = scores[:, cand_ids]
            preds = torch.argmax(scores, dim=-1)

            all_truth.extend(truth.detach().cpu().tolist())
            all_preds.extend(preds.detach().cpu().tolist())

    acc = sum([int(p == l) for p, l in zip(all_preds, all_truth)]) / len(all_truth)
    acc = torch.tensor(acc).to(device)

    acc_list = [torch.zeros_like(acc) for _ in range(mpu.get_model_parallel_world_size())]
    torch.distributed.all_gather(acc_list, acc, mpu.get_model_parallel_group())

    return acc_list[0].item(), all_truth, all_preds
def make_data_loader(dataset, batch_size, args):
    # shuffle = args.shuffle
    # if shuffle:
    #     sampler = data_utils.samplers.RandomSampler(dataset, replacement=True,
    #                                                 num_samples=batch_size * args.train_iters)
    # else:
    #     sampler = torch.utils.data.SequentialSampler(dataset)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    drop_last = distributed
    # if distributed:
    #     batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler,
    #                                                                 batch_size,
    #                                                                 drop_last,
    #                                                                 rank,
    #                                                                 world_size)
    # else:
    #     batch_sampler = torch.utils.data.BatchSampler(sampler,
    #                                                   batch_size,
    #                                                   drop_last)
    # data_loader = torch.utils.data.DataLoader(dataset,
    #                                           batch_sampler=batch_sampler,
    #                                           num_workers=args.num_workers,
    #                                           pin_memory=True)
    ###################
    data_loader = torch.utils.data.DataLoader(
        dataset[rank] if len(dataset) == world_size else dataset[0],
        batch_size=batch_size,
        num_workers=args.num_workers,
        pin_memory=False,
        drop_last=drop_last,
        timeout=5,
        persistent_workers=True)
    return data_loader
def make_data_loader(dataset, batch_size, args):
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    if args.transformer_xl:
        batch_sampler = data_utils.samplers.DistributedSequentialSampler(
            len(dataset), args.train_iters, batch_size, rank, world_size)
    else:
        shuffle = args.shuffle
        if shuffle:
            sampler = data_utils.samplers.RandomSampler(
                dataset, replacement=True,
                num_samples=batch_size * args.train_iters)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)
        drop_last = distributed
        # the GPUs in the same model parallel group receive the same data
        if distributed:
            batch_sampler = data_utils.samplers.DistributedBatchSampler(
                sampler, batch_size, drop_last, rank, world_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps)
        else:
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=args.num_workers,
                                              pin_memory=True)
    return data_loader
def get_model(args):
    """Build the model."""

    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            if args.ds_type == 'BERT':
                model.module.model.bert.embeddings.position_embeddings.float()
            else:
                model.module.model.bert.embeddings.token_position_embeddings.float()
                model.module.model.bert.embeddings.para_position_embeddings.float()
                model.module.model.bert.embeddings.sent_position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model, device_ids=[i], output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      max_memory_length=args.mem_length,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      relative_encoding=args.transformer_xl)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if hasattr(args, "deepspeed") and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed:
        if USE_TORCH_DDP:
            i = torch.cuda.current_device()
            model = DDP(model, device_ids=[i], output_device=i,
                        process_group=mpu.get_data_parallel_group())
        else:
            model = DDP(model)

    return model
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model, device_ids=[i], output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
def backward_step(optimizer, model, lm_loss, args, timers):
    """Backward step."""

    # Total loss.
    loss = lm_loss

    # Backward pass.
    if args.deepspeed:
        model.backward(loss)
    else:
        # optimizer.zero_grad()
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()

    reduced_losses = lm_loss.view(1)
    torch.distributed.all_reduce(reduced_losses.data,
                                 group=mpu.get_data_parallel_group())
    reduced_losses.data = reduced_losses.data / (args.world_size / args.model_parallel_size)
    lm_loss_reduced = reduced_losses

    if args.deepspeed:
        # DeepSpeed backward propagation already addressed all-reduce communication.
        # Reset the timer to avoid breaking timer logs below.
        timers('allreduce').reset()
    else:
        if not args.DDP_impl == 'torch':
            timers('allreduce').start()
            model.allreduce_params(reduce_after=False,
                                   fp32_allreduce=args.fp32_allreduce)
            timers('allreduce').stop()

    # Update master gradients.
    if not args.deepspeed:
        if args.fp16:
            optimizer.update_master_grads()

        # Clipping gradients helps prevent the exploding gradient.
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)

    return lm_loss_reduced
def get_model(args):
    """Build the model."""

    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            model.module.model.bert.embeddings.position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def get_model(args):
    """Build the model."""

    print_rank_0('building CPM model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def test_initialize_model_parallel(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            model_parallel_size))
    model_parallel_size_ = min(model_parallel_size,
                               torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = model_parallel_size_
    rank = torch.distributed.get_rank() % model_parallel_size_
    assert world_size == mpu.get_model_parallel_world_size()
    assert rank == mpu.get_model_parallel_rank()
    check(mpu.get_model_parallel_group(), world_size, rank)

    # Data parallel. Note: use the clamped size here as well, so the rank
    # arithmetic stays consistent when model_parallel_size exceeds world size.
    world_size = torch.distributed.get_world_size() // model_parallel_size_
    rank = torch.distributed.get_rank() // model_parallel_size_
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
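# Illustrative sketch (not part of the test above): how Megatron-style grouping is
# expected to partition ranks, assuming adjacent ranks form a model-parallel group
# and data-parallel groups are the strided complements. The helper name below is
# hypothetical and only serves to make the rank arithmetic concrete.
def sketch_parallel_groups(world_size, model_parallel_size):
    # model-parallel groups: consecutive blocks of ranks
    model_groups = [list(range(start, start + model_parallel_size))
                    for start in range(0, world_size, model_parallel_size)]
    # data-parallel groups: ranks occupying the same position inside their block
    data_groups = [list(range(offset, world_size, model_parallel_size))
                   for offset in range(model_parallel_size)]
    return model_groups, data_groups

# Example: world_size=8, model_parallel_size=2 gives
#   model groups [[0, 1], [2, 3], [4, 5], [6, 7]]
#   data groups  [[0, 2, 4, 6], [1, 3, 5, 7]]
# which matches the rank // size and rank % size checks in test_initialize_model_parallel.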
def get_model(args, config, do_fp16=False):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(**config,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and do_fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if do_fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # get the tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(args.tokenizer_path, 'vocab.json'),
                              os.path.join(args.tokenizer_path, 'chinese_vocab.model'))

    # load train data
    if args.do_train:
        train_dataloader, _ = load_data(args, 'train', tokenizer, 1)
        dev_dataloader, dev_dataset = load_data(args, 'dev', tokenizer, 1)
        with open(args.deepspeed_config, "r") as f:
            deepspeed_conf = json.load(f)
        epoch = args.epoch
        grad_acc = deepspeed_conf["gradient_accumulation_steps"]
        args.train_iters = len(train_dataloader) * epoch / grad_acc

        # Model, optimizer, and learning rate.
        # TODO: maybe need to reinitialize optimizer
    elif args.do_eval:
        # Set an arbitrary positive integer since the optimizer and the scheduler
        # will not be used when doing eval.
        args.train_iters = 1

    model, optimizer, lr_scheduler = setup_model_and_optimizer_C(args)
    device = torch.cuda.current_device()

    # give a timestamp to the model
    cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    results_dir = os.path.join(args.results_dir, "{}-{}".format(args.model_name, cur_time))
    os.makedirs(results_dir, exist_ok=True)

    if args.do_train and torch.distributed.get_rank() == 0:
        with open(os.path.join(results_dir, "train_log.txt"), "w") as f:
            f.write("Train losses:\n")
        with open(os.path.join(results_dir, "dev_log.txt"), "w") as f:
            f.write("Dev accs:\n")

    torch.distributed.barrier()

    if args.do_train:
        # cand_ids = torch.tensor(dev_dataset.cand_ids).to(device)
        total_loss, logging_loss, best_acc = 0.0, 0.0, 0.0
        global_step, total_step, best_step = 0, 0, 0
        for e in range(epoch):
            model.train()
            for batch, no_model_batch in tqdm(train_dataloader, disable=(torch.distributed.get_rank() != 0)):
                for k in batch:
                    batch[k] = batch[k].to(device)
                for k in no_model_batch:
                    no_model_batch[k] = no_model_batch[k].to(device)

                output = model(**batch)
                # get the loss of the last token
                output = torch.sum(output * no_model_batch["loss_mask"].unsqueeze(-1), 1) / torch.sum(
                    no_model_batch["loss_mask"], -1).unsqueeze(-1)
                # get the label of the last token
                # labels = no_model_batch["labels"].float()
                labels = no_model_batch["truth"].float()
                # labels = (torch.sum(labels * no_model_batch["loss_mask"], 1) / torch.sum(
                #     no_model_batch["loss_mask"], -1)).long()

                # cross_entropy loss
                # losses = mpu.vocab_parallel_cross_entropy(output.unsqueeze(1).contiguous().float(),
                #                                           labels.unsqueeze(1))
                losses = CrossEntropyLoss(output.unsqueeze(1).contiguous().float(), labels.unsqueeze(1))
                loss = torch.mean(losses)

                model.backward(loss)
                model.step()

                torch.distributed.all_reduce(loss.data, group=mpu.get_data_parallel_group())
                loss.data = loss.data / mpu.get_data_parallel_world_size()
                total_loss += loss.item() / grad_acc

                if total_step % grad_acc == 0:
                    global_step += 1
                    if global_step != 0 and global_step % args.log_interval == 0:
                        # logging
                        if torch.distributed.get_rank() == 0:
                            train_log = "Epoch {}, global step {}, total step {}, train lm loss: {}".format(
                                e, global_step, epoch * len(train_dataloader),
                                (total_loss - logging_loss) / args.log_interval)
                            yprint(train_log)
                            with open(os.path.join(results_dir, "train_log.txt"), "a") as f:
                                f.write(train_log + "\n")
                        logging_loss = total_loss

                    if global_step != 0 and global_step % args.eval_interval == 0:
                        # evaluate on the dev set
                        acc, _, _ = evaluate_tnews(args, model, dev_dataloader, device, mode="dev")
                        dev_results_dir = os.path.join(results_dir, "dev_step-{}".format(global_step))

                        if acc > best_acc:
                            best_acc = acc
                            best_step = global_step

                        if torch.distributed.get_rank() == 0:
                            # we will only write the log file once
                            dev_log = "Epoch: {}, Global step: {}, Acc: {}".format(e, global_step, acc)
                            yprint(dev_log)
                            os.makedirs(dev_results_dir, exist_ok=True)
                            with open(os.path.join(dev_results_dir, "dev_result.txt"), "w") as f:
                                f.write(dev_log + "\n")
                            with open(os.path.join(results_dir, "dev_log.txt"), "a") as f:
                                f.write(dev_log + "\n")

                        torch.distributed.barrier()

                        args.save = dev_results_dir
                        save_checkpoint(global_step, model, optimizer, lr_scheduler, args)

                total_step += 1

        with open(os.path.join(dev_results_dir, "dev_log.txt"), "a") as f:
            f.write("Best acc: {} Best step: {}\n".format(best_acc, best_step))

    if args.do_eval:
        # evaluate on the test set
        test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1)
        cand_ids = torch.tensor(test_dataset.cand_ids).to(device)

        if args.do_train:
            # if doing training, evaluate the checkpoint with the best acc on the dev set.
            eval_ckpt_path = os.path.join(results_dir, "dev_step-{}".format(best_step))
            args.load = eval_ckpt_path
        else:
            # if only doing eval, evaluate the checkpoint specified by the user.
            args.load = args.eval_ckpt_path
        load_checkpoint(model=model, optimizer=None, lr_scheduler=None, args=args)

        acc, _, _ = evaluate(args, model, test_dataloader, cand_ids, device, mode="test")

        if torch.distributed.get_rank() == 0:
            eval_log = "Checkpoint from {}: Acc: {}".format(args.load, acc)
            yprint(eval_log)
            with open(os.path.join(results_dir, "eval_log"), "w") as f:
                f.write(eval_log + "\n")

    torch.distributed.barrier()
def main():
    """Main zero-shot evaluation program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # get the tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(args.tokenizer_path, 'vocab.json'),
                              os.path.join(args.tokenizer_path, 'chinese_vocab.model'))

    # load data
    test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1)
    # Set an arbitrary positive integer since the optimizer and the scheduler
    # will not be used when doing eval.
    args.train_iters = 1

    # Model
    model, _, _ = setup_model_and_optimizer(args)
    device = torch.cuda.current_device()

    # give a timestamp to the model
    cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    results_dir = os.path.join(args.results_dir, "{}-{}".format(args.model_name, cur_time))
    if torch.distributed.get_rank() == 0:
        os.makedirs(results_dir, exist_ok=True)

    model.eval()
    all_sids = []
    all_cids = []
    all_losses = []
    with torch.no_grad():
        for batch, no_model_batch in tqdm(test_dataloader, desc="Evaluating",
                                          disable=(torch.distributed.get_rank() != 0)):
            for k in batch:
                batch[k] = batch[k].to(device)
            for k in no_model_batch:
                no_model_batch[k] = no_model_batch[k].to(device)

            output = model(**batch)
            losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(), no_model_batch["labels"])
            loss_mask = no_model_batch["loss_mask"]
            loss = torch.sum(losses * loss_mask, dim=-1) / loss_mask.sum(dim=-1)

            loss_tensor_list = [torch.zeros_like(loss).to(device)
                                for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(loss_tensor_list, loss.data, group=mpu.get_data_parallel_group())
            all_losses.extend(loss_tensor_list)

            sids = no_model_batch["sids"]
            sid_tensor_list = [torch.zeros_like(sids) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(sid_tensor_list, sids.data, group=mpu.get_data_parallel_group())
            all_sids.extend(sid_tensor_list)

            cids = no_model_batch["cids"]
            cid_tensor_list = [torch.zeros_like(cids) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(cid_tensor_list, cids.data, group=mpu.get_data_parallel_group())
            all_cids.extend(cid_tensor_list)

    if torch.distributed.get_rank() == 0:
        all_losses = torch.stack(all_losses).view(-1).cpu().detach().numpy()
        all_sids = torch.stack(all_sids).view(-1).cpu().detach().numpy()
        all_cids = torch.stack(all_cids).view(-1).cpu().detach().numpy()

        truth_labels = test_dataset.truth_labels
        preds = [[] for _ in truth_labels]
        for sid, cid, loss in zip(all_sids, all_cids, all_losses):
            preds[sid].append((cid, loss))
        preds = [min(p, key=lambda x: x[1])[0] for p in preds if len(p) > 0]

        yprint("Acc: {}".format(
            sum([int(p == l) for p, l in zip(preds, truth_labels)]) / len(truth_labels)))
        with open(os.path.join(results_dir, "zero-shot_result.txt"), "w") as f:
            f.write("Acc: {}\n".format(
                sum([int(p == l) for p, l in zip(preds, truth_labels)]) / len(truth_labels)))

    torch.distributed.barrier()
def evaluate(data_iterator, model, args, timers, forward_step_func, verbose=False):
    """Evaluation."""

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_lm_loss, total_gpt_loss, total_bert_loss, total_sent_loss, total_multi_loss = 0, 0, 0, 0, 0
    gpt_iters, bert_iters, sent_iters, multi_iters = 0, 0, 0, 0
    mems = []
    with torch.no_grad():
        iteration = 0
        while iteration < args.eval_iters:
            iteration += 1
            if verbose and iteration % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(iteration, args.eval_iters))

            # Forward evaluation.
            lm_loss, mems, mode = forward_step_func(data_iterator, model, args, timers, mems=mems)

            '''when contiguous memory optimizations are enabled, the buffers
            allocated by the optimizations are deallocated during backward pass
            in the absence of backward pass the buffers should be reset after each
            forward pass'''
            if args.deepspeed and args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

            lm_loss = lm_loss.data.detach().float().item()
            total_lm_loss += lm_loss
            if mode == 'gpt':
                total_gpt_loss += lm_loss
                gpt_iters += 1
            elif mode == 'bert':
                total_bert_loss += lm_loss
                bert_iters += 1
            elif mode == 'sentence':
                total_sent_loss += lm_loss
                sent_iters += 1
            elif mode == 'multi-task':
                total_multi_loss += lm_loss
                multi_iters += 1

    # Move model back to the train mode.
    model.train()

    # Reduce across processes.
    loss_data = torch.cuda.FloatTensor(
        [total_lm_loss, total_gpt_loss, total_bert_loss, total_sent_loss, total_multi_loss,
         gpt_iters, bert_iters, sent_iters, multi_iters])
    torch.distributed.all_reduce(loss_data, group=mpu.get_data_parallel_group())
    loss_data = loss_data.tolist()
    total_lm_loss = loss_data[0] / args.eval_iters / (args.world_size / args.model_parallel_size)
    total_gpt_loss = loss_data[1] / loss_data[5] if loss_data[5] > 0 else 0
    total_bert_loss = loss_data[2] / loss_data[6] if loss_data[6] > 0 else 0
    total_sent_loss = loss_data[3] / loss_data[7] if loss_data[7] > 0 else 0
    total_multi_loss = loss_data[4] / loss_data[8] if loss_data[8] > 0 else 0
    return total_lm_loss, total_gpt_loss, total_bert_loss, total_sent_loss, total_multi_loss
def get_samples_mapping_(indexed_dataset, data_prefix, num_epochs,
                         max_num_samples, max_seq_length, short_seq_prob,
                         seed, name):
    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0 and \
            not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.
        from data.dataset_utils import compile_helper
        compile_helper()
        from data import helpers
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            short_seq_prob,
            seed,
            verbose)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))
    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(samples_mapping.shape[0]))

    return samples_mapping
def make_loaders(args, tokenizer):
    """makes training/val/test"""
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
    if args.loader_scatter is not None:
        assert world_size % args.loader_scatter == 0
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'filter_english': args.filter_english,
        'pre_tokenize': not args.no_pre_tokenize,
        'tokenizer': tokenizer,
        'save_splits': args.save_splits,
        'load_splits': args.load_splits,
        'save_test_data': args.save_test_data,
        'no_lazy_loader': args.no_lazy_loader,
        'loader_scatter': args.loader_scatter,
        'data_parallel_rank': mpu.get_data_parallel_rank(),
        "non_sentence_start": args.non_sentence_start,
        "half_lazy_loader": args.half_lazy_loader
    }

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make datasets splits and tokenizer
    train, valid, test = None, None, None

    if args.train_data is not None:
        train = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loader
    use_block = args.block_lm or args.encoder_decoder

    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, tokenizer, batch_size, args.train_iters, args,
                                 shuffle=args.shuffle, block_collate=use_block)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, tokenizer, eval_batch_size, args.train_iters, args,
                                 shuffle=args.shuffle, block_collate=use_block)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, tokenizer, eval_batch_size, len(test) // eval_batch_size + 1, args,
                                shuffle=args.shuffle, block_collate=use_block)
        args.do_test = True
    else:
        args.do_test = False

    return train, valid, test
def __init__(self, config, batch_slices, seq_slices, distributed_init_method, world_size,
             data_parallel_size, model_parallel_size, pipeline_parallel_size, rank, local_rank,
             mixed_precision=False, use_mpi=False, init_process_group=False,
             checkpoint_gradients=False):
    self.config = config
    self.batch_slices = batch_slices
    self.seq_slices = seq_slices

    torch.cuda.set_device(local_rank)
    if init_process_group:
        dist.init_process_group(
            backend='nccl',
            init_method=distributed_init_method,
            world_size=world_size,
            rank=rank,
        )
    dist.all_reduce(torch.zeros(1).cuda())
    mpu.initialize_model_parallel(model_parallel_size, pipeline_parallel_size)
    set_random_seed(0)
    mpu.model_parallel_cuda_manual_seed(0)

    self.rank = rank
    self.local_rank = local_rank
    self.world_size = world_size
    self.data_parallel_size = data_parallel_size
    self.model_parallel_size = model_parallel_size
    self.pipeline_parallel_size = pipeline_parallel_size
    self.pipeline_parallel_group_rank = mpu.get_pipeline_parallel_group_rank()
    self.data_parallel_group = mpu.get_data_parallel_group()
    self.model_parallel_group = mpu.get_model_parallel_group()
    self.pipeline_parallel_pred_group = mpu.get_pipeline_parallel_pred_group()
    self.pipeline_parallel_succ_group = mpu.get_pipeline_parallel_succ_group()
    self.model_parallel_src_rank = mpu.get_model_parallel_src_rank()
    self.model_parallel_dst_rank = mpu.get_model_parallel_dst_rank()
    self.model_parallel_next_src_rank = (
        self.model_parallel_src_rank + self.model_parallel_size
        if self.pipeline_parallel_group_rank < self.pipeline_parallel_size - 1
        else None)
    self.model_parallel_prev_dst_rank = (
        self.model_parallel_dst_rank - self.model_parallel_size
        if self.pipeline_parallel_group_rank > 0 else None)

    self.n_layers = (config.n_layers // pipeline_parallel_size
                     + int(rank < config.n_layers % pipeline_parallel_size))
    self.config = config
    self.mixed_precision = mixed_precision
    self.checkpoint_gradients = checkpoint_gradients

    self.layers = []
    for _ in range(self.n_layers):
        l = ModelParallelTransformerLayer(
            self.config.embedding_dim,
            self.config.ffn_embedding_dim,
            self.config.num_attention_heads,
            device="cuda",
            checkpoint_gradients=self.checkpoint_gradients)
        self.layers.append(l.half() if self.mixed_precision else l)

    self.all_parameters = []
    for layer in self.layers:
        self.all_parameters.extend(layer.parameters())
    self.n_params = len(self.all_parameters)

    if self.mixed_precision:
        self.master_parameters = [p.clone().detach().float() for p in self.all_parameters]
        for p in self.master_parameters:
            p.requires_grad_()
        self.optimizer = optimizers.FusedAdam(self.master_parameters, lr=1e-10)
    else:
        self.optimizer = torch.optim.Adam(self.all_parameters, lr=1e-10)
def train_step(data_iterator, model, optimizer, lr_scheduler, args, timers, forward_step_func,
               mems=None, single_step=False):
    """Single training step."""
    lm_loss_total, count = 0.0, 0
    mems = [] if mems is None else mems
    if not args.deepspeed:
        optimizer.zero_grad()
    while True:
        skipped_iter, complete = 0, False
        # Forward model for one step.
        timers('forward').start()
        lm_loss, mems, _ = forward_step_func(data_iterator, model, args, timers, mems)
        timers('forward').stop()
        # print_rank_0("Forward step")
        if not args.deepspeed:
            lm_loss /= args.gradient_accumulation_steps

        reduced_loss = lm_loss.detach().clone().view(1)
        torch.distributed.all_reduce(reduced_loss.data, group=mpu.get_data_parallel_group())
        reduced_loss.data = reduced_loss.data / (args.world_size / args.model_parallel_size)

        if not DynamicLossScaler._has_inf_or_nan(reduced_loss):
            lm_loss_total += reduced_loss
            count += 1

            # Calculate gradients, reduce across processes, and clip.
            timers('backward').start()
            backward_step(optimizer, model, lm_loss, args, timers)
            timers('backward').stop()
            # print_rank_0("Backward step")

            # Update parameters.
            timers('optimizer').start()
            if args.deepspeed:
                if model.is_gradient_accumulation_boundary():
                    model.step()
                    complete = True
                    if not (args.fp16 and optimizer.overflow):
                        lr_scheduler.step()
                    else:
                        skipped_iter = 1
                else:
                    model.step()
            else:
                if count == args.gradient_accumulation_steps:
                    optimizer.step()
                    complete = True
                    # Update learning rate.
                    if not (args.fp16 and optimizer.overflow):
                        lr_scheduler.step()
                    else:
                        skipped_iter = 1
            # print_rank_0("Optimizer step")
            timers('optimizer').stop()
            if complete:
                break
        else:
            print_rank_0("Found NaN loss, skip backward")
            del lm_loss, reduced_loss
            mems = []
        if single_step:
            break
    if args.deepspeed:
        lm_loss_total = lm_loss_total / count
    return lm_loss_total, skipped_iter, mems
def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
                 delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
                 model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
                 **kwargs):
    """function to create datasets+tokenizers for common options"""
    if isinstance(process_fn, str):
        process_fn = eval(process_fn)
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols

    def get_dataset_from_path(path_, dataset_len=None):
        if lazy:
            # get lazily loaded dataset
            named_corpora = False
            if supported_corpus(path_):
                named_corpora = True
                name = path_
                path_ = corpora.NAMED_CORPORA[path_].PATH
            if not exists_lazy(path_, data_type='data'):
                # create cached version of dataset for lazy loading if it doesn't exist
                text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key,
                                   binarize_sent=binarize_sent, delim=delim, drop_unlabeled=drop_unlabeled,
                                   loose_json=loose)
                make_lazy(path_, text.X, data_type='data')
            text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
        else:
            # get dataset
            text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
                               delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose,
                               preprocess_fn=process_fn, dataset_len=dataset_len)
        return text

    # get one or multiple datasets and concatenate
    world_size = torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
    if isinstance(path, list) and len(path) == 1 and os.path.isdir(path[0]):
        path = [os.path.join(path[0], f) for f in os.listdir(path[0])
                if not os.path.isdir(os.path.join(path[0], f))]
        random.shuffle(path)
        path = [path[start::world_size] for start in range(min(world_size, len(path)))]
    elif isinstance(path, str):
        path = [[path]]
    elif isinstance(path, list) and len(path) == 1:
        path = [path]
    # print("path= ", path)

    # dataset_lens = []
    # if 'train_file_lens_path' in kwargs and kwargs['train_file_lens_path'] is not None:
    #     path_lens = {}
    #     flens = open(kwargs['train_file_lens_path'], 'r')
    #     for line in flens:
    #         split_line = line.rstrip('\n').split('\t')
    #         path_lens[split_line[0]] = int(split_line[1])
    #     flens.close()
    #     for p in path:
    #         if p in path_lens:
    #             dataset_lens.append(path_lens[p])
    #         else:
    #             dataset_lens.append(int(subprocess.check_output("wc -l " + p, shell=True).split()[0]))
    # else:
    #     for p in path:
    #         dataset_lens.append(int(subprocess.check_output("wc -l " + p, shell=True).split()[0]))
    # datasets = [get_dataset_from_path(p, dlen) for p, dlen in zip(path, dataset_lens)]
    # if len(datasets) == 1:
    #     ds = datasets[0]
    # else:
    #     ds = ConcatDataset(datasets)

    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, None, tokenizer_model_path, vocab_size,
                                   model_type, pad_token, character_converage, **kwargs)

    ds_type = ''
    if 'ds_type' in kwargs:
        ds_type = kwargs['ds_type']

    # Split dataset into train/val/test (and wrap bert dataset)
    # if should_split(split):
    #     ds = split_ds(ds, split)
    #     if ds_type.lower() == 'bert':
    #         presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
    #         ds = [binglr_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
    #               if d is not None else None for d in ds]
    #     elif ds_type.lower() == 'gpt2':
    #         ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
    # else:
    if ds_type.lower() == 'bert':
        ds = []
        print((len(path), world_size))
        for i in range(min(world_size, len(path))):
            ds_iters = [binglr_iterator_dataset(
                            [p], run_once=True, max_seq_len=seq_length,
                            mask_lm_prob=kwargs['mask_lm_prob'] if 'mask_lm_prob' in kwargs else 0.15,
                            max_preds_per_seq=kwargs['max_preds_per_seq'] if 'max_preds_per_seq' in kwargs else 20,
                            tokenizer=tokenizer,
                            train=kwargs['train'] if 'train' in kwargs else False,
                            num_urls=kwargs['num_urls'] if 'num_urls' in kwargs else 4)
                        for p in path[i]]
            ds.append(MyChainDataset(ds_iters))
    elif ds_type.lower() == 'pretrain':
        ds = []
        for i in range(min(world_size, len(path))):
            ds_iters = [bert_iterator_dataset(
                            [p], run_once=True, max_seq_len=seq_length,
                            mask_lm_prob=kwargs['mask_lm_prob'] if 'mask_lm_prob' in kwargs else 0.15,
                            max_preds_per_seq=kwargs['max_preds_per_seq'] if 'max_preds_per_seq' in kwargs else 20,
                            tokenizer=tokenizer,
                            train=kwargs['train'] if 'train' in kwargs else False,
                            num_urls=kwargs['num_urls'] if 'num_urls' in kwargs else 1)
                        for p in path[i]]
            ds.append(MyChainDataset0(ds_iters))
        # ds = binglr_iterator_dataset(path, max_seq_len=seq_length, ...)
    elif ds_type.lower() == 'gpt2':
        ds = GPT2Dataset(ds, max_seq_len=seq_length)

    return ds, tokenizer
def get_model(args, model_type=None, multi_token=True, num_labels=None):
    """Build the model."""
    print_rank_0('building GLM model ...')
    output_predict, parallel_output = True, True
    if (model_type == "multiple_choice" or model_type == "classification") and not args.cloze_eval:
        output_predict = False
    if model_type is not None:
        parallel_output = False
    model = GLMModel(num_layers=args.num_layers,
                     vocab_size=args.vocab_size,
                     hidden_size=args.hidden_size,
                     num_attention_heads=args.num_attention_heads,
                     embedding_dropout_prob=args.hidden_dropout,
                     attention_dropout_prob=args.attention_dropout,
                     output_dropout_prob=args.hidden_dropout,
                     max_sequence_length=args.max_position_embeddings,
                     max_memory_length=args.mem_length,
                     checkpoint_activations=args.checkpoint_activations,
                     checkpoint_num_layers=args.checkpoint_num_layers,
                     parallel_output=parallel_output,
                     relative_encoding=args.transformer_xl,
                     block_position_encoding=args.block_lm and not args.masked_lm,
                     output_predict=output_predict)
    if model_type is not None:
        if model_type == 'cloze':
            if multi_token:
                if args.fast_decode:
                    model = GLMForMultiTokenClozeFast(model, length_penalty=args.length_penalty)
                else:
                    model = GLMForMultiTokenCloze(model, length_penalty=args.length_penalty)
            else:
                model = GLMForSingleTokenCloze(model)
        elif model_type == 'classification':
            model = GLMForSequenceClassification(model, args.hidden_size, args.output_dropout,
                                                 args.pool_token, num_class=num_labels)
        elif model_type == 'generation':
            pass
        else:
            raise NotImplementedError(model_type)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if hasattr(args, "deepspeed") and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed:
        if args.DDP_impl == 'torch':
            i = torch.cuda.current_device()
            model = TorchDDP(model, device_ids=[i], output_device=i,
                             process_group=mpu.get_data_parallel_group())
        else:
            model = LocalDDP(model)

    return model
def make_loaders(args):
    """makes training/val/test"""
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'local_rank': args.local_rank,
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'lazy': args.lazy_loader,
        'xl_style': args.transformer_xl,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'pre_tokenize': not args.not_pre_tokenize
    }

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None

    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False

    return (train, valid, test), tokenizer
def get_model(args, model_type=None, multi_token=True, num_labels=None, spell_length=None):
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    if args.pretrained_bert:
        if model_type == "multiple_choice":
            model = BertForMultipleChoice.from_pretrained(args.tokenizer_model_type,
                                                          cache_dir=args.cache_dir,
                                                          fp32_layernorm=args.fp32_layernorm,
                                                          fp32_embedding=args.fp32_embedding,
                                                          layernorm_epsilon=args.layernorm_epsilon)
        elif model_type == "classification":
            model = BertForSequenceClassification.from_pretrained(args.tokenizer_model_type,
                                                                  cache_dir=args.cache_dir,
                                                                  fp32_layernorm=args.fp32_layernorm,
                                                                  fp32_embedding=args.fp32_embedding,
                                                                  layernorm_epsilon=args.layernorm_epsilon,
                                                                  num_labels=num_labels)
        else:
            raise NotImplementedError
    else:
        output_predict, parallel_output = True, True
        if (model_type == "multiple_choice" or model_type == "classification") and not args.cloze_eval:
            output_predict = False
        if model_type is not None:
            parallel_output = False
        if spell_length is not None:
            print_rank_0(f"Continuous spell length {spell_length}")
        model = GLMModel(num_layers=args.num_layers,
                         vocab_size=args.vocab_size,
                         hidden_size=args.hidden_size,
                         num_attention_heads=args.num_attention_heads,
                         embedding_dropout_prob=args.hidden_dropout,
                         attention_dropout_prob=args.attention_dropout,
                         output_dropout_prob=args.hidden_dropout,
                         max_sequence_length=args.max_position_embeddings,
                         max_memory_length=args.mem_length,
                         checkpoint_activations=args.checkpoint_activations,
                         checkpoint_num_layers=args.checkpoint_num_layers,
                         parallel_output=parallel_output,
                         relative_encoding=args.transformer_xl,
                         block_position_encoding=args.block_lm and not args.masked_lm,
                         output_predict=output_predict,
                         spell_length=spell_length,
                         spell_func=args.prompt_func,
                         attention_scale=args.attention_scale)
        if args.freeze_transformer:
            model.freeze_transformer(tune_prefix_layers=args.tune_prefix_layers)
        if model_type is not None:
            if model_type == 'multiple_choice':
                if args.cloze_eval:
                    if multi_token:
                        if args.fast_decode:
                            model = GLMForMultiTokenClozeFast(model, length_penalty=args.length_penalty)
                        else:
                            model = GLMForMultiTokenCloze(model, length_penalty=args.length_penalty)
                    else:
                        model = GLMForSingleTokenCloze(model, take_softmax=args.adapet)
                else:
                    model = GLMForSequenceClassification(model, args.hidden_size, args.output_dropout,
                                                         args.pool_token, num_class=num_labels)
            elif model_type == 'classification':
                model = GLMForSequenceClassification(model, args.hidden_size, args.output_dropout,
                                                     args.pool_token, num_class=num_labels)
            elif model_type == 'generation':
                pass
            else:
                raise NotImplementedError(model_type)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed and (args.train_iters or args.epochs):
        if args.DDP_impl == 'torch':
            i = torch.cuda.current_device()
            model = TorchDDP(model, device_ids=[i], output_device=i,
                             process_group=mpu.get_data_parallel_group())
        elif args.DDP_impl == 'local':
            model = LocalDDP(model)
    else:
        print_rank_0("Skip DDP model")

    return model