def biencoder_model_provider(only_query_model=False,
                             only_context_model=False,
                             biencoder_shared_query_context_model=False,
                             pre_process=True,
                             post_process=True):
    """Build the model."""

    assert mpu.get_tensor_model_parallel_world_size() == 1 and \
        mpu.get_pipeline_model_parallel_world_size() == 1, \
        "Model parallel size > 1 not supported for ICT"

    print_rank_0('building BiEncoderModel...')

    # simpler to just keep using 2 tokentypes since
    # the LM we initialize with has 2 tokentypes
    model = BiEncoderModel(
        num_tokentypes=2,
        parallel_output=False,
        only_query_model=only_query_model,
        only_context_model=only_context_model,
        biencoder_shared_query_context_model=biencoder_shared_query_context_model,
        pre_process=pre_process,
        post_process=post_process)

    return model
def forward_step(data_iterator, model, input_tensor):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch-generator').start()
    query_tokens, query_mask, \
        context_tokens, context_mask, context_indices = get_ict_batch(data_iterator)
    timers('batch-generator').stop()

    # Query and Context Types
    query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
    context_types = torch.cuda.LongTensor(*context_tokens.shape).fill_(0)

    # Forward model.
    query_logits, context_logits = model(query_tokens, query_mask, query_types,
                                         context_tokens, context_mask,
                                         context_types)

    micro_batch_size = query_logits.shape[0]
    # recall we assert that tensor_model_parallel_size == 1
    assert mpu.get_tensor_model_parallel_world_size() == 1, \
        "Model parallel size > 1 not supported for ICT"

    global_batch_size = dist.get_world_size() * micro_batch_size
    all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
    all_context_logits = AllgatherFromDataParallelRegion.apply(context_logits)

    # scores are inner products between query and context embeddings
    retrieval_scores = torch.matmul(all_query_logits,
                                    torch.transpose(all_context_logits, 0, 1))
    # scaling the retriever scores
    if args.retriever_score_scaling:
        retrieval_scores = retrieval_scores / math.sqrt(args.hidden_size)

    softmax_scores = F.log_softmax(retrieval_scores, dim=1)
    sorted_vals, sorted_indices = torch.topk(softmax_scores,
                                             k=softmax_scores.shape[1],
                                             sorted=True)

    def topk_accuracy(k):
        return torch.cuda.FloatTensor(
            [sum([int(i in sorted_indices[i, :k])
                  for i in range(global_batch_size)]) / global_batch_size])

    topk_accs = [topk_accuracy(int(k))
                 for k in args.retriever_report_topk_accuracies]

    labels = torch.arange(global_batch_size).long().cuda()
    loss = F.nll_loss(softmax_scores, labels, reduction='mean')
    reduced_losses = average_losses_across_data_parallel_group(
        [loss, *topk_accs])

    # Scale the retrieval loss
    loss = loss * mpu.get_data_parallel_world_size()

    # create stats_dict with retrieval loss and all specified top-k accuracies
    topk_acc_dict = {'top{}_acc'.format(k): v * 100 for k, v in
                     zip(args.retriever_report_topk_accuracies,
                         reduced_losses[1:])}
    stats_dict = dict(loss=reduced_losses[0], **topk_acc_dict)
    return loss, stats_dict
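# The snippet below is a minimal, single-process sketch of the in-batch-negatives
# retrieval loss computed in forward_step above. It has no Megatron dependencies:
# the data-parallel all-gather is omitted (so the "global" batch equals the local
# batch), tensors live on CPU, and hidden_size/batch_size are illustrative values.
import math
import torch
import torch.nn.functional as F

batch_size, hidden_size = 4, 8
query_logits = torch.randn(batch_size, hidden_size)
context_logits = torch.randn(batch_size, hidden_size)

# Scores are inner products between query and context embeddings; for row i the
# positive context is column i and every other column acts as a negative.
retrieval_scores = query_logits @ context_logits.t()
retrieval_scores = retrieval_scores / math.sqrt(hidden_size)  # score scaling

softmax_scores = F.log_softmax(retrieval_scores, dim=1)
labels = torch.arange(batch_size)
loss = F.nll_loss(softmax_scores, labels, reduction='mean')

# Top-k retrieval accuracy: is the correct context among the k highest scores?
_, sorted_indices = torch.topk(softmax_scores, k=batch_size, sorted=True)
top1_acc = sum(int(i in sorted_indices[i, :1]) for i in range(batch_size)) / batch_size
print(loss.item(), top1_acc)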
def __init__(self, attention_mask_func, init_method,
             output_layer_init_method, layer_number):
    super(ParallelSelfAttention, self).__init__()
    args = get_args()
    self.fp16 = args.fp16

    self.attention_mask_func = attention_mask_func
    self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
    self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
    if self.apply_query_key_layer_scaling:
        self.attention_softmax_in_fp32 = True
    self.layer_number = max(1, layer_number)

    # Per attention head and per partition values.
    world_size = mpu.get_tensor_model_parallel_world_size()
    self.hidden_size_per_partition = mpu.divide(args.hidden_size, world_size)
    self.hidden_size_per_attention_head = mpu.divide(
        args.hidden_size, args.num_attention_heads)
    self.num_attention_heads_per_partition = mpu.divide(
        args.num_attention_heads, world_size)

    # Strided linear layer.
    self.query_key_value = mpu.ColumnParallelLinear(
        args.hidden_size,
        3 * args.hidden_size,
        gather_output=False,
        init_method=init_method)

    coeff = None
    self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
    if self.apply_query_key_layer_scaling:
        coeff = self.layer_number
        self.norm_factor *= coeff

    self.scale_mask_softmax = FusedScaleMaskSoftmax(
        self.fp16,
        args.scaled_upper_triang_masked_softmax_fusion,
        args.scaled_masked_softmax_fusion,
        self.attention_mask_func,
        self.attention_softmax_in_fp32,
        coeff)

    # Dropout. Note that for a single iteration, this layer will generate
    # different outputs on different number of parallel partitions but
    # on average it should not be partition dependent.
    self.attention_dropout = torch.nn.Dropout(args.attention_dropout)

    # Output.
    self.dense = mpu.RowParallelLinear(
        args.hidden_size,
        args.hidden_size,
        input_is_parallel=True,
        init_method=output_layer_init_method,
        skip_bias_add=True)
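# Rough, plain-PyTorch sketch of what the norm_factor / coeff values set up above
# amount to, assuming hidden_size=1024, num_attention_heads=16 and layer_number=12
# (illustrative values, not taken from the code above). When query-key layer
# scaling is enabled, the QK^T product is divided by sqrt(head_dim) * layer_number
# so the fp16 matmul stays small, and the softmax (forced into fp32) multiplies by
# coeff = layer_number again, recovering the usual 1/sqrt(head_dim) scaling. The
# actual scaling happens in the attention forward pass and FusedScaleMaskSoftmax,
# not in this constructor.
import math
import torch

hidden_size, num_heads, layer_number = 1024, 16, 12
head_dim = hidden_size // num_heads
coeff = layer_number
norm_factor = math.sqrt(head_dim) * coeff

q = torch.randn(2, 4, head_dim)   # [batch, seq, head_dim], a single head shown
k = torch.randn(2, 4, head_dim)

scores = torch.bmm(q, k.transpose(1, 2)) / norm_factor   # small values in fp16
probs = torch.softmax(scores.float() * coeff, dim=-1)    # fp32 softmax, net 1/sqrt(d)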
def general_ict_model_provider(only_query_model=False, only_block_model=False):
    """Build the model."""
    args = get_args()
    assert args.ict_head_size is not None, \
        "Need to specify --ict-head-size to provide an ICTBertModel"
    assert mpu.get_tensor_model_parallel_world_size() == 1 and \
        mpu.get_pipeline_model_parallel_world_size() == 1, \
        "Model parallel size > 1 not supported for ICT"

    print_rank_0('building ICTBertModel...')

    # simpler to just keep using 2 tokentypes since
    # the LM we initialize with has 2 tokentypes
    model = ICTBertModel(
        ict_head_size=args.ict_head_size,
        num_tokentypes=2,
        parallel_output=True,
        only_query_model=only_query_model,
        only_block_model=only_block_model)

    return model
def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
                 tensor_shape,
                 use_ring_exchange=False,
                 dtype_=None):
    """Communicate tensors between stages. Used as helper method in other
    communication methods that are used in megatron/schedules.py.

    Takes the following arguments:
        tensor_send_next: tensor to send to next rank (no tensor sent if
                          set to None).
        tensor_send_prev: tensor to send to prev rank (no tensor sent if
                          set to None).
        recv_prev: boolean for whether tensor should be received from
                   previous rank.
        recv_next: boolean for whether tensor should be received from
                   next rank.
        tensor_shape: shape of tensor to receive (this method assumes that all
                      tensors sent and received in a single function call are
                      the same shape).
        use_ring_exchange: boolean for whether torch.distributed.ring_exchange()
                           API should be used.
        dtype_: optional, this is used when the tensor that needs to be
                communicated is different from args.params_dtype.
    Returns:
        (tensor_recv_prev, tensor_recv_next)
    """
    args = get_args()

    # Create placeholder tensors for receive in forward and backward directions
    # if needed.
    tensor_recv_prev = None
    tensor_recv_next = None

    # Some legacy inference code doesn't set the tensor shape, do so now
    # for the normal values for gpt/bert. This could be removed if inference
    # code is changed to provide tensor_shape.
    if tensor_shape is None:
        tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)

    override_scatter_gather_tensors_in_pipeline = False
    if args.scatter_gather_tensors_in_pipeline:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
        if tensor_chunk_shape % mpu.get_tensor_model_parallel_world_size() == 0:
            tensor_chunk_shape = tensor_chunk_shape // \
                mpu.get_tensor_model_parallel_world_size()
        else:
            tensor_chunk_shape = tensor_shape
            override_scatter_gather_tensors_in_pipeline = True
    else:
        tensor_chunk_shape = tensor_shape

    dtype = args.params_dtype
    if args.fp32_residual_connection:
        dtype = torch.float

    requires_grad = True
    if dtype_ is not None:
        dtype = dtype_
        requires_grad = False

    if recv_prev:
        tensor_recv_prev = torch.empty(tensor_chunk_shape,
                                       requires_grad=requires_grad,
                                       device=torch.cuda.current_device(),
                                       dtype=dtype)
    if recv_next:
        tensor_recv_next = torch.empty(tensor_chunk_shape,
                                       requires_grad=requires_grad,
                                       device=torch.cuda.current_device(),
                                       dtype=dtype)

    # Split tensor into smaller chunks if using scatter-gather optimization.
    if not override_scatter_gather_tensors_in_pipeline and \
            args.scatter_gather_tensors_in_pipeline:
        if tensor_send_next is not None:
            tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(
                tensor_send_next)

        if tensor_send_prev is not None:
            tensor_send_prev = mpu.split_tensor_into_1d_equal_chunks(
                tensor_send_prev)

    # Send tensors in both the forward and backward directions as appropriate.
    if use_ring_exchange:
        torch.distributed.ring_exchange(
            tensor_send_prev=tensor_send_prev,
            tensor_recv_prev=tensor_recv_prev,
            tensor_send_next=tensor_send_next,
            tensor_recv_next=tensor_recv_next,
            group=mpu.get_pipeline_model_parallel_group())
    else:
        ops = []
        if tensor_send_prev is not None:
            send_prev_op = torch.distributed.P2POp(
                torch.distributed.isend, tensor_send_prev,
                mpu.get_pipeline_model_parallel_prev_rank())
            ops.append(send_prev_op)
        if tensor_recv_prev is not None:
            recv_prev_op = torch.distributed.P2POp(
                torch.distributed.irecv, tensor_recv_prev,
                mpu.get_pipeline_model_parallel_prev_rank())
            ops.append(recv_prev_op)
        if tensor_send_next is not None:
            send_next_op = torch.distributed.P2POp(
                torch.distributed.isend, tensor_send_next,
                mpu.get_pipeline_model_parallel_next_rank())
            ops.append(send_next_op)
        if tensor_recv_next is not None:
            recv_next_op = torch.distributed.P2POp(
                torch.distributed.irecv, tensor_recv_next,
                mpu.get_pipeline_model_parallel_next_rank())
            ops.append(recv_next_op)
        if len(ops) > 0:
            reqs = torch.distributed.batch_isend_irecv(ops)
            for req in reqs:
                req.wait()
    # To protect against race condition when using batch_isend_irecv().
    torch.cuda.synchronize()

    # If using scatter-gather optimization, gather smaller chunks.
    if not override_scatter_gather_tensors_in_pipeline and \
            args.scatter_gather_tensors_in_pipeline:
        if recv_prev:
            tensor_recv_prev = mpu.gather_split_1d_tensor(
                tensor_recv_prev).view(tensor_shape).requires_grad_()
            tensor_recv_prev = mpu.make_viewless_tensor(tensor_recv_prev,
                                                        requires_grad=True,
                                                        keep_graph=False)

        if recv_next:
            tensor_recv_next = mpu.gather_split_1d_tensor(
                tensor_recv_next).view(tensor_shape).requires_grad_()
            tensor_recv_next = mpu.make_viewless_tensor(tensor_recv_next,
                                                        requires_grad=True,
                                                        keep_graph=False)

    return tensor_recv_prev, tensor_recv_next
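# Single-process sketch of the scatter-gather optimization used above: each
# tensor-parallel rank sends only its 1/world_size slice of the flattened
# activation over the pipeline, and the receiving stage reassembles the full
# tensor. The helper names below are local stand-ins for
# mpu.split_tensor_into_1d_equal_chunks and mpu.gather_split_1d_tensor; no
# process groups or actual communication are involved here.
import torch

def split_into_1d_equal_chunks(tensor, world_size, rank):
    chunk = tensor.numel() // world_size            # assumes numel is divisible
    return tensor.reshape(-1)[rank * chunk:(rank + 1) * chunk]

def gather_split_1d_tensor(chunks):
    return torch.cat(chunks)

world_size = 4
tensor_shape = (16, 2, 8)                           # (seq, micro_batch, hidden)
activation = torch.randn(tensor_shape)

# Each rank would send only its own flat chunk over the pipeline...
chunks = [split_into_1d_equal_chunks(activation, world_size, r)
          for r in range(world_size)]
# ...and the receiver gathers the chunks and restores the original shape.
restored = gather_split_1d_tensor(chunks).view(tensor_shape)
assert torch.equal(restored, activation)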
def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
                 use_ring_exchange=False):
    """Communicate tensors between stages. Used as helper method in other
    communication methods that are used in megatron/schedules.py.

    Takes the following arguments:
        tensor_send_next: tensor to send to next rank (no tensor sent if
                          set to None).
        tensor_send_prev: tensor to send to prev rank (no tensor sent if
                          set to None).
        recv_prev: boolean for whether tensor should be received from
                   previous rank.
        recv_next: boolean for whether tensor should be received from
                   next rank.
        use_ring_exchange: boolean for whether torch.distributed.ring_exchange()
                           API should be used.
    Returns:
        (tensor_recv_prev, tensor_recv_next)
    """
    args = get_args()

    # Create placeholder tensors for receive in forward and backward directions
    # if needed.
    tensor_recv_prev = None
    tensor_recv_next = None
    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    if args.scatter_gather_tensors_in_pipeline:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1) // \
            mpu.get_tensor_model_parallel_world_size()
    else:
        tensor_chunk_shape = tensor_shape
    dtype = args.params_dtype
    if args.fp32_residual_connection:
        dtype = torch.float
    if recv_prev:
        tensor_recv_prev = torch.empty(tensor_chunk_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
                                       dtype=dtype)
    if recv_next:
        tensor_recv_next = torch.empty(tensor_chunk_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
                                       dtype=dtype)

    # Split tensor into smaller chunks if using scatter-gather optimization.
    if args.scatter_gather_tensors_in_pipeline:
        if tensor_send_next is not None:
            tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(
                tensor_send_next)

        if tensor_send_prev is not None:
            tensor_send_prev = mpu.split_tensor_into_1d_equal_chunks(
                tensor_send_prev)

    # Send tensors in both the forward and backward directions as appropriate.
    if use_ring_exchange:
        torch.distributed.ring_exchange(
            tensor_send_prev=tensor_send_prev,
            tensor_recv_prev=tensor_recv_prev,
            tensor_send_next=tensor_send_next,
            tensor_recv_next=tensor_recv_next,
            group=mpu.get_pipeline_model_parallel_group())
    else:
        ops = []
        if tensor_send_prev is not None:
            send_prev_op = torch.distributed.P2POp(
                torch.distributed.isend, tensor_send_prev,
                mpu.get_pipeline_model_parallel_prev_rank())
            ops.append(send_prev_op)
        if tensor_recv_prev is not None:
            recv_prev_op = torch.distributed.P2POp(
                torch.distributed.irecv, tensor_recv_prev,
                mpu.get_pipeline_model_parallel_prev_rank())
            ops.append(recv_prev_op)
        if tensor_send_next is not None:
            send_next_op = torch.distributed.P2POp(
                torch.distributed.isend, tensor_send_next,
                mpu.get_pipeline_model_parallel_next_rank())
            ops.append(send_next_op)
        if tensor_recv_next is not None:
            recv_next_op = torch.distributed.P2POp(
                torch.distributed.irecv, tensor_recv_next,
                mpu.get_pipeline_model_parallel_next_rank())
            ops.append(recv_next_op)
        if len(ops) > 0:
            reqs = torch.distributed.batch_isend_irecv(ops)
            for req in reqs:
                req.wait()
    # To protect against race condition when using batch_isend_irecv().
    torch.cuda.synchronize()

    # If using scatter-gather optimization, gather smaller chunks.
    if args.scatter_gather_tensors_in_pipeline:
        if recv_prev:
            tensor_recv_prev = mpu.gather_split_1d_tensor(
                tensor_recv_prev).view(tensor_shape).requires_grad_()

        if recv_next:
            tensor_recv_next = mpu.gather_split_1d_tensor(
                tensor_recv_next).view(tensor_shape).requires_grad_()

    return tensor_recv_prev, tensor_recv_next