def linear_with_grad_accumulation_and_async_allreduce_in16bit(
    input, weight, bias, gradient_accumulation_fusion, async_grad_allreduce,
):
    args = _cast_if_autocast_enabled(
        input, weight, bias, gradient_accumulation_fusion, async_grad_allreduce
    )
    with torch.cuda.amp.autocast(enabled=False):
        return LinearWithGradAccumulationAndAsyncAllreduceIn16Bit.apply(*args)

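
# All wrappers in this module follow the same pattern: cast the arguments according to the
# ambient autocast state, then run the fused kernel with autocast disabled so the custom
# autograd Function sees concrete dtypes. `_cast_if_autocast_enabled` is assumed to be
# imported from elsewhere in the codebase; the sketch below (hypothetical name, Apex-style
# behaviour assumed) is only meant to illustrate that contract, not to replace the import.
def _cast_if_autocast_enabled_reference(*args):
    # If autocast is off, pass the arguments through unchanged.
    if not torch.is_autocast_enabled():
        return args
    # Otherwise, cast floating-point tensors to the current autocast (GPU) dtype.
    return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
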
def scaled_upper_triang_masked_softmax(inputs, _, scale):
    b, np, sq, sk = inputs.size()
    assert sq == sk, "causal mask is only for self attention"

    # Reshaping input to 3D tensor (attn_batches, sq, sk)
    inputs = inputs.view(-1, sq, sk)
    args = _cast_if_autocast_enabled(inputs, scale)
    with torch.cuda.amp.autocast(enabled=False):
        probs = ScaledUpperTriangMaskedSoftmax.apply(*args)

    return probs.view(b, np, sq, sk)

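
# Eager-mode reference for the fused kernel above (assumption: the CUDA kernel applies the
# scale, masks out the strict upper triangle, and softmaxes over the last dimension). This
# hypothetical helper is for illustration and testing only.
def _scaled_upper_triang_masked_softmax_reference(inputs, scale):
    # inputs: (attn_batches, sq, sk) with sq == sk
    sq = inputs.size(-1)
    causal_mask = torch.triu(
        torch.ones(sq, sq, dtype=torch.bool, device=inputs.device), diagonal=1
    )
    scores = (inputs * scale).masked_fill(causal_mask, float("-inf"))
    return torch.nn.functional.softmax(scores, dim=-1)
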
def mixed_dtype_fused_layer_norm_affine(input, weight, bias, normalized_shape, eps=1e-6):
    args = _cast_if_autocast_enabled(input, weight, bias, normalized_shape, eps)
    with torch.cuda.amp.autocast(enabled=False):
        return FusedLayerNormAffineMixedDtypesFunction.apply(*args)

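
# Eager-mode reference (assumption): up to the mixed-dtype handling done inside the fused
# kernel, the affine layer norm above matches torch.nn.functional.layer_norm.
def _layer_norm_affine_reference(input, weight, bias, normalized_shape, eps=1e-6):
    return torch.nn.functional.layer_norm(input, normalized_shape, weight, bias, eps)
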
def scaled_masked_softmax(inputs, mask, scale):
    # input is 4D tensor (b, np, sq, sk)
    args = _cast_if_autocast_enabled(inputs, mask, scale)
    with torch.cuda.amp.autocast(enabled=False):
        return ScaledMaskedSoftmax.apply(*args)

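
# Eager-mode reference (assumption: `mask` is a boolean tensor broadcastable to
# (b, np, sq, sk), True at positions to be masked out). Hypothetical helper for illustration.
def _scaled_masked_softmax_reference(inputs, mask, scale):
    # Masked positions receive a large negative score before the softmax.
    scores = (inputs * scale).masked_fill(mask, -10000.0)
    return torch.nn.functional.softmax(scores, dim=-1)
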
def fused_bias_gelu(input, bias):
    args = _cast_if_autocast_enabled(input, bias)
    with torch.cuda.amp.autocast(enabled=False):
        return GeLUFunction.apply(*args)

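
# Eager-mode reference (assumption): the fused kernel computes GeLU(input + bias) with the
# tanh approximation commonly used by Megatron-style bias-GeLU fusions.
def _bias_gelu_reference(input, bias):
    x = input + bias
    return x * 0.5 * (1.0 + torch.tanh(0.7978845608 * x * (1.0 + 0.044715 * x * x)))
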
def _fast_layer_norm(x, weight, bias, epsilon):
    args = _cast_if_autocast_enabled(x, weight, bias, epsilon)
    with torch.cuda.amp.autocast(enabled=False):
        return FastLayerNormFN.apply(*args)

def bias_dropout_add_fused_inference(x, bias, residual, prob):
    args = _cast_if_autocast_enabled(x, bias, residual, prob)
    with torch.cuda.amp.autocast(enabled=False):
        return bias_dropout_add_fused_inference_(*args)

def bias_dropout_add_fused_train(x, bias, residual, prob):
    # re-enable torch grad to enable fused optimization.
    with torch.enable_grad():
        args = _cast_if_autocast_enabled(x, bias, residual, prob)
        with torch.cuda.amp.autocast(enabled=False):
            return bias_dropout_add_fused_train_(*args)

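
# Eager-mode reference for the two fused callables above (assumption: they are
# TorchScript-compiled versions of the standard bias-dropout-add residual pattern;
# inference corresponds to training=False).
def _bias_dropout_add_reference(x, bias, residual, prob, training):
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out
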
def column_parallel_linear(input, weight, bias):
    args = _cast_if_autocast_enabled(input, weight, bias)
    with torch.cuda.amp.autocast(enabled=False):
        return ColumnParallelLinearWithAsyncAllreduce.apply(*args)

def fused_rms_norm(input, normalized_shape, eps=1e-6):
    args = _cast_if_autocast_enabled(input, normalized_shape, eps)
    with torch.cuda.amp.autocast(enabled=False):
        return FusedRMSNormFunction.apply(*args)

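
# Eager-mode reference (assumptions: `normalized_shape` is a shape tuple and the fused
# kernel applies RMS normalization without a learned weight over those trailing
# dimensions). Hypothetical helper for illustration only.
def _rms_norm_reference(input, normalized_shape, eps=1e-6):
    dims = tuple(range(-len(normalized_shape), 0))
    # Compute the mean square in fp32 for numerical stability, then cast back.
    variance = input.float().pow(2).mean(dim=dims, keepdim=True)
    return (input.float() * torch.rsqrt(variance + eps)).to(input.dtype)
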