Example #1
def linear_with_grad_accumulation_and_async_allreduce_in16bit(
    input, weight, bias, gradient_accumulation_fusion, async_grad_allreduce,
):
    # Pre-cast arguments to the autocast dtype, then run the custom autograd
    # Function with autocast disabled so it sees consistently-typed inputs.
    args = _cast_if_autocast_enabled(
        input, weight, bias, gradient_accumulation_fusion, async_grad_allreduce
    )
    with torch.cuda.amp.autocast(enabled=False):
        return LinearWithGradAccumulationAndAsyncAllreduceIn16Bit.apply(*args)
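All of these examples lean on the same helper. As a point of reference, here is a minimal sketch of what _cast_if_autocast_enabled typically does, modeled on the Apex/Megatron-LM pattern; it relies on a private PyTorch utility, and the implementation in any given codebase may differ:

import torch

def _cast_if_autocast_enabled(*args):
    # Pass arguments through untouched when autocast is off; otherwise cast
    # tensor arguments to the current autocast GPU dtype (non-tensors pass through).
    if not torch.is_autocast_enabled():
        return args
    return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())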
Example #2
def scaled_upper_triang_masked_softmax(inputs, _, scale):
    b, np, sq, sk = inputs.size()
    assert sq == sk, "causal mask is only for self attention"
    # Reshaping input to 3D tensor (attn_batches, sq, sk)
    inputs = inputs.view(-1, sq, sk)
    args = _cast_if_autocast_enabled(inputs, scale)
    with torch.cuda.amp.autocast(enabled=False):
        probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
    return probs.view(b, np, sq, sk)
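A hedged usage sketch, assuming the fused kernel behind ScaledUpperTriangMaskedSoftmax is built and a CUDA device is available; the batch, head, and sequence sizes below are purely illustrative:

import torch

# Attention scores for b=2 batches, np=4 heads, sq=sk=128 (self-attention).
scores = torch.randn(2, 4, 128, 128, device="cuda", dtype=torch.float16)

with torch.cuda.amp.autocast():
    probs = scaled_upper_triang_masked_softmax(scores, None, 1.0)

print(probs.shape)  # torch.Size([2, 4, 128, 128])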
Example #3
def mixed_dtype_fused_layer_norm_affine(input,
                                        weight,
                                        bias,
                                        normalized_shape,
                                        eps=1e-6):
    args = _cast_if_autocast_enabled(input, weight, bias, normalized_shape,
                                     eps)
    with torch.cuda.amp.autocast(enabled=False):
        return FusedLayerNormAffineMixedDtypesFunction.apply(*args)
Example #4
def scaled_masked_softmax(inputs, mask, scale):
    # input is 4D tensor (b, np, sq, sk)
    args = _cast_if_autocast_enabled(inputs, mask, scale)
    with torch.cuda.amp.autocast(enabled=False):
        return ScaledMaskedSoftmax.apply(*args)
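A hedged usage sketch, assuming the fused kernel behind ScaledMaskedSoftmax is available and the usual Megatron-style convention that True in the mask marks positions to exclude; shapes are illustrative:

import torch

scores = torch.randn(2, 4, 128, 128, device="cuda", dtype=torch.float16)
# Mask is broadcast over the head dimension; True marks masked-out positions.
mask = torch.zeros(2, 1, 128, 128, device="cuda", dtype=torch.bool)

with torch.cuda.amp.autocast():
    probs = scaled_masked_softmax(scores, mask, 1.0)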
Example #5
def fused_bias_gelu(input, bias):
    args = _cast_if_autocast_enabled(input, bias)
    with torch.cuda.amp.autocast(enabled=False):
        return GeLUFunction.apply(*args)
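For comparison, a hedged sketch of the unfused computation that a fused bias-GeLU kernel typically replaces, using the standard tanh approximation of GeLU (0.7978845608 is sqrt(2/pi)); this is a reference formula, not necessarily this repository's kernel:

import torch

def bias_gelu_reference(input, bias):
    # Unfused reference: add the bias, then apply the tanh approximation of GeLU.
    x = input + bias
    return x * 0.5 * (1.0 + torch.tanh(0.7978845608 * x * (1.0 + 0.044715 * x * x)))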
Example #6
def _fast_layer_norm(x, weight, bias, epsilon):
    args = _cast_if_autocast_enabled(x, weight, bias, epsilon)
    with torch.cuda.amp.autocast(enabled=False):
        return FastLayerNormFN.apply(*args)
Example #7
def bias_dropout_add_fused_inference(x, bias, residual, prob):
    args = _cast_if_autocast_enabled(x, bias, residual, prob)
    with torch.cuda.amp.autocast(enabled=False):
        return bias_dropout_add_fused_inference_(*args)
Example #8
def bias_dropout_add_fused_train(x, bias, residual, prob):
    # re-enable torch grad to enable fused optimization.
    with torch.enable_grad():
        args = _cast_if_autocast_enabled(x, bias, residual, prob)
        with torch.cuda.amp.autocast(enabled=False):
            return bias_dropout_add_fused_train_(*args)
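As context for Examples #7 and #8, a hedged sketch of what the underlying fused train/inference helpers typically compute, shown here unfused and modeled on the Megatron-LM pattern; the train variant would pass training=True and the inference variant training=False:

import torch

def bias_dropout_add_reference(x, bias, residual, prob, training):
    # (x + bias) -> dropout -> residual add; fused into a single kernel in practice.
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out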
Example #9
def column_parallel_linear(input, weight, bias):
    args = _cast_if_autocast_enabled(input, weight, bias)
    with torch.cuda.amp.autocast(enabled=False):
        return ColumnParallelLinearWithAsyncAllreduce.apply(*args)
Example #10
def fused_rms_norm(input, normalized_shape, eps=1e-6):
    args = _cast_if_autocast_enabled(input, normalized_shape, eps)
    with torch.cuda.amp.autocast(enabled=False):
        return FusedRMSNormFunction.apply(*args)
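A hedged usage sketch, assuming the fused extension behind FusedRMSNormFunction is built and a CUDA device is available; the hidden size below is illustrative:

import torch

hidden = torch.randn(32, 1024, device="cuda", dtype=torch.float16)

with torch.cuda.amp.autocast():
    normed = fused_rms_norm(hidden, normalized_shape=(1024,), eps=1e-6)

print(normed.shape)  # torch.Size([32, 1024])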