Example #1
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 0
    

for rows, cols, fast in sizes:
    dims = rows, cols, fast
     
    print("\n\nTESTING dims = {}\n\n".format(dims))

    if RAND:
        pt_in = 1.*torch.cuda.FloatTensor(*dims).uniform_()
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).uniform_()
    else:
        pt_in = torch.cuda.FloatTensor(*dims).fill_(1.)
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(6.0)
    
    # per_col = torch.arange(1,cols+1).cuda()
    # print((rows*per_col*per_col).sqrt())
    # pt_in *= per_col
    
    cuda_out   =   torch.cuda.FloatTensor(*dims).fill_(0.)
    cuda_norms =   torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)
    
    # Save fp32 copies of the input and g
    pt_in_fp32 = pt_in.clone()
    g_fp32     = g.clone()
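
The loop above assumes sizes and get_norm_shape are defined earlier, outside this excerpt. A rough sketch of plausible stand-ins so the loop can be run in isolation; these definitions are assumptions, not the original helpers, and the get_norm_shape convention assumed here follows torch.nn.utils.weight_norm, where g has size 1 in every dimension except dim:

import torch

# Hypothetical problem sizes (rows, cols, fast); the commented-out values
# above suggest something on the order of 3 x 512 x 1024.
sizes = [(3, 512, 1024)]

def get_norm_shape(p, dim):
    # Assumed behavior: same number of dimensions as p, with size 1 in every
    # dimension except `dim` (the weight-norm `g` convention).
    return tuple(p.size(d) if d == dim else 1 for d in range(p.dim()))

Under that assumption, get_norm_shape(pt_in, 0) for dims = (rows, cols, fast) returns (rows, 1, 1), i.e. one norm entry per slice along dim.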
    
Example #2
# This means that output gradients in the backward pass will be equal
# to elementwise, so by manipulating elementwise, we have easy
# fine-grained control over the output gradients we'd like to use for
# testing purposes.
#
# The alternative is just to create the output_gradients manually
# and call output.backward(gradient=output_gradients),
# as is done in test_backward.py.
# But I wanted a minimal working sample similar to an "actual" use case,
# where gradients are computed by calling backward() on a scalar loss.
# (A sketch of this pattern follows this example.)

if RAND:
    # With std=6.0, I sometimes observe the PyTorch fp16 ops going unstable
    # while the fused kernel remains stable.
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
else:
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).fill_(1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).fill_(2.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).fill_(0.5)

pt_in_fp16 = pt_in_fp32.half()
cd_in_prec = pt_in_fp32.clone()
pt_g_fp16 = pt_g_fp32.half()
cd_g_prec = pt_g_fp32.clone()
elementwise_fp16 = elementwise_fp32.half()
elementwise_prec = elementwise_fp32.clone()
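
To make the comment at the top of this example concrete: if the scalar loss is formed as (output * elementwise).sum(), then d(loss)/d(output) is exactly elementwise, which is what gives the fine-grained control described. A minimal sketch of that pattern follows; the doubling op is only a stand-in for whatever fused op is actually under test, and the shapes are arbitrary:

import torch

# Stand-in for the real op under test; any differentiable function of x works.
x = torch.randn(4, 8, requires_grad=True)
elementwise = torch.randn(4, 8)

output = 2.0 * x + 1.0      # placeholder op
output.retain_grad()        # keep d(loss)/d(output) so we can inspect it

loss = (output * elementwise).sum()
loss.backward()

# The gradient that flowed into `output` is exactly `elementwise`.
assert torch.equal(output.grad, elementwise)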
Example #3
HALF = True
RAND = True
dim = 2

for rows, cols, fast in sizes:
    dims = rows, cols, fast
    # Incoming gradient vectors we will use later.
    # Need to create the fp16 versions as a half() copy of a Tensor first rather than
    # a Variable, because if you create pt_input_control as a Variable and then say
    # pt_input_fp16 = pt_input_control.half(), you accidentally make pt_input_fp16 part of
    # pt_input_control's computational graph, instead of the leaf of its own separate graph.
    # (See the sketch after this example.)

    # Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero.
    if RAND:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).uniform_() * 1.0
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
        pt_input_control = torch.cuda.FloatTensor(*dims).uniform_()
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
    else:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).fill_(1.)
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).fill_(2.)
        pt_input_control = torch.cuda.FloatTensor(*dims).fill_(4.0)
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).fill_(3.0)

    pLpOutput_fp16 = pLpOutput_control.clone()
    pLpg_fp16 = pLpg_control.clone()
    pt_input_fp16 = pt_input_control.clone()
    pt_g_fp16 = pt_g_control.clone()
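
The comment at the top of this loop warns about accidentally attaching the fp16 copy to the control tensor's autograd graph. Below is a minimal sketch of that pitfall and one way around it, using present-day requires_grad tensors in place of the old Variable API; in the excerpt itself the *_fp16 tensors start out as float32 clones, and the conversion to half precision (presumably gated by HALF) happens further down, outside what is shown here.

import torch

pt_input_control = torch.randn(3, 4, requires_grad=True)

# Pitfall: half() is recorded by autograd, so this tensor is a non-leaf node
# hanging off pt_input_control's graph rather than an independent input.
pt_input_fp16_shared = pt_input_control.half()
print(pt_input_fp16_shared.is_leaf)   # False

# Fix: detach from the graph first, then convert and make the result a leaf
# of its own separate graph, as the comment above describes.
pt_input_fp16 = pt_input_control.detach().clone().half().requires_grad_()
print(pt_input_fp16.is_leaf)          # True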