Example 1
    cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)
    
    # Save fp32 copies of the input and g for the reference computation
    pt_in_fp32 = pt_in.clone()
    g_fp32     = g.clone()
    
    if HALF:
        pt_in    =    pt_in.half()
        g        =        g.half()
        cuda_out = cuda_out.half()
    
    # Fused apex kernel: writes the normalized output into cuda_out and the
    # per-slice norms into cuda_norms
    apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
    torch.cuda.synchronize()

    print("type(cuda_out) = {}\n".format(type(cuda_out)))
    
    rownorms      = pt_norm(pt_in, dim)
    rownorms_fp32 = pt_norm(pt_in_fp32, dim)
    
    print("rownorms_fp32:")
    print(rownorms_fp32)
    print("cuda_norms"    )
    print(cuda_norms   )
    
    # rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
    pt_out         = pt_in*(g/rownorms)
    pt_out_control = pt_in_fp32*(g_fp32/rownorms_fp32)
    
    compare(cuda_out, pt_out, pt_out_control, rows)
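
# The snippet relies on helpers get_norm_shape, pt_norm, and compare that are
# not shown. Minimal sketches of plausible definitions (assumptions, not the
# original code), for the common case of one norm per slice along dim == 0:

def get_norm_shape(p, dim):
    # Keep one entry per slice along dim 0; collapse the rest to 1 so the
    # result broadcasts against p.
    assert dim == 0, "sketch only covers dim == 0"
    return (p.size(0),) + (1,) * (p.dim() - 1)

def pt_norm(p, dim):
    # Reference L2 norm per slice, computed with stock PyTorch ops.
    return p.contiguous().view(p.size(0), -1).norm(2, dim=1).view(*get_norm_shape(p, dim))

def compare(cuda_out, pt_out, pt_out_control, rows):
    # rows is accepted only for signature compatibility; report how far the
    # fused kernel and the plain PyTorch result each stray from the fp32 control.
    print("max |cuda - control| =", (cuda_out.float() - pt_out_control).abs().max())
    print("max |pt   - control| =", (pt_out.float() - pt_out_control).abs().max())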
Example 2
    elementwise_prec = elementwise_prec.half()

# Variable is the pre-PyTorch-0.4 autograd wrapper; on 0.4+ it is a no-op and
# requires_grad can be set on the tensors directly.
pt_in_fp32 = Variable(pt_in_fp32, requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16, requires_grad=True)
cd_in_prec = Variable(cd_in_prec, requires_grad=True)

pt_g_fp32 = Variable(pt_g_fp32, requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)
cd_g_prec = Variable(cd_g_prec, requires_grad=True)

elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)

torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16 * (pt_g_fp16 / pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()

torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32 * (pt_g_fp32 / pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
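
# The NVTX ranges above appear as named spans when the script is run under a
# CUDA profiler; an assumed invocation (script name hypothetical):
#   nsys profile -t cuda,nvtx python weight_norm_test.py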

# print("pt_norms_fp16    = ", pt_norms_fp16   )
# print("pt_norms_fp32 = ", pt_norms_fp32)

# print( "cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
Example 3
        pLpg_fp16 = pLpg_fp16.half()
        pt_input_fp16 = pt_input_fp16.half()
        pt_g_fp16 = pt_g_fp16.half()

    pLpOutput_control = Variable(pLpOutput_control)
    pLpg_control = Variable(pLpg_control)
    pLpOutput_fp16 = Variable(pLpOutput_fp16)
    pLpg_fp16 = Variable(pLpg_fp16)

    pt_input_control = Variable(pt_input_control, requires_grad=True)
    pt_g_control = Variable(pt_g_control, requires_grad=True)
    pt_input_fp16 = Variable(pt_input_fp16, requires_grad=True)
    pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)

    # Do forward pass in fp16 and fp32
    pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
    pt_norms_control = pt_norm(pt_input_control, dim)

    pt_output_fp16 = pt_input_fp16 * (pt_g_fp16 / pt_norms_fp16)
    pt_output_control = pt_input_control * (pt_g_control / pt_norms_control)

    # Run the Cuda version
    pLpInput_cuda = torch.cuda.FloatTensor(*dims).fill_(0.)
    pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)

    if HALF:
        pLpInput_cuda = pLpInput_cuda.half()
        pLpg_cuda = pLpg_cuda.half()

    torch.cuda.nvtx.range_push("kernel weight norm backward")
    apex_C.weight_norm_bwd(pLpInput_cuda, pLpg_cuda, pLpOutput_fp16,
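
    # Hypothetical continuation (an assumption, not the original code): after
    # the fused backward kernel runs, backpropagate the same upstream gradient
    # through the PyTorch graphs and compare autograd's gradients with the
    # kernel's outputs.
    pt_output_fp16.backward(pLpOutput_fp16)
    pt_output_control.backward(pLpOutput_control)

    print("max |dL/dv - control| =",
          (pLpInput_cuda.float() - pt_input_control.grad.data).abs().max())
    print("max |dL/dg - control| =",
          (pLpg_cuda.float() - pt_g_control.grad.data).abs().max())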