cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)

# Save a copy of the input as float
pt_in_fp32 = pt_in.clone()
g_fp32 = g.clone()

if HALF:
    pt_in = pt_in.half()
    g = g.half()
    cuda_out = cuda_out.half()

apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
torch.cuda.synchronize()
# quit()

print("type(cuda_out) = {}\n".format(type(cuda_out)))

rownorms = pt_norm(pt_in, dim)
rownorms_fp32 = pt_norm(pt_in_fp32, dim)

print("rownorms_fp32:")
print(rownorms_fp32)
print("cuda_norms")
print(cuda_norms)

# rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
pt_out = pt_in * (g / rownorms)
pt_out_control = pt_in_fp32 * (g_fp32 / rownorms_fp32)

compare(cuda_out, pt_out, pt_out_control, rows)
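# ---------------------------------------------------------------------------
# pt_norm, get_norm_shape, and compare are used above but defined elsewhere in
# the test script. The two functions below are only a hypothetical sketch of
# what the first two are assumed to do (weight-norm semantics: an L2 norm over
# every dimension except `dim`, shaped so it broadcasts against the input);
# they use "_sketch" names so as not to stand in for the real helpers.
# ---------------------------------------------------------------------------
def get_norm_shape_sketch(t, dim):
    # Size 1 along every axis except `dim`, so the norms broadcast against `t`.
    return tuple(t.size(d) if d == dim else 1 for d in range(t.dim()))

def pt_norm_sketch(t, dim):
    # L2 norm over all dimensions except `dim`, computed in the tensor's own precision.
    sq = t * t
    for d in sorted((d for d in range(t.dim()) if d != dim), reverse=True):
        sq = sq.sum(d, keepdim=True)
    return sq.sqrt()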
elementwise_prec = elementwise_prec.half()

pt_in_fp32 = Variable(pt_in_fp32, requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16, requires_grad=True)
cd_in_prec = Variable(cd_in_prec, requires_grad=True)
pt_g_fp32 = Variable(pt_g_fp32, requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)
cd_g_prec = Variable(cd_g_prec, requires_grad=True)
elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)

torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16 * (pt_g_fp16 / pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()

torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32 * (pt_g_fp32 / pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()

# print("pt_norms_fp16 = ", pt_norms_fp16)
# print("pt_norms_fp32 = ", pt_norms_fp32)
# print("cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
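# The range_push/range_pop calls above do not change the computation; they only
# emit NVTX markers, which appear as labeled ranges on the profiler timeline
# (e.g. under nvprof/NVVP or Nsight Systems) so the fp16 and fp32 forward
# passes can be picked out and timed individually.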
pLpg_fp16 = pLpg_fp16.half()
pt_input_fp16 = pt_input_fp16.half()
pt_g_fp16 = pt_g_fp16.half()

pLpOutput_control = Variable(pLpOutput_control)
pLpg_control = Variable(pLpg_control)
pLpOutput_fp16 = Variable(pLpOutput_fp16)
pLpg_fp16 = Variable(pLpg_fp16)

pt_input_control = Variable(pt_input_control, requires_grad=True)
pt_g_control = Variable(pt_g_control, requires_grad=True)
pt_input_fp16 = Variable(pt_input_fp16, requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)

# Do forward pass in fp16 and fp32
pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
pt_norms_control = pt_norm(pt_input_control, dim)
pt_output_fp16 = pt_input_fp16 * (pt_g_fp16 / pt_norms_fp16)
pt_output_control = pt_input_control * (pt_g_control / pt_norms_control)

# Run the Cuda version
pLpInput_cuda = torch.cuda.FloatTensor(*dims).fill_(0.)
pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)
if HALF:
    pLpInput_cuda = pLpInput_cuda.half()
    pLpg_cuda = pLpg_cuda.half()

torch.cuda.nvtx.range_push("kernel weight norm backward")
apex_C.weight_norm_bwd(pLpInput_cuda, pLpg_cuda, pLpOutput_fp16,