def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1)):
    inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
    filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
    dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]

    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)

    inputs = gpuarray_shared_constructor(inputs_val)
    dCdH = gpuarray_shared_constructor(dCdH_val)
    shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))

    if subsample == (1, 1):
        conv_ref = CorrMM_gradWeights(subsample=subsample)(
            ref_cast(inputs), ref_cast(dCdH)
        )
        conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(inputs, dCdH)
    else:
        conv_ref = CorrMM_gradWeights(subsample=subsample)(
            ref_cast(inputs), ref_cast(dCdH), shape=shape
        )
        conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
            inputs, dCdH, shape=shape
        )

    f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
    f = theano.function([], conv_gemm, mode=mode_with_gpu)

    res_ref = f_ref()
    res = f()
    utt.assert_allclose(res_ref, res)
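# A note on the index permutations above: test shapes are supplied
# channels-last and reordered into the channels-first layout that the
# CorrMM ops expect.  A minimal standalone sketch of the same reordering
# (shape values are illustrative):
#
#     import numpy as np
#
#     nhwc = (16, 10, 12, 3)                    # (batch, rows, cols, channels)
#     nchw = tuple(nhwc[i] for i in (0, 3, 1, 2))
#     assert nchw == (16, 3, 10, 12)            # (batch, channels, rows, cols)
#     assert np.empty(nhwc).transpose(0, 3, 1, 2).shape == nchw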
def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1, 1)):
    inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
    filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    filters_val = np.random.random(filters_shape).astype(config.floatX)

    inputs = gpuarray_shared_constructor(inputs_val)
    filters = gpuarray_shared_constructor(filters_val)

    bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
    bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
    bottom_depth = (inputs_shape[4] - 1) * subsample[2] + filters_shape[4]
    bottom_shape = gpuarray_shared_constructor(
        np.array([bottom_height, bottom_width, bottom_depth])
    )

    if subsample == (1, 1, 1):
        conv_ref = Corr3dMMGradInputs(subsample=subsample)(
            kern=ref_cast(filters), topgrad=ref_cast(inputs)
        )
        conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
            kern=filters, topgrad=inputs
        )
    else:
        conv_ref = Corr3dMMGradInputs(subsample=subsample)(
            kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape
        )
        conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
            kern=filters, topgrad=inputs, shape=bottom_shape
        )

    f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
    f = theano.function([], conv_gemm, mode=mode_with_gpu)

    res_ref = f_ref()
    res = f()
    utt.assert_allclose(res_ref, res)
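# run_gradinput recovers the forward input ("bottom") shape from the top
# gradient shape by inverting the valid-convolution relation
# top = (bottom - kernel) // stride + 1 as bottom = (top - 1) * stride + kernel
# (dilation is 1 here).  The inversion is ambiguous once stride > 1, which is
# why the explicit shape argument is only passed in the subsampled branch.
# A standalone check with illustrative numbers:
#
#     top, kernel, stride = 4, 3, 2
#     bottom = (top - 1) * stride + kernel      # == 9
#     assert (bottom - kernel) // stride + 1 == top
#     # a bottom size of 10 rounds down to the same top size, hence the
#     # ambiguity under stride > 1:
#     assert (10 - kernel) // stride + 1 == top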
def run_conv_valid(
    self,
    inputs_shape,
    filters_shape,
    border_mode="valid",
    filter_dilation=(1, 1),
    subsample=(1, 1),
    unshared=False,
    verify_grad=False,
):
    inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
    if unshared:
        filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
    else:
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    filters_val = np.random.random(filters_shape).astype(config.floatX)

    inputs = gpuarray_shared_constructor(inputs_val)
    filters = gpuarray_shared_constructor(filters_val)

    conv_ref = CorrMM(
        border_mode=border_mode,
        filter_dilation=filter_dilation,
        subsample=subsample,
        unshared=unshared,
    )(ref_cast(inputs), ref_cast(filters))
    f_ref = theano.function([], conv_ref, mode=mode_without_gpu)

    conv = GpuCorrMM(
        border_mode=border_mode,
        filter_dilation=filter_dilation,
        subsample=subsample,
        unshared=unshared,
    )(inputs, filters)
    f = theano.function([], conv, mode=mode_with_gpu)

    res_ref = f_ref()
    res = f()
    utt.assert_allclose(res_ref, res)
    if verify_grad:
        utt.verify_grad(
            GpuCorrMM(
                border_mode=border_mode,
                filter_dilation=filter_dilation,
                subsample=subsample,
                unshared=unshared,
            ),
            [inputs_val, filters_val],
            mode=mode_with_gpu,
        )
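# A hypothetical invocation of run_conv_valid (shapes are illustrative, not
# taken from the test suite).  Shapes are passed channels-last and permuted
# internally; unshared=True instead expects a 6-D filter shape with
# per-output-position filter banks:
#
#     self.run_conv_valid(
#         inputs_shape=(16, 10, 12, 3),   # (batch, rows, cols, channels)
#         filters_shape=(7, 3, 3, 3),     # (nfilters, rows, cols, channels)
#         subsample=(2, 2),
#         verify_grad=True,
#     )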
def run_gradweight_runtime_algorithm(algo):
    with aesara.config.change_flags(dnn__conv__algo_bwd_filter=algo):
        inputs = TensorType(dtype, _broadcastable)()
        filters = TensorType(dtype, _broadcastable)()
        conv = dnn_conv(
            img=inputs,
            kerns=filters,
            algo=algo,
            precision=dtype,
            subsample=unit_shape,
            dilation=unit_shape,
        )
        grad_w = aesara.gradient.grad(conv.sum(), [filters])
        f = aesara.function([inputs, filters], grad_w, mode=mode_with_gpu)

        assert (
            len([
                node
                for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuDnnConvGradW)
            ])
            == 1
        )
        assert not any(
            isinstance(node.op, GpuDnnConv) for node in f.maker.fgraph.apply_nodes
        )
        assert not any(
            isinstance(node.op, GpuDnnConvGradI)
            for node in f.maker.fgraph.apply_nodes
        )

        if self.ndim == 3:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
        conv_ref = self.cpu_conv_class(subsample=unit_shape)(
            ref_cast(inputs), flipped_filters
        )
        grad_w_ref = aesara.gradient.grad(conv_ref.sum(), [filters])
        f_ref = aesara.function([inputs, filters], grad_w_ref, mode="FAST_RUN")

        runtime_shapes = self.runtime_shapes
        if algo in ("time_once", "guess_once"):
            runtime_shapes = [list(runtime_shapes[0])]
            runtime_shapes[0][0] = 5

        for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
            print("Shapes:", inputs_shape, filters_shape)
            for i in range(ntimes):
                inputs_val = np.random.random(inputs_shape).astype(dtype)
                filters_val = np.random.random(filters_shape).astype(dtype)
                gpu_res = f(inputs_val, filters_val)
                cpu_res = f_ref(inputs_val, filters_val)
                utt.assert_allclose(cpu_res, np.asarray(gpu_res))
def run_fwd_runtime_algorithm(algo):
    inputs = theano.tensor.TensorType(dtype, _broadcastable)()
    filters = theano.tensor.TensorType(dtype, _broadcastable)()
    # Scale down the input values to prevent very large absolute errors
    # due to float rounding
    lower_inputs = inputs / 10
    lower_filters = filters / 10
    conv = dnn_conv(
        img=lower_inputs,
        kerns=lower_filters,
        algo=algo,
        precision=dtype,
        subsample=unit_shape,
        dilation=unit_shape,
    )
    f = theano.function([inputs, filters], conv, mode=mode_with_gpu)

    if self.ndim == 3:
        flipped_filters = lower_filters[:, :, ::-1, ::-1, ::-1]
    else:
        flipped_filters = lower_filters[:, :, ::-1, ::-1]
    conv_ref = self.cpu_conv_class(subsample=unit_shape)(
        ref_cast(lower_inputs), flipped_filters
    )
    f_ref = theano.function([inputs, filters], conv_ref, mode="FAST_RUN")

    runtime_shapes = self.runtime_shapes
    if algo in ("time_once", "guess_once"):
        runtime_shapes = [list(runtime_shapes[0])]
        runtime_shapes[0][0] = 5

    for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
        print("Shapes:", inputs_shape, filters_shape)
        for i in range(ntimes):
            inputs_val = np.random.random(inputs_shape).astype(dtype)
            filters_val = np.random.random(filters_shape).astype(dtype)
            gpu_res = np.asarray(f(inputs_val, filters_val))
            cpu_res = f_ref(inputs_val, filters_val)
            self.scale_numpy_arrays_inplace(cpu_res, gpu_res, 1)
            utt.assert_allclose(cpu_res, gpu_res)
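# Both runtime-algorithm helpers above iterate over self.runtime_shapes,
# which, judging from the loop, holds (ntimes, (inputs_shape, filters_shape))
# pairs: each shape pair is fed to the compiled functions ntimes so that
# runtime algorithm selection is exercised across both repeated and changing
# shapes.  An illustrative (not actual) value:
#
#     runtime_shapes = [
#         (2, ((16, 3, 10, 10), (7, 3, 3, 3))),
#         (1, ((8, 3, 12, 12), (7, 3, 5, 5))),
#     ]
#
# For "time_once" and "guess_once" the helpers instead repeat the first
# shape pair five times, since those modes select an algorithm once and
# then reuse it for all subsequent calls.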
def run_conv_gradweight(self, algo, dtype, precision, parameters):
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    inputs_val = np.random.random(inputs_shape).astype(dtype)
    if beta == 0:
        filters_val = None
    else:
        filters_val = np.random.random(filters_shape).astype(dtype)
        filters_val /= 10
    topgrad_val = self.array_like_conv_output(
        inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
    )

    # Scale down the input values to prevent very large absolute errors
    # in utt.assert_allclose due to float rounding.
    inputs_val /= 10
    topgrad_val /= 10

    inputs = theano.shared(inputs_val)
    topgrad = theano.shared(topgrad_val)

    # Compile a theano function for the cuDNN implementation
    grad_w = dnn_gradweight(
        inputs,
        topgrad,
        filters_shape,
        alpha=alpha,
        beta=beta,
        out=filters_val,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = theano.function([], grad_w, mode=mode_with_gpu)

    # Compile a theano function for the reference implementation
    grad_w_ref = self.cpu_gradweight_class(
        border_mode=border_mode, subsample=subsample, filter_dilation=dilation
    )(ref_cast(inputs), ref_cast(topgrad), filters_shape[2:])
    if conv_mode == "conv":
        if inputs.ndim == 5:
            grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
        else:
            grad_w_ref = grad_w_ref[:, :, ::-1, ::-1]
    f_ref = theano.function([], grad_w_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())
    if algo in cudnn.deterministic_bwd_filter_algorithms:
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)
    if beta == 0:
        cpu_res = alpha * res_ref
    else:
        cpu_res = alpha * res_ref + beta * filters_val
    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)
def run_conv_fwd(self, algo, dtype, precision, parameters):
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    inputs_val = np.random.random(inputs_shape).astype(dtype)
    filters_val = np.random.random(filters_shape).astype(dtype)

    # Scale down the input values to prevent very large absolute errors
    # due to float rounding
    inputs_val /= 10
    filters_val /= 10

    inputs = theano.shared(inputs_val)
    filters = theano.shared(filters_val)

    if beta == 0:
        out = None
    else:
        out = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        out /= 10

    # Compile a theano function for the cuDNN implementation
    conv = dnn_conv(
        img=inputs,
        kerns=filters,
        alpha=alpha,
        beta=beta,
        out=out,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = theano.function([], conv, mode=mode_with_gpu)

    # If conv_mode is 'conv' the reference implementation should use
    # filters flipped according to the width, height and time axis
    if conv_mode == "conv":
        if inputs.ndim == 5:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
    else:
        flipped_filters = filters

    # Compile a theano function for the reference implementation
    conv_ref = self.cpu_conv_class(
        border_mode=border_mode, subsample=subsample, filter_dilation=dilation
    )(ref_cast(inputs), flipped_filters)
    f_ref = theano.function([], conv_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())
    if algo in cudnn.deterministic_fwd_algorithms:
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)
    if beta == 0:
        cpu_res = alpha * res_ref
    else:
        cpu_res = alpha * res_ref + beta * out
    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)
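# The alpha/beta parameters in run_conv_fwd and the gradient helpers follow
# cuDNN's output-blending convention,
# result = alpha * conv(inputs, filters) + beta * prior_output, which is
# exactly what cpu_res recomputes above.  A standalone numpy sketch of the
# blend (values are illustrative):
#
#     import numpy as np
#
#     conv_out = np.ones((2, 2))
#     prior = np.full((2, 2), 4.0)
#     alpha, beta = 0.5, 0.25
#     assert np.allclose(alpha * conv_out + beta * prior, 1.5)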
def run_conv_gradinput(self, algo, dtype, precision, parameters):
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    if beta == 0:
        inputs_val = None
    else:
        inputs_val = np.random.random(inputs_shape).astype(dtype)
        inputs_val /= 10
    filters_val = np.random.random(filters_shape).astype(dtype)
    topgrad_val = self.array_like_conv_output(
        inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
    )

    # Scale down the input values to prevent very large absolute errors
    # in utt.assert_allclose due to float rounding.
    filters_val /= 10
    topgrad_val /= 10

    filters = aesara.shared(filters_val)
    topgrad = aesara.shared(topgrad_val)

    # Compile an Aesara function for the cuDNN implementation
    grad_i = dnn_gradinput(
        filters,
        topgrad,
        inputs_shape,
        alpha=alpha,
        beta=beta,
        out=inputs_val,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = aesara.function([], grad_i, mode=mode_with_gpu)

    # If conv_mode is 'conv' the reference implementation should use
    # filters flipped according to the width, height and time axis
    if conv_mode == "conv":
        if filters.ndim == 5:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
    else:
        flipped_filters = filters

    # Compile an Aesara function for the reference implementation
    grad_i_ref = self.cpu_gradinput_class(
        border_mode=border_mode, subsample=subsample, filter_dilation=dilation
    )(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
    f_ref = aesara.function([], grad_i_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())
    if algo in cudnn.deterministic_bwd_data_algorithms:
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)
    if beta == 0:
        cpu_res = alpha * res_ref
    else:
        cpu_res = alpha * res_ref + beta * inputs_val
    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)