def _nd_convolution(self, n, input_channels, output_channels, batch_size,
                    stride, size, kernel, dilation, pad, order, use_bias,
                    engine, force_algo_fwd, force_algo_dgrad,
                    force_algo_wgrad, gc, dc):
    dkernel = dilation * (kernel - 1) + 1
    for op_type in ["Conv", "Conv" + str(n) + "D"]:
        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            strides=[stride] * n,
            kernels=[kernel] * n,
            dilations=[dilation] * n,
            pads=[pad] * n * 2,
            order=order,
            engine=engine,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        input_dims = [batch_size, input_channels]
        input_dims.extend([size] * n)
        filter_dims = [output_channels, input_channels]
        filter_dims.extend([kernel] * n)

        X = np.random.rand(*input_dims).astype(np.float32) - 0.5
        w = np.random.rand(*filter_dims).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NHWC":
            X = utils.NCHW2NHWC(X)
            w = utils.NCHW2NHWC(w)

        inputs = [X, w, b] if use_bias else [X, w]

        # The op is expected to fail when the padded input is smaller than
        # the dilated kernel extent.
        if size + 2 * pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
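# A minimal sketch (not part of the test suite) of the geometry the
# RuntimeError branch above checks: a dilated kernel spans
# dilation * (kernel - 1) + 1 input positions, and the convolution only has a
# valid output when the padded extent is at least that large. The helper name
# is hypothetical.
def _conv_output_size_sketch(size, kernel, stride, pad, dilation):
    dkernel = dilation * (kernel - 1) + 1  # effective (dilated) kernel extent
    padded = size + 2 * pad
    if padded < dkernel:
        raise RuntimeError("padded input smaller than dilated kernel")
    return (padded - dkernel) // stride + 1

# e.g. size=4, kernel=3, stride=1, pad=0, dilation=2 gives dkernel=5 > 4, so
# the operator is expected to fail, matching the assertRaises path above.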
def _conv_2d_shuffle_offsets(
    batch_size, kernel, dims, num_deformable_group, input_channels, output_channels
):
    o = []
    w0 = [[0 for x in range(kernel)] for y in range(kernel)]
    for y0 in range(0, kernel):
        for x0 in range(0, kernel):
            x = np.random.randint(0, kernel)
            y = np.random.randint(0, kernel)
            o.append(y - y0)
            o.append(x - x0)
            w0[y][x] += 1
    o = o * num_deformable_group
    e = []
    for v in o:
        e.append([[v] * int(dims[1])] * int(dims[0]))
    w0 = [[w0] * input_channels] * output_channels
    return (
        np.array([e] * batch_size).astype(np.float32),
        utils.NCHW2NHWC(np.array(w0).astype(np.float32)),
    )
def test_instance_norm_model_helper(self, N, C, H, W, order, epsilon, seed, is_test):
    np.random.seed(seed)
    model = model_helper.ModelHelper(name="test_model")
    brew.instance_norm(
        model, 'input', 'output', C, epsilon=epsilon, order=order, is_test=is_test
    )

    input_blob = np.random.rand(N, C, H, W).astype(np.float32)
    if order == 'NHWC':
        input_blob = utils.NCHW2NHWC(input_blob)

    self.ws.create_blob('input').feed(input_blob)

    self.ws.create_net(model.param_init_net).run()
    self.ws.create_net(model.net).run()

    if is_test:
        scale = self.ws.blobs['output_s'].fetch()
        assert scale is not None
        assert scale.shape == (C, )
        bias = self.ws.blobs['output_b'].fetch()
        assert bias is not None
        assert bias.shape == (C, )

    output_blob = self.ws.blobs['output'].fetch()
    if order == 'NHWC':
        output_blob = utils.NHWC2NCHW(output_blob)

    assert output_blob.shape == (N, C, H, W)
def channel_shuffle_ref(X):
    if order == "NHWC":
        X = utils.NHWC2NCHW(X)
    Y_r = X.reshape(
        X.shape[0], groups, X.shape[1] // groups, X.shape[2], X.shape[3]
    )
    Y_trns = Y_r.transpose((0, 2, 1, 3, 4))
    Y_reshaped = Y_trns.reshape(X.shape)
    if order == "NHWC":
        Y_reshaped = utils.NCHW2NHWC(Y_reshaped)
    return Y_reshaped
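# A small standalone check (an illustrative example, not from the test file)
# that the reshape/transpose reference above really interleaves channels
# across groups: with 2 groups and 4 channels, channel order 0,1,2,3 becomes
# 0,2,1,3.
import numpy as np

groups_demo = 2
X_demo = np.arange(4).reshape(1, 4, 1, 1).astype(np.float32)  # NCHW
Y_demo = (
    X_demo.reshape(1, groups_demo, 4 // groups_demo, 1, 1)
    .transpose((0, 2, 1, 3, 4))
    .reshape(X_demo.shape)
)
assert Y_demo[0, :, 0, 0].tolist() == [0.0, 2.0, 1.0, 3.0]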
def _get_inputs(self, N, C, H, W, order):
    input_data = np.random.rand(N, C, H, W).astype(np.float32)
    if order == 'NHWC':
        # Allocate in the same order as NCHW and transpose to make sure
        # the inputs are identical on freshly-seeded calls.
        input_data = utils.NCHW2NHWC(input_data)
    elif order != "NCHW":
        raise Exception('unknown order type ({})'.format(order))

    scale_data = np.random.rand(C).astype(np.float32)
    bias_data = np.random.rand(C).astype(np.float32)
    return input_data, scale_data, bias_data
def _get_inputs(self, N, C, H, W, order):
    input_data = np.random.rand(N, C, H, W).astype(np.float32) - 0.5

    # Push values away from zero so the gradient checker's finite differences
    # (default step size 0.05) do not straddle the kink at 0.
    input_data[np.logical_and(
        input_data >= 0, input_data <= 0.051)] = 0.051
    input_data[np.logical_and(
        input_data <= 0, input_data >= -0.051)] = -0.051

    if order == 'NHWC':
        input_data = utils.NCHW2NHWC(input_data)

    return input_data,
def test_channel_shuffle_fast_path(self, channels_per_group, n, gc, dc):
    order = "NHWC"
    groups = 4
    X = np.round(
        np.random.rand(n, channels_per_group * groups, 5, 6) * 255
    ).astype(np.float32)
    if n != 0:
        X[0, 0, 0, 0] = 0
        X[0, 0, 0, 1] = 255
    X = utils.NCHW2NHWC(X)

    net = core.Net("test_net")

    quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP")
    channel_shuffle = core.CreateOperator(
        "ChannelShuffle",
        ["X_q"],
        ["Y_q"],
        group=groups,
        kernel=1,
        order=order,
        engine="DNNLOWP",
    )
    dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP")
    net.Proto().op.extend([quantize, channel_shuffle, dequantize])

    workspace.FeedBlob("X", X)
    workspace.RunNetOnce(net)
    Y = workspace.FetchBlob("Y")

    def channel_shuffle_ref(X):
        if order == "NHWC":
            X = utils.NHWC2NCHW(X)
        Y_r = X.reshape(
            X.shape[0], groups, X.shape[1] // groups, X.shape[2], X.shape[3]
        )
        Y_trns = Y_r.transpose((0, 2, 1, 3, 4))
        Y_reshaped = Y_trns.reshape(X.shape)
        if order == "NHWC":
            Y_reshaped = utils.NCHW2NHWC(Y_reshaped)
        return Y_reshaped

    Y_ref = channel_shuffle_ref(X)
    np.testing.assert_allclose(Y, Y_ref)
def test_spatialbn_test_mode_3d(self, size, input_channels, batch_size, seed,
                                order, epsilon, inplace, engine, gc, dc):
    # Currently MIOPEN SpatialBN only supports 2D
    if hiputl.run_in_hip(gc, dc):
        assume(engine != "CUDNN")
    op = core.CreateOperator(
        "SpatialBN",
        ["X", "scale", "bias", "mean", "var"],
        ["X" if inplace else "Y"],
        order=order,
        is_test=True,
        epsilon=epsilon,
        engine=engine,
    )

    def reference_spatialbn_test(X, scale, bias, mean, var):
        if order == "NCHW":
            scale = scale[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
            bias = bias[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
            mean = mean[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
            var = var[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
        return ((X - mean) / np.sqrt(var + epsilon) * scale + bias, )

    np.random.seed(1701)
    scale = np.random.rand(input_channels).astype(np.float32) + 0.5
    bias = np.random.rand(input_channels).astype(np.float32) - 0.5
    mean = np.random.randn(input_channels).astype(np.float32)
    var = np.random.rand(input_channels).astype(np.float32) + 0.5
    X = np.random.rand(batch_size, input_channels, size, size, size)\
        .astype(np.float32) - 0.5

    if order == "NHWC":
        X = utils.NCHW2NHWC(X)
    self.assertReferenceChecks(gc, op, [X, scale, bias, mean, var],
                               reference_spatialbn_test)
    self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0])
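# Quick sanity sketch (an added example, not part of the test) of why the
# reference above only reshapes the per-channel statistics for NCHW: in NHWC
# the channel axis is already last, so 1-D arrays of length C broadcast
# correctly without any reshaping.
import numpy as np

C = 3
x_nhwc = np.random.rand(2, 4, 4, 4, C).astype(np.float32)
mean = np.zeros(C, dtype=np.float32)
var = np.ones(C, dtype=np.float32)
scale = np.ones(C, dtype=np.float32)
bias = np.zeros(C, dtype=np.float32)
# broadcasts over the trailing (channel) axis
y = (x_nhwc - mean) / np.sqrt(var + 1e-5) * scale + bias
assert y.shape == x_nhwc.shape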
def test_leaky_relu_model_helper_helper(self, N, C, H, W, order, alpha, seed):
    np.random.seed(seed)
    arg_scope = {'order': order}
    model = model_helper.ModelHelper(name="test_model", arg_scope=arg_scope)
    model.LeakyRelu('input', 'output', alpha=alpha)

    input_blob = np.random.rand(N, C, H, W).astype(np.float32)
    if order == 'NHWC':
        input_blob = utils.NCHW2NHWC(input_blob)

    self.ws.create_blob('input').feed(input_blob)

    self.ws.create_net(model.param_init_net).run()
    self.ws.create_net(model.net).run()

    output_blob = self.ws.blobs['output'].fetch()
    if order == 'NHWC':
        output_blob = utils.NHWC2NCHW(output_blob)

    assert output_blob.shape == (N, C, H, W)
def ref(input_blob, scale_blob, bias_blob):
    if order == 'NHWC':
        input_blob = utils.NHWC2NCHW(input_blob)

    mean_blob = input_blob.reshape((N, C, -1)).mean(axis=2)
    inv_stdev_blob = 1.0 / \
        np.sqrt(input_blob.reshape((N, C, -1)).var(axis=2) + epsilon)

    # _bc indicates blobs that are reshaped for broadcast
    scale_bc = scale_blob[np.newaxis, :, np.newaxis, np.newaxis]
    mean_bc = mean_blob[:, :, np.newaxis, np.newaxis]
    inv_stdev_bc = inv_stdev_blob[:, :, np.newaxis, np.newaxis]
    bias_bc = bias_blob[np.newaxis, :, np.newaxis, np.newaxis]
    normalized_blob = scale_bc * (input_blob - mean_bc) * inv_stdev_bc \
        + bias_bc

    if order == 'NHWC':
        normalized_blob = utils.NCHW2NHWC(normalized_blob)

    if not store_mean and not store_inv_stdev:
        return normalized_blob,
    elif not store_inv_stdev:
        return normalized_blob, mean_blob
    else:
        return normalized_blob, mean_blob, inv_stdev_blob
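# Standalone sketch (an added example, not from the test file) checking the
# instance-norm math used in ref(): with scale=1 and bias=0, every (n, c)
# slice of the output should have roughly zero mean and unit variance.
import numpy as np

N_demo, C_demo, H_demo, W_demo = 2, 3, 4, 5
eps = 1e-5
x = np.random.rand(N_demo, C_demo, H_demo, W_demo).astype(np.float32)
mu = x.reshape(N_demo, C_demo, -1).mean(axis=2)[:, :, None, None]
inv_std = 1.0 / np.sqrt(
    x.reshape(N_demo, C_demo, -1).var(axis=2) + eps)[:, :, None, None]
y = (x - mu) * inv_std
assert np.allclose(y.reshape(N_demo, C_demo, -1).mean(axis=2), 0, atol=1e-5)
assert np.allclose(y.reshape(N_demo, C_demo, -1).var(axis=2), 1, atol=1e-3)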
def lc_2d_nhwc(X, W, b=None):
    XT = utils.NHWC2NCHW(X)
    WT = np.transpose(W, [0, 1, 2, 5, 3, 4])
    output = lc_2d_nchw(XT, WT, b)
    return [utils.NCHW2NHWC(output[0])]
def _test_dnnlowp_nd_int(
    self, stride, pad, kernels, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    prepack_weight, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")

    ndim = len(kernels)
    X, W, b = generate_convnd_inputs(
        (stride, ) * ndim,
        (pad, ) * ndim,
        kernels,
        (dilation, ) * ndim,
        (size, ) * ndim,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP")]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"
        if fall_back_to_NCHW:
            X_nchw = utils.NHWC2NCHW(X)
            W_nchw = utils.NHWC2NCHW(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        # If output scale/zp aren't set, it gets computed from ref fp32 op
        # in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure at least one output is collected to compute output
        # scale/zp.
        do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        x_q_param = dnnlowp_utils.choose_quantization_params(X.min(), X.max())

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = \
                dnnlowp_utils.create_int8_given_tensor_fill(W, "W_q")
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * ndim,
            kernels=kernels,
            dilations=[dilation] * ndim,
            pads=[pad] * (ndim * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            dequantize_output=not do_dequantize,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = utils.NCHW2NHWC(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
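# Rough sketch of what a uint8 quantization-parameter choice such as
# choose_quantization_params(min, max) typically amounts to. This is an
# illustrative assumption, not the dnnlowp_utils implementation, which may
# differ in details (zero-point nudging, sparsity preservation, etc.).
import numpy as np

def choose_quantization_params_sketch(x_min, x_max, qmin=0, qmax=255):
    # Make sure zero is exactly representable so zero padding stays exact.
    x_min = min(x_min, 0.0)
    x_max = max(x_max, 0.0)
    scale = (x_max - x_min) / (qmax - qmin) or 1.0
    zero_point = int(np.clip(round(qmin - x_min / scale), qmin, qmax))
    return scale, zero_point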
def test_dnnlowp_group_norm(
    self, N, G, K, H, W, order, in_quantized, out_quantized,
    weight_quantized, gc, dc,
):
    C = G * K

    X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
    if order == "NHWC":
        X = utils.NCHW2NHWC(X)

    gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
    beta = np.random.randn(C).astype(np.float32) - 0.5

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("GroupNorm", ""),
        ("GroupNorm", "DNNLOWP"),
        ("Int8GroupNorm", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, gamma_q_param = \
                dnnlowp_utils.create_int8_given_tensor_fill(gamma, "gamma_q")
            net.Proto().op.extend([int8_given_tensor_fill])

            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            X_q_param = dnnlowp_utils.choose_quantization_params(X_min, X_max)
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                beta, "beta_q", X_q_param, gamma_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        group_norm = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "gamma_q" if do_quantize_weight else "gamma",
                "beta_q" if do_quantize_weight else "beta",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=0 if do_dequantize else 1,
            group=G,
            order=order,
            is_test=True,
            engine=engine,
            device_option=gc,
        )

        if do_quantize_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(group_norm, outputs[0][0])

        net.Proto().op.extend([group_norm])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("gamma").feed(gamma, device_option=gc)
        self.ws.create_blob("beta").feed(beta, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs, atol_scale=2.0)
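# For reference, a NumPy sketch of the fp32 GroupNorm that the quantized ops
# above are compared against (an illustrative assumption, NCHW layout only):
import numpy as np

def group_norm_ref_sketch(X, gamma, beta, G, eps=1e-5):
    N, C, H, W = X.shape
    # Normalize each group of C // G channels over its spatial extent.
    Xg = X.reshape(N, G, C // G, H, W)
    mu = Xg.mean(axis=(2, 3, 4), keepdims=True)
    var = Xg.var(axis=(2, 3, 4), keepdims=True)
    Y = ((Xg - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    # Per-channel affine transform.
    return Y * gamma[None, :, None, None] + beta[None, :, None, None]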
def _nd_convolution(
    self, n, input_channels_per_group, output_channels_per_group, batch_size,
    stride, size, kernel, dilation, pad, group, order, use_bias, engine,
    force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc,
):
    # TODO: Group conv in NHWC not implemented for GPU yet.
    # TODO: Group 1D conv in NCHW not implemented for GPU yet.
    assume(
        group == 1
        or (n != 1 and order == "NCHW")
        or gc.device_type == caffe2_pb2.CPU
    )
    if group != 1 and (n == 1 or order == "NHWC"):
        dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

    input_channels = group * input_channels_per_group
    output_channels = group * output_channels_per_group
    dkernel = dilation * (kernel - 1) + 1
    for op_type in ["Conv", "Conv" + str(n) + "D"]:
        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            strides=[stride] * n,
            kernels=[kernel] * n,
            dilations=[dilation] * n,
            pads=[pad] * n * 2,
            group=group,
            order=order,
            engine=engine,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        input_dims = [batch_size, input_channels]
        input_dims.extend([size] * n)
        filter_dims = [output_channels, input_channels // group]
        filter_dims.extend([kernel] * n)

        X = np.random.rand(*input_dims).astype(np.float32) - 0.5
        w = np.random.rand(*filter_dims).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NHWC":
            X = utils.NCHW2NHWC(X)
            w = utils.NCHW2NHWC(w)

        inputs = [X, w, b] if use_bias else [X, w]

        # The op is expected to fail when the padded input is smaller than
        # the dilated kernel extent.
        if size + 2 * pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
def nchw2nhwc_ref(X):
    return (utils.NCHW2NHWC(X), )
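# The reference above relies on utils.NCHW2NHWC, which moves the channel axis
# to the end; for a 4-D tensor that is equivalent to the transpose below
# (shown as a sanity sketch, assuming the caffe2.python.utils helpers).
import numpy as np

x = np.random.rand(2, 3, 4, 5).astype(np.float32)  # NCHW
x_nhwc = np.transpose(x, (0, 2, 3, 1))             # NHWC
assert x_nhwc.shape == (2, 4, 5, 3)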
def test_dnnlowp_depthwise_3x3x3_conv(
    self, stride, size, group, batch_size, prepack_weight, fuse_relu,
    share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity,
    gc, dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_convnd_inputs(
        (stride,) * 3,
        (pad,) * 3,
        (kernel,) * 3,
        (dilation,) * 3,
        (size,) * 3,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op = "ConvRelu" if fuse_relu else "Conv"
    op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        # TODO: no fall back to NCHW
        fall_back_to_NCHW = "DNNLOWP" not in engine
        if fall_back_to_NCHW:
            X_nchw = utils.NHWC2NCHW(X)
            W_nchw = utils.NHWC2NCHW(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity
            )
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * 3,
            kernels=[kernel] * 3,
            dilations=[dilation] * 3,
            pads=[pad] * (3 * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = utils.NCHW2NHWC(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)