def test_1x1_conv(self, op_type, N, G, DX, DY, H, W, use_bias, order,
                  force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc):
    """Check a 1x1 (pointwise) grouped convolution against a NumPy reference.

    N: batch size; G: group count; DX/DY: input/output channels per group;
    H/W: spatial dims.  Verifies the forward output against a per-group
    matmul reference, then runs device and gradient checks.
    """
    if hiputl.run_in_hip(gc, dc):
        # HIP/MIOpen path: restrict generated cases to NCHW layout.
        assume(order == "NCHW")
    if order == "NHWC":
        # NHWC path here is exercised without grouping.
        G = 1
    C = G * DX
    M = G * DY
    op = core.CreateOperator(
        op_type,
        ["X", "filter", "bias"] if use_bias else ["X", "filter"],
        ["Y"],
        stride_h=1,
        stride_w=1,
        pad_t=0,
        pad_l=0,
        pad_b=0,
        pad_r=0,
        kernel=1,
        order=order,
        group=G,
        force_algo_fwd=force_algo_fwd,
        force_algo_dgrad=force_algo_dgrad,
        force_algo_wgrad=force_algo_wgrad,
    )
    if order == "NCHW":
        X = np.random.randn(N, C, H, W).astype(np.float32)
        filter = np.random.randn(M, DX, 1, 1).astype(np.float32)
    else:
        X = np.random.randn(N, H, W, C).astype(np.float32)
        filter = np.random.randn(M, 1, 1, DX).astype(np.float32)
    bias = np.random.randn(M).astype(np.float32)
    inputs = [X, filter, bias] if use_bias else [X, filter]

    def conv_1x1_nchw_ref(X, filter, bias=None):
        # A 1x1 conv is a per-group matmul over flattened spatial positions.
        X = X.reshape(N, G, DX, -1)
        filter = filter.reshape(G, DY, DX)
        Y = np.zeros(shape=(N, G, DY, H * W), dtype=np.float32)
        for i in range(N):
            for j in range(G):
                Y[i, j, :, :] = np.dot(filter[j, :, :], X[i, j, :, :])
        Y = Y.reshape(N, M, H, W)
        if bias is not None:
            bias = bias.reshape(1, M, 1, 1)
            Y = np.add(Y, bias)
        return [Y]

    def conv_1x1_nhwc_ref(X, filter, bias=None):
        # Same idea with channels last; multiply on the right with W^T.
        X = X.reshape(N, -1, G, DX)
        filter = filter.reshape(G, DY, DX)
        Y = np.zeros(shape=(N, H * W, G, DY), dtype=np.float32)
        for i in range(N):
            for j in range(G):
                Y[i, :, j, :] = np.dot(
                    X[i, :, j, :], filter[j, :, :].transpose())
        Y = Y.reshape(N, H, W, M)
        if bias is not None:
            bias = bias.reshape(1, 1, 1, M)
            Y = np.add(Y, bias)
        return [Y]

    if order == "NCHW":
        conv_1x1_ref = conv_1x1_nchw_ref
    else:
        conv_1x1_ref = conv_1x1_nhwc_ref
    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=inputs,
        reference=conv_1x1_ref,
    )
    self.assertDeviceChecks(dc, op, inputs, [0])
    # Gradient check w.r.t. every input (X, filter, and optionally bias).
    for i in range(len(inputs)):
        self.assertGradientChecks(gc, op, inputs, i, [0])
def testEnforce(self):
    """Running Relu with no "X" blob fed must surface a RuntimeError."""
    relu_op = core.CreateOperator("Relu", ["X"], ["Y"])
    with self.assertRaises(RuntimeError):
        workspace.RunOperatorOnce(relu_op)
def test_cross_entropy_and_unjoied_cross_entropy_relation(
    self, log_D_trick, gc, dc
):
    """Verify mean joined vs. unjoined sigmoid cross-entropy are related
    by the ratio of sample counts on this constructed dataset.

    The unjoined dataset repeats the first 10 samples with 0-labels and
    appends 6 re-labeled positives; the final assert checks
    joined_mean * lr_size / unjoined_lr_size ~= unjoined_mean.
    """
    logits = np.array([1.4720, 0.3500, -0.6529, -1.1908, 0.8357, -1.0774,
                       -0.3395, -0.2469, 0.6708, -1.8332], dtype='f')
    targets = np.array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0.], dtype='f')
    lr_size = targets.size
    # Toggled below; both reference closures read this flag at call time.
    unjoined_lr_loss = False

    def sigmoid_xentr_logit_ref(logits, targets):
        # Forward reference: mean cross-entropy over the last axis.
        if unjoined_lr_loss:
            s = unjoined_sigmoid_cross_entropy(logits, targets)
        else:
            s = sigmoid_cross_entropy_with_logits(logits, targets)
        m = np.mean(s, axis=len(logits.shape) - 1)
        return (m, )

    def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs):
        # Gradient reference; the mean divides the gradient by inner_size.
        fwd_logits, fwd_targets = fwd_inputs
        inner_size = fwd_logits.shape[-1]
        if unjoined_lr_loss:
            m = unjoined_sigmoid_cross_entropy_grad(logits, targets)
        else:
            m = sigmoid_cross_entropy_with_logits_grad(
                fwd_logits, fwd_targets)
        # m = fwd_targets - sigmoid(fwd_logits)
        g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size
        return (g_in, None)

    op = core.CreateOperator(
        'SigmoidCrossEntropyWithLogits',
        ['logits', 'targets'],
        ['xentropy'],
        log_D_trick=log_D_trick,
        unjoined_lr_loss=unjoined_lr_loss
    )
    output_lr = self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[logits, targets],
        reference=sigmoid_xentr_logit_ref,
        output_to_grad='xentropy',
        grad_reference=sigmoid_xentr_logit_grad_ref)

    # Unjoined dataset where labels change later
    logits = np.array([1.4720, 0.3500, -0.6529, -1.1908, 0.8357, -1.0774,
                       -0.3395, -0.2469, 0.6708, -1.8332, 1.4720, 0.3500,
                       -0.6529, -1.1908, 0.8357, -1.0774], dtype='f')
    targets = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
                        1., 1., 1., 1.], dtype='f')
    unjoined_lr_loss = True
    unjoined_lr_size = targets.size

    op = core.CreateOperator(
        'SigmoidCrossEntropyWithLogits',
        ['logits', 'targets'],
        ['xentropy'],
        log_D_trick=log_D_trick,
        unjoined_lr_loss=unjoined_lr_loss
    )
    outputs_unjoined_lr = self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[logits, targets],
        reference=sigmoid_xentr_logit_ref,
        output_to_grad='xentropy',
        grad_reference=sigmoid_xentr_logit_grad_ref)

    self.assertAlmostEqual(
        output_lr[0].item(0) * lr_size / unjoined_lr_size,
        outputs_unjoined_lr[0].item(0),
        delta=0.0001)
def test_groupwise_dnnlowp_conv_relu_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    gc,
    dc,
):
    """Compare fp32 Conv+Relu against group-wise quantized DNNLOWP
    ConvRelu variants and assert the quantized results are close.
    """
    assume(group == 1 or dilation == 1)
    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        True,  # group-wise
    )
    Output = collections.namedtuple("Output",
                                    ["Y", "op_type", "engine", "order"])
    outputs = []
    # First entry is the fp32 baseline; its output seeds the static
    # quantization params used by the DNNLOWP entries below.
    op_engine_list = [
        ("Conv", ""),
        ("ConvRelu", "DNNLOWP"),
        ("ConvRelu", "DNNLOWP_16"),
        ("Int8ConvRelu", "DNNLOWP"),
    ]
    for op_type, engine in op_engine_list:
        net = core.Net("test_net")
        if "DNNLOWP" in engine:
            quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                           engine=engine, device_option=gc)
            net.Proto().op.extend([quantize])
            conv = core.CreateOperator(
                op_type,
                ["X_q", "W", "b"],
                ["Y_q"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])
            dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                             engine=engine,
                                             device_option=gc)
            net.Proto().op.extend([dequantize])
        else:
            # fp32 baseline: plain Conv followed by an explicit Relu.
            conv = core.CreateOperator(
                op_type,
                ["X", "W", "b"],
                ["Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                engine=engine,
                group=group,
                device_option=gc,
            )
            net.Proto().op.extend([conv])
            relu = core.CreateOperator("Relu", ["Y"], ["Y"],
                                       engine=engine, device_option=gc)
            net.Proto().op.extend([relu])
        run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order,
                       gc, outputs)
    check_quantized_results_close(outputs)
def test_layernorm(self, seed):
    """Compare LayerNorm lowered through onnxifi (Glow) against the
    LayerNormFakeFP16NNPI reference net, at fp16 precision.
    """
    np.random.seed(seed)
    # Reset the workspace
    size = 4
    input_channels = 4
    batch_size = 1
    axis = 1
    epsilon = 1e-4

    workspace.ResetWorkspace()
    dims = np.array(([batch_size, input_channels, size, size]))
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    # gamma/beta cover all dims after the normalization axis.
    gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
    beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "gamma", "beta"])
    pred_net.external_output.extend(["Y", "mean", "rstd"])
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNorm",
            ["X", "gamma", "beta"],
            ["Y", "mean", "rstd"],
            axis=1,
            epsilon=epsilon,
            elementwise_affine=True
        )
    )
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred_ref"
    pred_net_ref.external_input.extend(["X", "gamma", "beta"])
    pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNormFakeFP16NNPI",
            ["X", "gamma", "beta"],
            ["Y", "mean", "rstd"],
            axis=1,
            epsilon=epsilon,
            elementwise_affine=True
        )
    )
    # NOTE: "shape_hits" looks like a typo for "shape_hints", but it is
    # only a local name — the call below is consistent.
    shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hits,
        debug=True,
        adjust_batch=True,
        use_onnx=False
    )
    # The whole net must have been swallowed into a single Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("gamma", gamma)
    workspace.FeedBlob("beta", beta)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)
    # Both nets write to "Y"; fetch after each run before the other clobbers it.
    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
        print_test_debug_info(
            "layernorm",
            {
                "seed": seed,
                "size": size,
                "input_channels": input_channels,
                "batch_size": batch_size,
                "epsilon": epsilon,
                "axis": axis,
                "X": X,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "diff_Y": diff_Y,
            }
        )
        # Fail after dumping debug info.
        assert(0)
def test_convert_end2end(self):
    """Round-trip an FC net Caffe2 -> ONNX -> Caffe2 five times via the
    CLI converters, checking numerical output against NumPy each pass.
    """
    predict_net_f = tempfile.NamedTemporaryFile()
    init_net_f = tempfile.NamedTemporaryFile()
    onnx_model_f = tempfile.NamedTemporaryFile()
    x = 'X'
    w = 'W'
    b = 'b'
    y = 'Y'

    predict_net = caffe2_pb2.NetDef()
    predict_net.name = 'test-convert-end2end'
    predict_net.external_input[:] = [x, w, b]
    predict_net.external_output[:] = [y]
    predict_net.op.extend([
        core.CreateOperator(
            'FC',
            inputs=[x, w, b],
            outputs=[y],
            axis=2,
        ),
    ])
    predict_net_f.write(predict_net.SerializeToString())
    predict_net_f.flush()

    init_net = caffe2_pb2.NetDef()
    init_net.name = 'test-convert-end2end-init'
    init_net.external_output[:] = [w, b]
    x_val = np.random.randn(1, 3, 2).astype(np.float32)
    w_val = np.random.randn(4, 2).astype(np.float32)
    b_val = np.random.randn(4).astype(np.float32)
    init_net.op.extend([
        core.CreateOperator(
            'GivenTensorFill',
            [],
            [w],
            values=w_val,
            shape=w_val.shape,
        ),
        core.CreateOperator(
            'GivenTensorFill',
            [],
            [b],
            values=b_val,
            shape=b_val.shape,
        ),
    ])
    init_net_f.write(init_net.SerializeToString())
    init_net_f.flush()

    # Expected FC output: x @ W^T + b.
    y_val = np.matmul(x_val, w_val.transpose()) + b_val
    # Iterate the conversion cycle to ensure it is stable.
    for _ in range(5):
        self._run_command(
            caffe2_to_onnx, [
                predict_net_f.name,
                '--caffe2-init-net', init_net_f.name,
                '--output', onnx_model_f.name,
                '--value-info',
                json.dumps({
                    x: (TensorProto.FLOAT, (1, 3, 2)),
                }),
            ],
            catch_exceptions=False,
        )

        onnx_model_f.seek(0)
        onnx_model = ModelProto()
        onnx_model.ParseFromString(onnx_model_f.read())
        np.testing.assert_almost_equal(
            c2.run_model(
                onnx_model, {onnx_model.graph.input[0].name: x_val}),
            [y_val])

        self._run_command(
            onnx_to_caffe2, [
                onnx_model_f.name,
                '--output', predict_net_f.name,
                '--init-net-output', init_net_f.name,
            ])
        predict_net_f.seek(0)
        predict_net = caffe2_pb2.NetDef()
        predict_net.ParseFromString(predict_net_f.read())
        init_net_f.seek(0)
        init_net = caffe2_pb2.NetDef()
        init_net.ParseFromString(init_net_f.read())
        # Input blob name may change across the round trip; re-read it.
        x = predict_net.external_input[0]
        np.testing.assert_almost_equal(
            c2_native_run_net(
                init_net=init_net,
                predict_net=predict_net,
                inputs={x: x_val})[1],
            [y_val])
def test_dnnlowp_average_pool( self, ndim, stride, pad, kernel, size, input_channels, batch_size, order, in_quantized, gc, dc, ): kernel = 2 # Only kernel size 2 is supported assume(kernel <= size) assume(pad < kernel) C = input_channels N = batch_size strides = (stride, ) * ndim pads = (pad, ) * (ndim * 2) kernels = (kernel, ) * ndim sizes = (size, ) * ndim # X has scale 1, so no input quantization error min_ = -100 max_ = min_ + 255 if order == "NCHW": X = np.round( np.random.rand(*((N, C) + sizes)) * (max_ - min_) + min_) X = X.astype(np.float32) X[(0, ) * (ndim + 2)] = min_ X[(0, ) * (ndim + 1) + (1, )] = max_ elif order == "NHWC": X = np.round( np.random.rand(*((N, ) + sizes + (C, ))) * (max_ - min_) + min_) X = X.astype(np.float32) X[(0, ) * (ndim + 2)] = min_ X[(0, 1) + (0, ) * ndim] = max_ Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("AveragePool", ""), ("AveragePool", "DNNLOWP"), ("Int8AveragePool", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) max_pool = core.CreateOperator( op_type, ["X_q" if do_quantize else "X"], ["Y_q" if engine == "DNNLOWP" else "Y"], strides=strides, kernels=kernels, pads=pads, order=order, engine=engine, device_option=gc, ) net.Proto().op.extend([max_pool]) if engine == "DNNLOWP": dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
def testCreateWithNoneKwarg(self):
    """A None-valued operator argument must be rejected with ValueError."""
    self.assertRaises(
        ValueError, core.CreateOperator, "Ludicrous", "x", "y", arg1=None
    )
def initializer(blob_name):
    """Build a fill operator that writes the captured `array` into `blob_name`."""
    flat_values = array.flatten().tolist()
    return core.CreateOperator(
        op_name, [], blob_name, shape=array.shape, values=flat_values
    )
def hard_sigmoid(x):
    """Hard sigmoid via the Clip operator: clip((x + 1) / 2, 0, 1)."""
    shifted = (x + 1.) / 2.
    workspace.FeedBlob("x", shifted)
    clip_op = core.CreateOperator("Clip", ["x"], ["x"], min=0., max=1.)
    workspace.RunOperatorOnce(clip_op)
    return workspace.FetchBlob("x")
def share_freeze_blobs(
    net,
    namescope,
):
    """Rewrite `net` so that intermediate activations between the first
    Conv and the StopGradient op (within `namescope`) are renamed into a
    shared-blob pool, then insert Free ops to release pool blobs that are
    no longer used past StopGradient.  Returns the rewritten NetDef; the
    input net is not modified.  NOTE(review): relies on protobuf internals
    via `net._net` — order of the rewrite passes below is significant.
    """
    log.warn("NOTE: Executing memonger to optimize gradient memory")
    # Collect ops that have something to do with gradients
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"
    netproto = copy.deepcopy(net.Proto())
    new_net = copy.deepcopy(net)
    activations = []
    external_input = set(new_net.Proto().external_input)
    external_output = set(new_net.Proto().external_output)
    start_idx = -1
    end_idx = -1
    # ops
    # Locate the region to optimize: first Conv up to the StopGradient op.
    for idx, op in enumerate(new_net._net.op):
        # print(op)
        if namescope not in op.input[0]:
            continue
        if op.type == 'Conv' and start_idx < 0:
            start_idx = idx
        if op.type == 'StopGradient':
            end_idx = idx
    # print(namescope, 'start_idx: ', start_idx, ' end_idx: ', end_idx)

    # Hacky way to get activations, think of a better way
    for idx, op in enumerate(new_net._net.op[start_idx:end_idx]):
        if namescope not in op.input[0]:
            continue
        for b in op.output:
            if b not in external_output:
                activations.append(b)
    # print('activations: ', activations)

    used_activations = []
    for a in activations:
        if a in used_activations:
            continue
        # Candidate shared names; entries already taken downstream are
        # removed below, and the last remaining one is assigned to `a`.
        share_pool = [
            namescope + '_shared_' + str(i) for i in range(1000, 10000)
        ]
        # print(a)
        # First op that touches activation `a`.
        first_idx = -1
        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            if a in list(op.input) + list(op.output):
                first_idx = idx
                break
        assert first_idx >= 0, first_idx
        # Drop pool names already used from first_idx onward.
        for idx, op in enumerate(new_net._net.op[first_idx:]):
            if namescope not in op.input[0]:
                continue
            for b in list(op.input) + list(op.output):
                if b in share_pool:
                    share_pool.remove(b)
        # Rename every occurrence of `a` (inputs and outputs) in place.
        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            op_input = copy.deepcopy(op.input)
            is_found = False
            for i, b in enumerate(op_input):
                if a == b:
                    op_input[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].input[:]
                new_net._net.op[idx].input.extend(op_input)
            op_output = copy.deepcopy(op.output)
            is_found = False
            for i, b in enumerate(op_output):
                if a == b:
                    op_output[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].output[:]
                new_net._net.op[idx].output.extend(op_output)
        used_activations.append(a)

    assert verify_graph_equality(net.Proto(), new_net.Proto()), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), new_net.Proto()), \
        "Inplace assignments differ in memonger net."

    # Record the last op index that touches each shared blob; blobs still
    # referenced after StopGradient are dropped from the Free candidates.
    share_pool = [namescope + '_shared_' + str(i) for i in range(1000, 10000)]
    share_pool_used = {}
    for idx, op in enumerate(new_net._net.op):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool:
                share_pool_used[b] = idx
    for idx, op in enumerate(new_net._net.op[end_idx:]):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool_used.keys():
                share_pool_used.pop(b)
    # Insert a Free op right after the last use of each shared blob.
    ops = list(new_net._net.op)
    for inp in share_pool_used.keys():
        # print('free: ', inp)
        # new_net.Free([inp], [inp])
        ops.insert(share_pool_used[inp] + 1,
                   core.CreateOperator("Free", [inp], [inp]))
    del new_net._net.op[:]
    new_net._net.op.extend(ops)
    return new_net.Proto()
def tt(x):
    """Pass x through the Clip operator (min=0, max=1) and return the result."""
    workspace.FeedBlob("x", x)
    clip_op = core.CreateOperator("Clip", ["x"], ["x"], min=0., max=1.)
    workspace.RunOperatorOnce(clip_op)
    return workspace.FetchBlob("x")
def test_dnnlowp_elementwise_add_int(self, N, is_empty, in_quantized,
                                     out_quantized, in_place, gc, dc):
    """Compare fp32 Add against DNNLOWP quantized Add variants, covering
    empty inputs and in-place output into either operand.
    """
    if is_empty:
        N = 0
    # FIXME: DNNLOWP Add doesn't support inplace operation and
    # dequantize_output=1 at the same time
    if in_place[0] or in_place[1]:
        in_quantized = True
        out_quantized = True

    # A has scale 1, so exactly represented after quantization
    min_ = -100
    max_ = min_ + 255
    A = np.round(np.random.rand(N) * (max_ - min_) + min_)
    A = A.astype(np.float32)
    if N != 0:
        # Pin the extremes so the quantization range is exact.
        A[0] = min_
        A[1] = max_

    # B has scale 1/2, so exactly represented after quantization
    B = np.round(np.random.rand(N) * 255 / 2 - 64).astype(np.float32)
    if N != 0:
        B[0] = -64
        B[1] = 127.0 / 2

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []
    op_engine_list = [("Add", ""), ("Add", "DNNLOWP"),
                      ("Int8Add", "DNNLOWP")]
    for op_type, engine in op_engine_list:
        net = core.Net("test_net")
        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        if do_quantize:
            quantize_A = core.CreateOperator("Quantize", ["A"], ["A_q"],
                                             engine=engine,
                                             device_option=gc)
            net.Proto().op.extend([quantize_A])
            quantize_B = core.CreateOperator("Quantize", ["B"], ["B_q"],
                                             engine=engine,
                                             device_option=gc)
            net.Proto().op.extend([quantize_B])
        # Output blob: "Y", or one of the inputs when running in place.
        out = "Y"
        if in_place[0]:
            out = "A"
        elif in_place[1]:
            out = "B"
        add = core.CreateOperator(
            op_type,
            ["A_q", "B_q"] if do_quantize else ["A", "B"],
            [(out + "_q") if do_dequantize else out],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        net.Proto().op.extend([add])
        if do_dequantize:
            dequantize = core.CreateOperator("Dequantize", [out + "_q"],
                                             [out], engine=engine,
                                             device_option=gc)
            net.Proto().op.extend([dequantize])
        self.ws.create_blob("A").feed(A, device_option=gc)
        self.ws.create_blob("B").feed(B, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs[out].fetch(), op_type=op_type,
                   engine=engine))
    check_quantized_results_close(outputs)
def test_mkl_sigmoid(self, n, m, inplace, gc, dc):
    """Device-check the Sigmoid operator, optionally running in place."""
    out_blob = "X" if inplace else "Y"
    data = np.random.rand(m, n).astype(np.float32)
    sigmoid_op = core.CreateOperator("Sigmoid", ["X"], [out_blob])
    self.assertDeviceChecks(dc, sigmoid_op, [data], [0])
def test_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    prepack_weight,
    nbits_in_non_outlier,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare fp32 Conv against DNNLOWP_ACC16 variants with outlier-aware
    weight handling (`nbits_in_non_outlier`), optional pre-quantized and
    pre-packed weights.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    if nbits_in_non_outlier == 0:
        # All weights treated as outliers; use the generic generator.
        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )
    else:
        # Hand-built NHWC inputs with pinned extreme values so the
        # quantization ranges are exact.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size,
                           input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2
             + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output",
                                    ["Y", "op_type", "engine", "order"])
    outputs = []
    # fp32 baseline first; its output provides the static output
    # quantization range for the ACC16 runs.
    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]
    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")
        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = "DNNLOWP" in engine and weight_quantized
        do_prepack_weight = "DNNLOWP" in engine and prepack_weight
        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])
        x_q_param = dnnlowp_utils.choose_quantization_params(
            X.min(), X.max(), preserve_activation_sparsity)
        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity)
            init_net.Proto().op.extend([int8_given_tensor_fill])
            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param)
            init_net.Proto().op.extend([int8_bias_tensor_fill])
        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                nbits_in_non_outlier=nbits_in_non_outlier,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])
        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity)
        net.Proto().op.extend([conv])
        if do_dequantize:
            dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                             engine="DNNLOWP",
                                             device_option=gc)
            net.Proto().op.extend([dequantize])
        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(
            Output(Y=Y, op_type=op_type, engine=engine, order=order))
    check_quantized_results_close(outputs,
                                  symmetric=preserve_activation_sparsity)
def test_upsample_grad(self, height_scale, width_scale, height, width,
                       num_channels, batch_size, seed, gc, dc):
    """Check UpsampleBilinearGradient (scales via op args and via an
    input blob) against a scatter-add bilinear-weights reference.
    """
    np.random.seed(seed)
    output_height = np.int32(height * height_scale)
    output_width = np.int32(width * width_scale)
    X = np.random.rand(batch_size, num_channels, height,
                       width).astype(np.float32)
    dY = np.random.rand(batch_size, num_channels, output_height,
                        output_width).astype(np.float32)
    scales = np.array([height_scale, width_scale]).astype(np.float32)
    # Two ways of passing the scales: as op arguments and as a blob.
    ops = [
        (
            core.CreateOperator(
                "UpsampleBilinearGradient",
                ["dY", "X"],
                ["dX"],
                width_scale=width_scale,
                height_scale=height_scale,
            ),
            [dY, X],
        ),
        (
            core.CreateOperator(
                "UpsampleBilinearGradient",
                ["dY", "X", "scales"],
                ["dX"],
            ),
            [dY, X, scales],
        ),
    ]
    for op, inputs in ops:
        def ref(dY, X, scales=None):
            # Scatter each output-gradient element to its 4 bilinear
            # source pixels, weighted by the interpolation lambdas.
            dX = np.zeros_like(X)
            rheight = ((height - 1) / (output_height - 1)
                       if output_height > 1 else float(0))
            rwidth = ((width - 1) / (output_width - 1)
                      if output_width > 1 else float(0))
            for i in range(output_height):
                h1r = rheight * i
                h1 = int(h1r)
                h1p = 1 if h1 < height - 1 else 0
                h1lambda = h1r - h1
                h0lambda = float(1) - h1lambda
                for j in range(output_width):
                    w1r = rwidth * j
                    w1 = int(w1r)
                    w1p = 1 if w1 < width - 1 else 0
                    w1lambda = w1r - w1
                    w0lambda = float(1) - w1lambda
                    dX[:, :, h1, w1] += (
                        h0lambda * w0lambda * dY[:, :, i, j])
                    dX[:, :, h1, w1 + w1p] += (
                        h0lambda * w1lambda * dY[:, :, i, j])
                    dX[:, :, h1 + h1p, w1] += (
                        h1lambda * w0lambda * dY[:, :, i, j])
                    dX[:, :, h1 + h1p, w1 + w1p] += (
                        h1lambda * w1lambda * dY[:, :, i, j])
            return dX,
        self.assertDeviceChecks(dc, op, inputs, [0])
        self.assertReferenceChecks(gc, op, inputs, ref)
def test_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare fp32 Conv against DNNLOWP_ACC16 (16-bit accumulation)
    quantized Conv variants, with inputs crafted to avoid accumulator
    overflow.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so exactly represented after quantization
    # This was made sure by having at least one 0 and one 255 for unsigned
    # 8-bit tensors, and at least one -128 and one 127 for signed 8-bit
    # tensors.
    # Since fbgemm_acc16 accumulates to 16-bit, To avoid overflow, we use
    # small numbers except for those 0, 255, -128, and 127, for this test
    # We also make sure 255, -128, or 127 are not multiplied together by
    # putting them in different input channels and the corresponding input
    # channel in other matrix is 0.
    # For example, we put 255 in input channel 1 in X, so we make the
    # corresponding input channel in W all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (np.random.rand(output_channels, kernel, kernel,
                        input_channels_per_group) * 4 - 2 + W_min + 128)
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    # Inputs are generated channels-last; convert for NCHW runs.
    if order == "NCHW":
        X = nhwc2nchw(X)
        W = nhwc2nchw(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output",
                                    ["Y", "op_type", "engine", "order"])
    outputs = []
    # fp32 baseline first; its output provides the static output
    # quantization range when quantized weights are used.
    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]
    for op_type, engine in op_engine_list:
        net = core.Net("test_net")
        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        # Weight quantization needs the fp32 output range, hence the
        # len(outputs) > 0 guard.
        do_quantize_weight = ("DNNLOWP" in engine and weight_quantized
                              and len(outputs) > 0)
        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])
        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity)
            net.Proto().op.extend([int8_given_tensor_fill])
            # Bias
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity)
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param)
            net.Proto().op.extend([int8_bias_tensor_fill])
        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity)
        net.Proto().op.extend([conv])
        if do_dequantize:
            dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                             engine="DNNLOWP",
                                             device_option=gc)
            net.Proto().op.extend([dequantize])
        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(
            Output(Y=Y, op_type=op_type, engine=engine, order=order))
    check_quantized_results_close(outputs,
                                  symmetric=preserve_activation_sparsity)
def test_upsample(self, height_scale, width_scale, height, width,
                  num_channels, batch_size, seed, gc, dc):
    """Check UpsampleBilinear (scales via op args and via an input blob)
    against a per-pixel bilinear interpolation reference, plus device and
    gradient checks.
    """
    np.random.seed(seed)
    X = np.random.rand(
        batch_size, num_channels, height, width).astype(np.float32)
    scales = np.array([height_scale, width_scale]).astype(np.float32)
    # Two ways of passing the scales: as op arguments and as a blob.
    ops = [
        (
            core.CreateOperator(
                "UpsampleBilinear",
                ["X"],
                ["Y"],
                width_scale=width_scale,
                height_scale=height_scale,
            ),
            [X],
        ),
        (
            core.CreateOperator(
                "UpsampleBilinear",
                ["X", "scales"],
                ["Y"],
            ),
            [X, scales],
        ),
    ]
    for op, inputs in ops:
        def ref(X, scales=None):
            output_height = np.int32(height * height_scale)
            output_width = np.int32(width * width_scale)
            # Initial contents are irrelevant: every element of Y is
            # overwritten in the loops below.
            Y = np.random.rand(
                batch_size, num_channels, output_height,
                output_width).astype(np.float32)
            rheight = ((height - 1) / (output_height - 1)
                       if output_height > 1 else float(0))
            rwidth = ((width - 1) / (output_width - 1)
                      if output_width > 1 else float(0))
            for i in range(output_height):
                h1r = rheight * i
                h1 = int(h1r)
                h1p = 1 if h1 < height - 1 else 0
                h1lambda = h1r - h1
                h0lambda = float(1) - h1lambda
                for j in range(output_width):
                    w1r = rwidth * j
                    w1 = int(w1r)
                    w1p = 1 if w1 < width - 1 else 0
                    w1lambda = w1r - w1
                    w0lambda = float(1) - w1lambda
                    # Bilinear blend of the 4 neighboring source pixels.
                    Y[:, :, i, j] = (h0lambda * (
                        w0lambda * X[:, :, h1, w1] +
                        w1lambda * X[:, :, h1, w1 + w1p]) +
                        h1lambda * (w0lambda * X[:, :, h1 + h1p, w1] +
                                    w1lambda * X[:, :, h1 + h1p,
                                                 w1 + w1p]))
            return Y,
        self.assertReferenceChecks(gc, op, inputs, ref)
        self.assertDeviceChecks(dc, op, inputs, [0])
        self.assertGradientChecks(gc, op, inputs, 0, [0], stepsize=0.1,
                                  threshold=1e-2)
# Let's print the current workspace. Note that there is nothing in the # workspace yet. print("Current workspace: {}".format(workspace.CurrentWorkspace())) print("Current blobs in the workspace: {}".format(workspace.Blobs())) workspace.SwitchWorkspace("default") print("Current workspace: {}".format(workspace.CurrentWorkspace())) print("Current blobs in the workspace: {}".format(workspace.Blobs())) workspace.ResetWorkspace() # Create an operator. op = core.CreateOperator( "Relu", # The type of operator that we want to run ["X"], # A list of input blobs by their names ["Y"], # A list of output blobs by their names ) # and we are done! print("Type of the created op is: {}".format(type(op))) print("Content:\n") print(str(op)) workspace.FeedBlob("X", np.random.randn(2, 3).astype(np.float32)) workspace.RunOperatorOnce(op) print("Current blobs in the workspace: {}\n".format(workspace.Blobs())) print("X:\n{}\n".format(workspace.FetchBlob("X"))) print("Y:\n{}\n".format(workspace.FetchBlob("Y"))) print("Expected:\n{}\n".format(np.maximum(workspace.FetchBlob("X"), 0)))
def test_broadcast_powt(self, gc, dc):
    """Check the broadcasting Pow operator and its gradients against a
    NumPy reference for several broadcast layouts of the exponent:
    trailing dims, interior dims (axis=1), leading dim (axis=0), and
    singleton dims at both ends.
    """
    np.random.seed(101)

    # operator: elementwise X ** Y
    def powt_op(X, Y):
        return [np.power(X, Y)]

    # two gradients Y*X^(Y-1) and X^Y * ln(X)
    # (stacking both into one array and multiplying broadcasts g_out over
    # the pair at once; the result unpacks back into [GX, GY])
    def powt_grad(g_out, outputs, fwd_inputs):
        [X, Y] = fwd_inputs
        Z = outputs[0]
        return ([Y * np.power(X, Y - 1), Z * np.log(X)] * g_out)

    # 1. Set broadcast and no axis, i.e. broadcasting last dimensions.
    # Inputs are shifted away from 0 so log(X) and X**(Y-1) stay finite.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
    Y = np.random.rand(4, 5).astype(np.float32) + 2.0

    # two gradients Y*X^(Y-1) and X^Y * ln(X)
    # latter gradient is summed over dims 1 and 0 to account for broadcast
    def powt_grad_broadcast(g_out, outputs, fwd_inputs):
        [GX, GY] = powt_grad(g_out, outputs, fwd_inputs)
        return ([GX, np.sum(np.sum(GY, 1), 0)])

    op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1)
    self.assertReferenceChecks(device_option=gc,
                               op=op,
                               inputs=[X, Y],
                               reference=powt_op,
                               output_to_grad="Z",
                               grad_reference=powt_grad_broadcast)

    # 2. broadcasting intermediate dimensions
    X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
    Y = np.random.rand(3, 4).astype(np.float32) + 2.0

    # pow op with the latter array increased by one dim
    def powt_op_axis1(X, Y):
        return powt_op(X, Y[:, :, np.newaxis])

    # two gradients Y*X^(Y-1) and X^Y * ln(X)
    # latter gradient is summed over dims 3 and 0 to account for broadcast
    def powt_grad_axis1(g_out, outputs, fwd_inputs):
        [X, Y] = fwd_inputs
        [GX, GY] = powt_grad(g_out, outputs, [X, Y[:, :, np.newaxis]])
        return ([GX, np.sum(np.sum(GY, 3), 0)])

    op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=1)
    self.assertReferenceChecks(device_option=gc,
                               op=op,
                               inputs=[X, Y],
                               reference=powt_op_axis1,
                               output_to_grad="Z",
                               grad_reference=powt_grad_axis1)

    # 3. broadcasting the first dimension
    X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
    Y = np.random.rand(2).astype(np.float32) + 2.0

    # pow op with the latter array increased by one dim
    def powt_op_axis0(X, Y):
        return powt_op(X, Y[:, np.newaxis, np.newaxis, np.newaxis])

    # two gradients Y*X^(Y-1) and X^Y * ln(X)
    # latter gradient is summed over dims 3, 2 and 1 to account for
    # broadcast
    def powt_grad_axis0(g_out, outputs, fwd_inputs):
        [X, Y] = fwd_inputs
        [GX, GY] = powt_grad(g_out, outputs,
                             [X, Y[:, np.newaxis, np.newaxis, np.newaxis]])
        return ([GX, np.sum(np.sum(np.sum(GY, 3), 2), 1)])

    op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=0)
    self.assertReferenceChecks(device_option=gc,
                               op=op,
                               inputs=[X, Y],
                               reference=powt_op_axis0,
                               output_to_grad="Z",
                               grad_reference=powt_grad_axis0)

    # 4. broadcasting with single elem dimensions at both ends
    X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
    Y = np.random.rand(1, 4, 1).astype(np.float32) + 2.0

    # pow op with the latter array increased by one dim
    def powt_op_mixed(X, Y):
        return powt_op(X, Y[np.newaxis, :, :, :])

    # two gradients Y*X^(Y-1) and X^Y * ln(X)
    # latter gradient is summed over dims 0 and 1 to account for broadcast,
    # then reshaped back to Y's (1, 4, 1) shape
    def powt_grad_mixed(g_out, outputs, fwd_inputs):
        [X, Y] = fwd_inputs
        [GX, GY] = powt_grad(g_out, outputs, [X, Y[np.newaxis, :, :, :]])
        return ([
            GX,
            np.reshape(np.sum(np.sum(np.sum(GY, 3), 1), 0), (1, 4, 1))
        ])

    op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=1)
    self.assertReferenceChecks(device_option=gc,
                               op=op,
                               inputs=[X, Y],
                               reference=powt_op_mixed,
                               output_to_grad="Z",
                               grad_reference=powt_grad_mixed)
def test_dnnlowp_max_pool(
    self,
    stride,
    pad,
    kernel,
    size,
    input_channels,
    batch_size,
    order,
    in_quantized,
    gc,
    dc,
):
    """Compare fp32 MaxPool against the DNNLOWP quantized implementations.

    Max pooling only selects existing input values, so the quantized
    outputs should match the fp32 output up to input-quantization error
    alone (see the final check).
    """
    assume(kernel <= size)
    assume(pad < kernel)

    C = input_channels
    N = batch_size
    H = W = size

    # Integer-valued inputs in [-10, 20] keep the quantization error of
    # the input easy to bound.
    min_ = -10
    max_ = 20
    if order == "NCHW":
        X = np.round(np.random.rand(N, C, H, W) * (max_ - min_) + min_)
    elif order == "NHWC":
        X = np.round(np.random.rand(N, H, W, C) * (max_ - min_) + min_)
    X = X.astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    # First entry is the fp32 reference; the rest are quantized variants.
    op_engine_list = [
        ("MaxPool", ""),
        ("MaxPool", "DNNLOWP"),
        ("Int8MaxPool", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        # Optionally quantize the input explicitly with a Quantize op;
        # otherwise the DNNLOWP operator quantizes internally.
        do_quantize = "DNNLOWP" in engine and in_quantized

        if do_quantize:
            quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                           engine=engine,
                                           device_option=gc)
            net.Proto().op.extend([quantize])

        max_pool = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X"],
            ["Y_q" if engine == "DNNLOWP" else "Y"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            order=order,
            engine=engine,
            device_option=gc,
        )
        net.Proto().op.extend([max_pool])

        # Quantized output needs a trailing Dequantize so all variants
        # produce a comparable fp32 "Y".
        if engine == "DNNLOWP":
            dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                             engine=engine,
                                             device_option=gc)
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(),
                   op_type=op_type,
                   engine=engine))

    # Y_i = max(X_j) so the only error is in quantization of inputs
    check_quantized_results_close(outputs, ref=X)
def test_sum_reduce(self, gc, dc):
    """Check SumReduceLike against explicit NumPy reductions for several
    broadcast layouts: trailing dims, leading dims (axis=0), interior
    dims (axis=1), full reduction to a 1-element tensor, and singleton
    dims at both ends of Y.
    """
    # Broadcasting last dimensions: Y matches X's trailing (4, 5) dims,
    # so the leading dims are reduced away.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(4, 5).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=0)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # Broadcasting first dimensions (axis=0): Y matches X's leading
    # (2, 3) dims, so the trailing dims are reduced away.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(2, 3).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=3)
    res = np.sum(res, axis=2)
    np.testing.assert_array_almost_equal(out, res, decimal=3)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # broadcasting intermediate dimensions
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(3, 4).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=2)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # Reducing everything down to a single element (Y is a 1-element
    # tensor); fp64 inputs with a large axis, so compare only coarsely.
    X = np.random.rand(2, 3, 4, 500).astype(np.float64)
    Y = np.random.rand(1).astype(np.float64)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.array(np.sum(X))
    np.testing.assert_array_almost_equal(out, res, decimal=0)

    # broadcasting with single elem dimensions at both ends
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(1, 3, 4, 1).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=2).reshape(Y.shape)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # fp64 is not supported with the CUDA op
    # NOTE(review): this CPU-only device check runs on the *current* op
    # and fp32 inputs from the block above, not the fp64 case — confirm
    # whether it was meant to reuse the fp64 X/Y.
    dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
    self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
def test_groupwise_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    prepack_weight,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare fp32 Conv against groupwise-quantized DNNLOWP convolutions
    (8-bit, 16-bit, and Int8Conv), optionally with prepacked weights.

    The fp32 variant runs first; its output is used to derive the static
    quantization parameters required by the groupwise-quantized variants.
    """
    assume(group == 1 or dilation == 1)
    # Weight prepacking is only exercised for NHWC here — presumably the
    # packed path is NHWC-only; confirm against the operator.
    assume((not prepack_weight) or order == "NHWC")

    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        groupwise_quantization=True,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple(
        "Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),  # fp32 reference; must run first (see outputs[0])
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            # Packing needs the input's quantization scale so the bias
            # can be pre-quantized consistently.
            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = hardcode_scale_zp.choose_quantization_params(
                X_min, X_max)
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                quantize_groupwise=1,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else "W",
                "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            # (derived from the fp32 output stored as outputs[0]).
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity)
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(self, init_net, net, X, W, b, op_type,
                       engine, order, gc, outputs)

    check_quantized_results_close(
        outputs, symmetric=preserve_activation_sparsity)
def test_sum_reduce_fp16(self, gc, dc):
    """Check SumReduceLike on fp16 inputs against NumPy references for
    several broadcast layouts.

    Fix (consistency): previously only the first CreateOperator call
    passed device_option=gc even though the test assumes a GPU device;
    all four ops are now pinned to gc.
    """
    assume(core.IsGPUDeviceType(gc.device_type))

    # Set broadcast and no axis, i.e. broadcasting last dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(4, 5).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1,
        device_option=gc)

    def ref_op(X, Y):
        # Reduce away the leading dims not present in Y.
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=0)
        return [res]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)

    # Broadcasting the first dimensions (axis=0).
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(2, 3).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0,
        device_option=gc)

    def ref_op(X, Y):
        # Reduce away the trailing dims not present in Y.
        res = np.sum(X, axis=3)
        res = np.sum(res, axis=2)
        return [res]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)

    # broadcasting intermediate dimensions
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(3, 4).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1,
        device_option=gc)

    def ref_op(X, Y):
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        return [res]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)

    # broadcasting with single elem dimensions at both ends
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(1, 3, 4, 1).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1,
        device_option=gc)

    def ref_op(X, Y):
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        # Restore Y's singleton dims on the reduced result.
        return [res.reshape(Y.shape)]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)
def test_hsm_search(self):
    """Check HSoftmaxSearch beam-search output against a NumPy simulation
    of the hierarchical-softmax tree traversal.

    Fix: `assertEquals` is a deprecated unittest alias (removed in
    Python 3.12); use `assertEqual`.
    """
    samples = 10
    dim_in = 5
    X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
    w = np.random.rand(hierarchy_proto.size, dim_in) \
        .astype(np.float32) - 0.5
    b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
    labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
        .astype(np.int32)

    workspace.GlobalInit(['caffe2'])
    workspace.FeedBlob("data", X)
    workspace.FeedBlob("weights", w)
    workspace.FeedBlob("bias", b)
    workspace.FeedBlob("labels", labels)

    op = core.CreateOperator(
        'HSoftmaxSearch',
        ['data', 'weights', 'bias'],
        ['names', 'scores'],
        'HSoftmaxSearch',
        arg=args_search)
    workspace.RunOperatorOnce(op)
    names = workspace.FetchBlob('names')
    scores = workspace.FetchBlob('scores')

    def simulation_hsm_search():
        # Walks the node list in `struct`; each entry appears to be
        # (offset, num_children, child_names, node_name) — TODO confirm
        # against the hierarchy construction above this test.
        names = []
        scores = []
        for line in struct:
            s, e = line[0], line[0] + line[1]
            score = np.dot(X, w[s:e].transpose()) + b[s:e]
            # Negative log-softmax per sample (row-normalized).
            score = np.exp(score - np.max(score, axis=1, keepdims=True))
            score /= score.sum(axis=1, keepdims=True)
            score = -np.log(score)

            score = score.transpose()

            # Add the parent's accumulated path score, if it has one.
            idx = -1
            for j, n in enumerate(names):
                if n == line[3]:
                    idx = j
                    score += scores[j]
            # Prune candidates whose score falls outside the beam.
            if idx == -1:
                score[score > beam] = np.inf
            else:
                score[score - scores[idx] > beam] = np.inf

            for i, name in enumerate(line[2]):
                scores.append(score[i])
                names.append(name)
        scores = np.vstack(scores)
        return names, scores.transpose()

    p_names, p_scores = simulation_hsm_search()
    idx = np.argsort(p_scores, axis=1)
    p_scores = np.sort(p_scores, axis=1)
    p_names = np.array(p_names)[idx]
    for i in range(names.shape[0]):
        for j in range(names.shape[1]):
            if names[i][j]:
                self.assertEqual(
                    names[i][j], p_names[i][j].item().encode('utf-8'))
                self.assertAlmostEqual(
                    scores[i][j], p_scores[i][j], delta=0.001)
def Train(args):
    """Build and run the ResNet-50 training loop.

    Sets up per-GPU data-parallel training, an optional multi-shard
    rendezvous (MPI via mpirun env vars, Redis, or a shared filesystem),
    an optional test net, checkpoint load/save, and runs
    ``args.num_epochs`` epochs via ``RunEpoch``.
    """
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across
    # machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)
        # NOTE(review): if mpirun launches a single process
        # (OMPI_COMM_WORLD_SIZE == 1), `rendezvous` is never assigned and
        # Parallelize below raises NameError — confirm intended.
    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        """Build the forward pass, loss, and accuracy ops."""
        # fp16 weight initialization when training in half precision.
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            # Loss is computed in fp32.
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        """Attach the SGD optimizer; LR decays 10x every 30 epochs."""
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared with between all GPUS.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(
                model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(),
                                               False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test",
            arg_scope=test_arg_scope,
            init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        # Forward-only parallelization (param_update_builder_fun=None).
        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
def _create_slice(cls, init_model, pred_model, n, opset_version):
    """Translate an ONNX Slice node into a sequence of Caffe2 operators.

    Caffe2's Slice requires starts/ends entries for *every* dimension of
    the input, while ONNX Slice only lists the sliced axes. This builds
    full-length starts/ends tensors at runtime from the input's Shape,
    scatters the node's per-axis values into them, and feeds them to
    Slice as int32 tensor inputs.
    """
    op = cls._common_onnx_node_to_caffe2_op(init_model, pred_model, n,
                                            opset_version)
    args = {arg.name: arg for arg in op.arg}
    starts_vals = np.array(
        args.pop('starts').ints, dtype=np.int64).tolist()
    # ONNX ends are exclusive; Caffe2 counts a negative end inclusively
    # from the back (-1 == "through the last element"), so negative ends
    # are shifted by one.
    ends_vals = np.array(
        [i - 1 if i < 0 else i for i in args.pop('ends').ints],
        dtype=np.int64).tolist()
    if 'axes' in args:
        axes_vals = np.array(
            args.pop('axes').ints, dtype=np.int32).tolist()
    else:
        # Default per ONNX: starts/ends apply to the leading dimensions.
        ndims = len(starts_vals)
        axes_vals = np.array(range(ndims), dtype=np.int32).tolist()

    data, = op.input
    ops = []

    # 1-D tensor of length rank(data); used as the shape template for the
    # full starts/ends tensors built below.
    shape_tensor = dummy_name()
    ops.append(core.CreateOperator(
        'Shape',
        [data],
        [shape_tensor]
    ))

    axes_tensor = dummy_name()
    ops.extend([
        core.CreateOperator(
            'GivenTensorIntFill',
            [],
            [axes_tensor],
            shape=[len(axes_vals)],
            values=axes_vals,
        ),
    ])

    starts_vals_tensor = dummy_name()
    starts_tensor = dummy_name()
    casted_starts_tensor = dummy_name()
    ops.extend([
        core.CreateOperator(
            'GivenTensorInt64Fill',
            [],
            [starts_vals_tensor],
            shape=[len(starts_vals)],
            values=starts_vals,
        ),
        # Default start for every axis is 0 (slice from the beginning).
        core.CreateOperator(
            'ConstantFill',
            [shape_tensor],
            [starts_tensor],
            dtype=caffe2_pb2.TensorProto.INT64,
            value=0,
        ),
        # In-place scatter of the explicit starts into the listed axes.
        core.CreateOperator(
            'ScatterAssign',
            [starts_tensor, axes_tensor, starts_vals_tensor],
            [starts_tensor],
        ),
        # Slice only accepts starts as int
        core.CreateOperator(
            'Cast',
            [starts_tensor],
            [casted_starts_tensor],
            to=caffe2_pb2.TensorProto.INT32,
        ),
    ])

    ends_vals_tensor = dummy_name()
    ends_tensor = dummy_name()
    casted_ends_tensor = dummy_name()
    ops.extend([
        core.CreateOperator(
            'GivenTensorInt64Fill',
            [],
            [ends_vals_tensor],
            shape=[len(ends_vals)],
            values=ends_vals,
        ),
        # Default end for every axis is -1 ("through the last element").
        core.CreateOperator(
            'ConstantFill',
            [shape_tensor],
            [ends_tensor],
            dtype=caffe2_pb2.TensorProto.INT64,
            value=-1,
        ),
        # In-place scatter of the explicit ends into the listed axes.
        core.CreateOperator(
            'ScatterAssign',
            [ends_tensor, axes_tensor, ends_vals_tensor],
            [ends_tensor],
        ),
        # Slice only accepts ends as int
        core.CreateOperator(
            'Cast',
            [ends_tensor],
            [casted_ends_tensor],
            to=caffe2_pb2.TensorProto.INT32,
        ),
    ])

    # Rewire the original node: data plus the computed starts/ends, with
    # the now-consumed ONNX attributes stripped from its args.
    op.input[:] = [data, casted_starts_tensor, casted_ends_tensor]
    del op.arg[:]
    op.arg.extend(args.values())
    ops.append(op)

    return ops
def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1,
                                          beta2, epsilon, data_strategy,
                                          gc, dc):
    """Check RowWiseSparseAdam (with the output_grad output) against a
    per-row reference built on self.ref_row_wise_adam."""
    param, mom1, grad = inputs
    ITER = np.array([ITER], dtype=np.int64)
    LR = np.array([LR], dtype=np.float32)

    # Create a 1D row-wise average 2nd moment tensor.
    mom2 = data_strategy.draw(
        hu.tensor1d(min_len=param.shape[0],
                    max_len=param.shape[0],
                    elements=hu.elements_of_type(dtype=np.float32))
    )
    # Second moments must be non-negative.
    mom2 = np.absolute(mom2)

    # Create an indexing array containing values which index into grad
    indices = data_strategy.draw(
        hu.tensor(
            max_dim=1,
            min_value=1,
            max_value=grad.shape[0],
            dtype=np.int64,
            elements=st.sampled_from(np.arange(grad.shape[0])),
        ),
    )

    # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
    # tensor that is strictly 1-dimensional and equal in length to the
    # first dimension of the parameters, so indices must also be
    # 1-dimensional.
    indices = indices.flatten()

    hypothesis.note('indices.shape: %s' % str(indices.shape))

    # Verify that the generated indices are unique
    hypothesis.assume(
        np.array_equal(np.unique(indices), np.sort(indices)))

    # Sparsify grad
    grad = grad[indices]

    op = core.CreateOperator(
        "RowWiseSparseAdam",
        ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
        ["param", "mom1", "mom2", "output_grad"],
        beta1=beta1, beta2=beta2, epsilon=epsilon)

    def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices,
                                        grad, LR, ITER, beta1, beta2,
                                        epsilon, output_grad):
        # Apply the row-wise Adam update row by row: grad rows are
        # addressed by position i, param/moment rows by the sparse index.
        param_out = np.copy(param)
        mom1_out = np.copy(mom1)
        mom2_out = np.copy(mom2)
        grad_out = np.copy(grad)

        for i, index in enumerate(indices):
            param_out[index], mom1_out[index], mom2_out[index], \
                grad_out[i] = \
                self.ref_row_wise_adam(param[index], mom1[index],
                                       mom2[index], grad[i], LR, ITER,
                                       beta1, beta2, epsilon,
                                       output_grad)
        return (param_out, mom1_out, mom2_out, grad_out)

    # Iter lives on the CPU
    input_device_options = {'iter': hu.cpu_do}

    self.assertDeviceChecks(
        dc, op,
        [param, mom1, mom2, indices, grad, LR, ITER],
        [0, 1, 2, 3],
        input_device_options=input_device_options)

    self.assertReferenceChecks(
        gc, op,
        [param, mom1, mom2, indices, grad, LR, ITER],
        functools.partial(
            ref_row_wise_sparse_output_grad,
            beta1=beta1, beta2=beta2, epsilon=epsilon,
            output_grad=True),
        input_device_options=input_device_options)
def test_sparse_normalize(
    self, inputs, use_max_norm, norm, data_strategy, use_fp16, gc, dc
):
    """Check (Float16)SparseNormalize against a per-row reference, both
    without and with the optional grad input.

    Fix (idiom): the reference loop used
    `for _, index in enumerate(indices)`, re-implementing plain
    iteration; it now iterates `indices` directly (identical behavior).
    """
    param, grad = inputs
    # Push params away from 0 so normalization is well-conditioned.
    param += 0.02 * np.sign(param)
    param[param == 0.0] += 0.02  # np.sign(0) == 0, so fix those rows too

    if use_fp16:
        param = param.astype(np.float16)
        grad = grad.astype(np.float16)

    # Create an indexing array containing values that are lists of
    # indices, which index into param
    indices = data_strategy.draw(
        hu.tensor(
            dtype=np.int64,
            min_dim=1,
            max_dim=1,
            elements=st.sampled_from(np.arange(param.shape[0])),
        )
    )
    hypothesis.note("indices.shape: %s" % str(indices.shape))

    # For now, the indices must be unique
    hypothesis.assume(
        np.array_equal(np.unique(indices.flatten()),
                       np.sort(indices.flatten()))
    )

    op1 = core.CreateOperator(
        "Float16SparseNormalize" if use_fp16 else "SparseNormalize",
        ["param", "indices"],
        ["param"],
        use_max_norm=use_max_norm,
        norm=norm,
    )

    # Sparsify grad
    grad = grad[indices]

    # Same op with grad as an extra input.
    op2 = core.CreateOperator(
        "Float16SparseNormalize" if use_fp16 else "SparseNormalize",
        ["param", "indices", "grad"],
        ["param"],
        use_max_norm=use_max_norm,
        norm=norm,
    )

    def ref_sparse_normalize(param, indices, grad=None):
        # grad is accepted for signature compatibility with op2; the
        # normalization itself only touches the indexed param rows.
        param_out = np.copy(param)
        for index in indices:
            param_out[index] = self.ref_normalize(
                param[index], use_max_norm, norm)
        return (param_out,)

    # self.assertDeviceChecks(dc, op, [param, indices], [0])
    self.assertReferenceChecks(
        gc,
        op1,
        [param, indices],
        ref_sparse_normalize,
        threshold=1e-2 if use_fp16 else 1e-4,
    )

    self.assertReferenceChecks(
        gc,
        op2,
        [param, indices, grad],
        ref_sparse_normalize,
        threshold=1e-2 if use_fp16 else 1e-4,
    )
def test_convolution_layout(self, op_type, stride, pad, kernel, dilation,
                            size, input_channels, output_channels,
                            batch_size, use_bias, gc, dc):
    """Run the same convolution in NCHW and NHWC across all available
    engines and verify every layout/engine combination produces the same
    result (compared in canonical NCHW form)."""
    assume(size >= dilation * (kernel - 1) + 1)

    # Generate inputs once in NHWC; transpose per layout below.
    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels, kernel, kernel, input_channels
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5

    Output = collections.namedtuple("Output", ["Y", "engine", "order"])
    outputs = []

    for order in ["NCHW", "NHWC"]:
        # The default engine always runs; vendor engines only where the
        # platform/layout combination supports them.
        engines = ['']
        if hiputl.run_in_hip(gc, dc):
            if order == 'NCHW':
                engines.append('MIOPEN')
        elif _cudnn_supports(dilation=(dilation > 1),
                             nhwc=(order == 'NHWC')):
            engines.append('CUDNN')

        if order == "NCHW":
            X_run = X.transpose((0, 3, 1, 2))
            w_run = w.transpose((0, 3, 1, 2))
        else:
            X_run, w_run = X, w
        op_inputs = [X_run, w_run, b] if use_bias else [X_run, w_run]

        for engine in engines:
            op = core.CreateOperator(
                op_type,
                ["X", "w", "b"] if use_bias else ["X", "w"],
                ["Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                engine=engine,
                device_option=gc,
                exhaustive_search=True,
            )
            self.assertDeviceChecks(dc, op, op_inputs, [0])

            self.ws.create_blob("X").feed(X_run, device_option=gc)
            self.ws.create_blob("w").feed(w_run, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(op)
            outputs.append(Output(
                Y=self.ws.blobs["Y"].fetch(), engine=engine, order=order))

    def canonical(o):
        # Compare everything in NCHW layout.
        return o.Y.transpose((0, 3, 1, 2)) if o.order == "NHWC" else o.Y

    baseline = canonical(outputs[0])
    for o in outputs:
        np.testing.assert_allclose(
            baseline, canonical(o), atol=1e-4, rtol=1e-4)