def test_int8_convolution(self, stride, pad, kernel, size,
                          input_channels, output_channels,
                          batch_size, use_bias, gc, dc):
    """Compare a quantized Int8Conv pipeline on ``dc[1]`` with fp32 Conv on ``dc[0]``.

    The fp32 ``Conv`` result is computed first and is also used to derive the
    output quantization parameters.  The quantized net then runs
    NCHW2NHWC -> Int8Quantize -> Int8Conv -> Int8Dequantize -> NHWC2NCHW and
    the test fails when the mean squared error against the fp32 result
    exceeds 0.005.

    ``gc`` is not used in this test; ``dc`` is indexed as ``dc[0]`` (reference
    device option) and ``dc[1]`` (device option for the quantized net).
    """

    def activation_qparams(tensor):
        # One scale for the whole tensor: 0..255 range for non-negative data
        # (zero point 0), otherwise a symmetric range around zero point 128.
        absmax = np.array([np.absolute(tensor).max()]).astype(np.float32)
        if tensor.min() >= 0:
            return absmax / 0xFF, 0
        return absmax / 0x7F, 128

    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        # ---- fp32 reference ------------------------------------------------
        conv_fp32 = core.CreateOperator(
            "Conv",
            ["X_fp32", "w_fp32", "b_fp32"] if use_bias else ["X_fp32", "w_fp32"],
            ["Y_fp32"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            training_mode=0,
            device_option=dc[0],
        )
        workspace.FeedBlob('X_fp32', X, dc[0])
        workspace.FeedBlob('w_fp32', w, dc[0])
        workspace.FeedBlob('b_fp32', b, dc[0])
        workspace.RunOperatorOnce(conv_fp32)
        Y = workspace.FetchBlob('Y_fp32')

        workspace.ResetWorkspace()

        # ---- quantization parameters ---------------------------------------
        Y_scale, Y_zero_point = activation_qparams(Y)
        X_scale, X_zero_point = activation_qparams(X)

        # Weights get one scale per output channel (axis 0).
        w_absmax = np.array([
            np.absolute(w[i, ...]).max() for i in range(w.shape[0])
        ]).astype(np.float32)
        w_scale = w_absmax / 0x7F
        w_zero_point = 128
        w = np.transpose(w, (0, 2, 3, 1)).astype(np.float32)
        # Round in a wide integer type before adding the zero point: adding
        # the Python int 128 to an int8 array raises OverflowError under
        # NumPy 2 promotion rules.  The final uint8 bytes are unchanged.
        w_bytes = np.rint(
            w / w_scale.reshape(-1, 1, 1, 1)
        ).astype(np.int32) + w_zero_point
        w_filler = core.CreateOperator(
            "Int8GivenTensorFill",
            [],
            ["w"],
            shape=w.shape,
            values=w_bytes.astype(np.uint8).tobytes(),
            Y_zero_point=w_zero_point,
            Y_scales=w_scale,
            device_option=dc[1],
        )

        # Bias scale is the product of the weight and input scales.
        b_scale = w_scale * X_scale
        b_zero_point = 0
        b_bytes = np.rint(b / b_scale).astype(np.int32)
        b_filler = core.CreateOperator(
            "Int8GivenIntTensorFill",
            [],
            ["b"],
            shape=b.shape,
            values=b_bytes,
            Y_zero_point=b_zero_point,
            Y_scales=b_scale,
            device_option=dc[1],
        )

        # ---- quantized net -------------------------------------------------
        sw2nhwc = core.CreateOperator(
            "NCHW2NHWC", ["X"], ["X_nhwc"], device_option=dc[1])
        quantize_X = core.CreateOperator(
            "Int8Quantize",
            ["X_nhwc"],
            ["X_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=X_zero_point,
            Y_scale=X_scale[0],
        )
        conv = core.CreateOperator(
            "Int8Conv",
            ["X_quantized", "w", "b"] if use_bias else ["X_quantized", "w"],
            ["Y_quantized"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=Y_zero_point,
            Y_scale=Y_scale[0],
        )
        dequantize_Y = core.CreateOperator(
            "Int8Dequantize",
            ["Y_quantized"],
            ["Y_nhwc"],
            engine="DNNLOWP",
            device_option=dc[1],
        )
        sw2nchw = core.CreateOperator(
            "NHWC2NCHW", ["Y_nhwc"], ["Y_out"], device_option=dc[1])

        net = caffe2_pb2.NetDef()
        net.op.extend([
            w_filler, b_filler, sw2nhwc, quantize_X, conv, dequantize_Y,
            sw2nchw,
        ])

        workspace.FeedBlob("X", X, dc[1])
        workspace.RunNetOnce(net)
        Y_out = workspace.FetchBlob("Y_out")

        # ---- comparison ----------------------------------------------------
        MSE = np.square(np.subtract(Y, Y_out)).mean()
        if MSE > 0.005:
            print(Y.flatten())
            print(Y_out.flatten())
            print(np.max(np.abs(Y_out - Y)))
            print("MSE", MSE)
            self.fail("Int8Conv output differs from fp32 Conv: MSE={}".format(MSE))
    finally:
        # Restore the caller's workspace even when the check above fails.
        workspace.SwitchWorkspace(old_ws_name)
def Skip_test_SLS_NonQuantized_fp16(self):
    """Compare SparseLengthsSum across a reference net, a fake-fp16 net and an onnxified net.

    The reference and ``SparseLengthsSumFakeFP16AccFP16`` outputs must agree
    within 1e-3 (absolute and relative).  The onnxified output must pass
    ``np.allclose`` against the fake-fp16 output; otherwise debug info is
    dumped and the test fails.
    """
    num_rows = 20000
    dim = 64
    table = (4 * np.random.random_sample((num_rows, dim)) + 1).astype(np.float32)
    indices = np.random.randint(0, num_rows, size=12).astype(np.int64)
    lengths = np.asarray([4, 4, 4]).astype(np.int32)
    blob_names = ["D", "I", "L"]

    # Only the data blob is fed before the nets are built; indices and
    # lengths are fed after onnxification, as in the original flow.
    workspace.FeedBlob("D", table)

    ref_c2_net = core.Net("test_ref_c2")
    ref_c2_net.SparseLengthsSum(list(blob_names), "ref_out")
    ref_c2_net.Proto().external_input.extend(blob_names)
    ref_c2_net.Proto().external_output.extend(["ref_out"])

    fp16_c2_net = core.Net("test_fp16_c2")
    fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(list(blob_names), "fp16_out")

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(blob_names)
    pred_net.external_output.append("glow_out")
    sls_op = core.CreateOperator(
        "SparseLengthsSum", list(blob_names), ["glow_out"])
    pred_net.op.add().CopyFrom(sls_op)

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=3,
        max_seq_size=16,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )

    # The single op must have been replaced by one Onnxifi op.
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in onnxified_net.op)
    print(onnxified_net)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("I", indices)
    workspace.FeedBlob("L", lengths)

    workspace.RunNetOnce(ref_c2_net)
    ref_c2_out = workspace.FetchBlob("ref_out")

    workspace.RunNetOnce(fp16_c2_net)
    fp16_c2_out = workspace.FetchBlob("fp16_out")

    np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3)

    workspace.RunNetOnce(onnxified_net)
    fp16_glow_out = workspace.FetchBlob("glow_out")

    if np.allclose(fp16_glow_out, fp16_c2_out):
        return

    diff = np.abs(fp16_glow_out - fp16_c2_out)
    print_test_debug_info(
        "sls",
        {
            "indices": indices,
            "data": table,
            "lengths": lengths,
            "Y_c2": fp16_c2_out,
            "Y_glow": fp16_glow_out,
            "diff": diff,
            "rowwise_diff": diff[:, 0],
        },
    )
    assert 0
def share_grad_blobs(
    net,
    losses,
    param_grads,
    namescope,
    dont_share_blobs=None,
    share_activations=False,
    blob_shapes=None,
):
    '''
    Implements similar optimization as Torch's shareGradInput():
    for the gradients that are passed between layers, share blobs between
    operators when possible. This yields significant memory savings with
    deep networks.

    Args:
        net: net whose ``Proto()`` is optimized (a deep copy is serialized).
        losses: loss blobs; each is passed on as a UTF-8 encoded name.
        param_grads: container of gradient names that must not be shared.
        namescope: name prefix of the blobs to consider; a trailing "/" is
            appended when it is non-empty and lacks one.
        dont_share_blobs: optional set of blobs excluded from sharing
            (an empty set is used when None).
        share_activations: when True, also share the activations found by
            the ``<output> + "_w"`` heuristic below.
        blob_shapes: optional mapping forwarded to the recycling routine
            (an empty dict is used when None).

    Returns an optimized protobuf (assign to net._net)
    '''
    def is_grad_blob(b):
        name = str(b)
        # Note: need to look at _{namescope} pattern as it matches
        # to handle the auto-split gradients
        return (
            name.endswith("_grad")
            and name.startswith((namescope, "_" + namescope))
            and name not in param_grads
        )

    def is_grad_op(op):
        # TODO: something smarter
        return any(
            is_grad_blob(b) for b in list(op.input) + list(op.output))

    # Logger.warn is a deprecated alias that was removed in Python 3.13.
    # NOTE(review): assumes ``log`` is a standard ``logging.Logger``.
    log.warning("NOTE: Executing memonger to optimize gradient memory")

    # Collect ops that have something to do with gradients
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    activations = []
    external_output = set(net.Proto().external_output)

    # Hacky way to get activations, think of a better way
    for op in net.Proto().op:
        for b in op.output:
            if b + "_w" in op.input and b not in external_output:
                activations.append(b)

    # Remove last activations, as they are usually accessed externally
    activations = set(activations[:-2])

    # Gradient ops
    grad_op_indices = [
        idx for idx, op in enumerate(netproto.op) if is_grad_op(op)
    ]

    shared_blobs = set()
    for op in net.Proto().op:
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b) or (share_activations and b in activations):
                shared_blobs.add(b)

    start_time = time.time()
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(),
        [str(s).encode('utf-8') for s in losses],
        grad_op_indices,
        set(str(s).encode('utf-8') for s in shared_blobs),
        namescope.encode('utf-8'),
        set() if dont_share_blobs is None else dont_share_blobs,
        {} if blob_shapes is None else blob_shapes,
    )

    log.info(
        "Memonger memory optimization took {} secs".format(
            time.time() - start_time),
    )

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)
    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim
def gen_init_net_from_blobs(blobs):
    '''
    Build a fresh NetDef and call add_tensor on it once per (name, blob)
    entry of the given blob dict, in the dict's iteration order.
    '''
    init_net = caffe2_pb2.NetDef()
    for blob_name, blob_value in blobs.items():
        add_tensor(init_net, blob_name, blob_value)
    return init_net
def test_slws_fused_8bit_rowwise_all_same(self, seed):
    """Weighted SLS over fused 8-bit rowwise data in which every element is equal.

    The onnxified ``SparseLengthsWeightedSumFused8BitRowwise`` output is
    compared via ``np.allclose`` with the
    ``SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI`` operator run on
    the same blobs; on mismatch debug info is dumped and the test fails.
    """
    # Comment out for predictable debugging
    np.random.seed(seed)
    workspace.ResetWorkspace()

    rows, cols = 1, 2
    data = np.ones((rows, cols)).astype(np.float32) * 0.2 - 0.1

    max_segments = 5
    max_segment_length = 200
    # number of segments to run
    num_lengths = np.random.randint(1, max_segments + 1)
    lengths = np.random.randint(
        0, max_segment_length + 1, size=num_lengths
    ).astype(np.int32)
    # Every index points at the single data row.
    indices = np.zeros(np.sum(lengths), dtype=np.int64)
    weights = np.random.uniform(
        low=-0.5, high=0.5, size=[len(indices)]
    ).astype(np.float32)

    op_inputs = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        # One-operator NetDef reading the four input blobs and writing "Y".
        proto = caffe2_pb2.NetDef()
        proto.name = net_name
        proto.external_input.extend(op_inputs)
        proto.external_output.append("Y")
        proto.op.add().CopyFrom(
            core.CreateOperator(op_type, list(op_inputs), ["Y"]))
        return proto

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise")
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI")

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"])
    workspace.RunOperatorOnce(quantize_op)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    num_onnxified_ops = sum(
        o.type == "Onnxifi" for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write blob "Y", so fetch after each run.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    if np.allclose(Y_c2, Y_glow):
        return

    print_test_debug_info(
        "slws_fused_8bit_rowwise",
        {
            "seed": seed,
            "indices": indices,
            "data": data,
            "lengths": lengths,
            "weights": weights,
            "Y_c2": Y_c2,
            "Y_glow": Y_glow,
            "diff": Y_glow - Y_c2,
            "rowwise_diff": (Y_glow - Y_c2)[:, 0],
        },
    )
    assert 0
def test_convolution_sum_fusion(self, stride, pad, kernel, size,
                                input_channels, output_channels,
                                batch_size, use_bias, group, sum_add, gc, dc):
    """Check Conv + Sum fusion against an unfused reference computed on ``dc[0]``.

    Covered scenarios, all run with ``dc[1]`` device options:
      * a hand-built ``ConvFusion`` op with ``fusion_type=2``;
      * ``optimizeForMKLDNN`` on [PreNode, Conv, Sum] and [Conv, PreNode, Sum],
        each expected to end up as two ops with a ``ConvFusion`` second op;
      * three nets whose op count must be unchanged by the optimization.

    Every scenario must match the reference within atol=rtol=0.01.
    ``gc`` is not used in this test.
    """

    def feed(named_blobs, device):
        for blob_name, value in named_blobs:
            workspace.FeedBlob(blob_name, value, device)

    def on_device(op, device):
        # Copy of ``op`` whose device_option is replaced by ``device``.
        clone = caffe2_pb2.OperatorDef()
        clone.CopyFrom(op)
        clone.device_option.CopyFrom(device)
        return clone

    def optimized_net(ops):
        # Wrap ``ops`` in a core.Net and run the MKLDNN optimization on it.
        proto = caffe2_pb2.NetDef()
        proto.op.extend(ops)
        net = core.Net("net")
        net.Proto().CopyFrom(proto)
        optimizeForMKLDNN(net)
        return net

    def check_close(reference, actual, scenario):
        if not np.allclose(reference, actual, atol=0.01, rtol=0.01):
            print(actual.flatten())
            print(reference.flatten())
            print(np.max(np.abs(actual - reference)))
            self.fail("Output mismatch in scenario: {}".format(scenario))

    pool_S0 = core.CreateOperator(
        "MaxPool", ["SX0"], ["S0"],
        stride=2, pad=0, kernel=2, device_option=dc[0])
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    # Named sum_op so the builtin ``sum`` is not shadowed.
    sum_op = core.CreateOperator(
        sum_add, ["S0", "Y0"], ["S0"], device_option=dc[0])

    # Manual fusion for Conv + Sum
    pool_S1 = core.CreateOperator(
        "MaxPool", ["SX1"], ["S1"],
        stride=2, pad=0, kernel=2, group=group, device_option=dc[1])
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
        ["S1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        fusion_type=2, device_option=dc[1])

    # The 2x2/stride-2 pooling halves this size again.
    pool_input_size = int(
        math.ceil(float(size + 2 * pad - kernel + 1) / stride)) * 2
    SX = np.random.rand(
        batch_size, output_channels * group, pool_input_size,
        pool_input_size).astype(np.float32) - 0.5
    X = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    inputs0 = [('SX0', SX), ('X0', X), ('w0', w), ('b0', b)]
    inputs1 = [('SX1', SX), ('X1', X), ('w1', w), ('b1', b)]

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        # Reference: unfused ops on dc[0].
        feed(inputs0, dc[0])
        workspace.RunOperatorOnce(pool_S0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(sum_op)
        S0 = workspace.FetchBlob('S0')

        # Hand-built fused op on dc[1].
        workspace.ResetWorkspace()
        feed(inputs1, dc[1])
        workspace.RunOperatorOnce(pool_S1)
        workspace.RunOperatorOnce(conv_fusion)
        S1 = workspace.FetchBlob('S1')
        check_close(S0, S1, "manual ConvFusion")

        # Auto fusion for Conv + Sum
        workspace.ResetWorkspace()
        pool_S0_old = on_device(pool_S0, dc[1])
        conv_old = on_device(conv, dc[1])
        sum_old = on_device(sum_op, dc[1])

        # Conv + Sum should be fused case: [PreNode, Conv, Sum]
        feed(inputs0, dc[1])
        net = optimized_net([pool_S0_old, conv_old, sum_old])
        self.assertEqual(len(net.Proto().op), 2)
        self.assertEqual(net.Proto().op[1].type, "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        check_close(S0, S2, "[PreNode, Conv, Sum]")

        # Conv + Sum should be fused case: [Conv, PreNode, Sum]
        workspace.ResetWorkspace()
        feed(inputs0, dc[1])
        net = optimized_net([conv_old, pool_S0_old, sum_old])
        self.assertEqual(len(net.Proto().op), 2)
        self.assertEqual(net.Proto().op[1].type, "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        check_close(S0, S2, "[Conv, PreNode, Sum]")

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum]
        # Conv output is used by midOp
        dropout = core.CreateOperator(
            "Dropout", ["Y0"], ["Y_dropout"],
            ratio=0.5, is_test=True, device_option=dc[1])
        workspace.ResetWorkspace()
        feed(inputs0, dc[1])
        net = optimized_net([conv_old, dropout, pool_S0_old, sum_old])
        self.assertEqual(len(net.Proto().op), 4)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        check_close(S0, S2, "[Conv, midOp, preNode, Sum]")

        # Conv + Sum should not be fused case: [Conv, preNode, Sum, midOp]
        # preNode output is used by midOp
        sum1 = core.CreateOperator(
            sum_add, ["S0", "Y0"], ["S3"], device_option=dc[1])
        dropout = core.CreateOperator(
            "Dropout", ["S0"], ["Y_dropout"],
            ratio=0.5, is_test=True, device_option=dc[1])
        workspace.ResetWorkspace()
        feed(inputs0, dc[1])
        net = optimized_net([conv_old, pool_S0_old, sum1, dropout])
        print("net={}\n".format(net.Proto()))
        self.assertEqual(len(net.Proto().op), 4)
        workspace.RunNetOnce(net.Proto())
        # The sum is the second-to-last op here; Dropout comes last.
        S2 = workspace.FetchBlob(net.Proto().op[-2].output[0])
        check_close(S0, S2, "[Conv, preNode, Sum, midOp]")

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum]
        # midOp output has the same name with that of the Conv input
        relu_0 = core.CreateOperator(
            "Relu", ["X0"], ["X1"], device_option=dc[0])
        conv = core.CreateOperator(
            "Conv",
            ["X1", "w0", "b0"] if use_bias else ["X1", "w0"],
            ["Y0"],
            stride=1, pad=0, kernel=1, device_option=dc[0])
        relu_1 = core.CreateOperator(
            "Relu", ["X1"], ["X1"], device_option=dc[0])
        pool = core.CreateOperator(
            "MaxPool", ["X1"], ["S0"],
            stride=1, pad=0, kernel=1, device_option=dc[0])
        sum_op = core.CreateOperator(
            "Sum", ["S0", "Y0"], ["S0"], device_option=dc[0])

        X = np.random.rand(
            batch_size, input_channels, size, size).astype(np.float32) - 0.5
        w = np.random.rand(
            input_channels, input_channels, 1, 1).astype(np.float32) - 0.5
        b = np.random.rand(input_channels).astype(np.float32) - 0.5
        inputs = [('X0', X), ('w0', w), ('b0', b)]

        # This last scenario runs in (and resets) the original workspace.
        workspace.SwitchWorkspace(old_ws_name)
        workspace.ResetWorkspace()
        feed(inputs, dc[0])
        for op in (relu_0, conv, relu_1, pool, sum_op):
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        feed(inputs, dc[1])
        net = optimized_net([
            on_device(op, dc[1])
            for op in (relu_0, conv, relu_1, pool, sum_op)
        ])
        self.assertEqual(len(net.Proto().op), 5)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        check_close(S0, S2, "midOp output reuses the Conv input name")
    finally:
        # Leave the caller's workspace selected even when a check fails.
        workspace.SwitchWorkspace(old_ws_name)
def test_convolution_grouped_sum_relu_fusion(self, stride, pad, kernel, size,
                                             input_channels, output_channels,
                                             batch_size, use_bias, group,
                                             gc, dc):
    """Check ``optimizeForMKLDNN`` on grouped Conv, Conv, Sum, Relu.

    The four ops are first run one by one with ``dc[0]`` device options to get
    a reference ``S0``.  They are then copied with ``dc[1]`` device options,
    passed through ``optimizeForMKLDNN`` and run as a net; the output of the
    net's last op must match the reference within atol=rtol=0.01.
    ``gc`` is not used in this test.
    """

    def on_device(op, device):
        # Copy of ``op`` whose device_option is replaced by ``device``.
        clone = caffe2_pb2.OperatorDef()
        clone.CopyFrom(op)
        clone.device_option.CopyFrom(device)
        return clone

    conv_S0 = core.CreateOperator(
        "Conv",
        ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"],
        ["S0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    # Named sum_op so the builtin ``sum`` is not shadowed.
    sum_op = core.CreateOperator(
        "Sum", ["S0", "Y0"], ["S0"], device_option=dc[0])
    relu = core.CreateOperator(
        "Relu", ["S0"], ["S0"], device_option=dc[0])
    reference_ops = (conv_S0, conv, sum_op, relu)

    SX = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    Sw = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    X = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    inputs = [
        ('SX0', SX), ('Sw0', Sw), ('Sb0', Sb),
        ('X0', X), ('w0', w), ('b0', b),
    ]

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        # Reference run, op by op, on dc[0].
        for blob_name, value in inputs:
            workspace.FeedBlob(blob_name, value, dc[0])
        for op in reference_ops:
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([on_device(op, dc[1]) for op in reference_ops])
        for blob_name, value in inputs:
            workspace.FeedBlob(blob_name, value, dc[1])

        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])

        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.fail("Optimized net output differs from the reference")
    finally:
        # Restore the caller's workspace even when the check above fails.
        workspace.SwitchWorkspace(old_ws_name)
def test_in_place(self, stride, pad, kernel, size,
                  input_channels, output_channels,
                  batch_size, use_bias, gc, dc):
    """Run Conv followed by an in-place AffineChannel twice and compare devices.

    The reference result comes from running the two ops twice, one by one,
    with ``dc[0]`` device options.  The same two ops, copied with ``dc[1]``
    device options, are then run twice as a net; the final ``Y`` blobs must
    match within atol=rtol=0.01.  ``gc`` is not used in this test.
    """
    # To expose fallback in-place potential issue, the fallback op
    # following ideep op must be run at least two iterations.
    conv = core.CreateOperator(
        "Conv",
        ["X", "w", "b"] if use_bias else ["X", "w"],
        ["Y"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        device_option=dc[0],
    )
    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        workspace.FeedBlob('X', X, dc[0])
        workspace.FeedBlob('w', w, dc[0])
        workspace.FeedBlob('b', b, dc[0])
        workspace.RunOperatorOnce(conv)
        Y = workspace.FetchBlob('Y')

        # One scale/bias entry per element of Y's axis 1.
        scale = np.random.randn(Y.shape[1]).astype(np.float32)
        bias = np.random.randn(Y.shape[1]).astype(np.float32)
        ac = core.CreateOperator(
            "AffineChannel",
            ["Y", "scale", "bias"],
            ["Y"],
            is_learnable=False,
            device_option=dc[0],
        )
        workspace.FeedBlob('scale', scale, dc[0])
        workspace.FeedBlob('bias', bias, dc[0])
        workspace.RunOperatorOnce(ac)
        # Second iteration of Conv + AffineChannel.
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(ac)
        Y0 = workspace.FetchBlob('Y')

        workspace.ResetWorkspace()
        dev_net = caffe2_pb2.NetDef()
        conv_dev = caffe2_pb2.OperatorDef()
        conv_dev.CopyFrom(conv)
        conv_dev.device_option.CopyFrom(dc[1])
        ac_dev = caffe2_pb2.OperatorDef()
        ac_dev.CopyFrom(ac)
        ac_dev.device_option.CopyFrom(dc[1])
        dev_net.op.extend([conv_dev, ac_dev])

        workspace.FeedBlob('X', X, dc[1])
        workspace.FeedBlob('w', w, dc[1])
        workspace.FeedBlob('b', b, dc[1])
        workspace.FeedBlob('scale', scale, dc[1])
        workspace.FeedBlob('bias', bias, dc[1])
        workspace.RunNetOnce(dev_net)
        workspace.RunNetOnce(dev_net)
        Y1 = workspace.FetchBlob('Y')

        if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
            print(Y1.flatten())
            print(Y0.flatten())
            print(np.max(np.abs(Y1 - Y0)))
            self.fail("In-place AffineChannel output differs between devices")
    finally:
        # Restore the caller's workspace even when the check above fails.
        workspace.SwitchWorkspace(old_ws_name)
def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size,
                                     input_channels, output_channels,
                                     batch_size, use_bias, group, sum_add,
                                     gc, dc):
    """Check Conv + Sum + ReLU fusion against an unfused reference on ``dc[0]``.

    Two variants are run with ``dc[1]`` device options and compared to the
    reference within atol=rtol=0.01:
      * a hand-built ``ConvFusion`` op with ``fusion_type=3``;
      * ``optimizeForMKLDNN`` applied to Conv, Conv, Sum, Relu, which is
        expected to leave two ops with a ``ConvFusion`` second op.

    ``gc`` is not used in this test.
    """

    def feed_inputs(suffix, device):
        # Feeds SX/Sw/Sb/X/w/b under names ending in ``suffix``.
        for base, value in (("SX", SX), ("Sw", Sw), ("Sb", Sb),
                            ("X", X), ("w", w), ("b", b)):
            workspace.FeedBlob(base + suffix, value, device)

    def on_device(op, device):
        # Copy of ``op`` whose device_option is replaced by ``device``.
        clone = caffe2_pb2.OperatorDef()
        clone.CopyFrom(op)
        clone.device_option.CopyFrom(device)
        return clone

    def check_close(reference, actual, scenario):
        if not np.allclose(reference, actual, atol=0.01, rtol=0.01):
            print(actual.flatten())
            print(reference.flatten())
            print(np.max(np.abs(actual - reference)))
            self.fail("Output mismatch in scenario: {}".format(scenario))

    conv_S0 = core.CreateOperator(
        "Conv",
        ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"],
        ["S0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    # Named sum_op so the builtin ``sum`` is not shadowed.
    sum_op = core.CreateOperator(
        sum_add, ["S0", "Y0"], ["S0"], device_option=dc[0])
    relu = core.CreateOperator(
        "Relu", ["S0"], ["S0"], device_option=dc[0])
    reference_ops = (conv_S0, conv, sum_op, relu)

    # Manual fusion for Conv + Sum + ReLU
    conv_S1 = core.CreateOperator(
        "Conv",
        ["SX1", "Sw1", "Sb1"] if use_bias else ["SX1", "Sw1"],
        ["S1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[1])
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
        ["S1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        fusion_type=3, device_option=dc[1])

    SX = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    Sw = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    X = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        # Reference: unfused ops on dc[0].
        feed_inputs("0", dc[0])
        for op in reference_ops:
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        # Hand-built fused op on dc[1].
        workspace.ResetWorkspace()
        feed_inputs("1", dc[1])
        workspace.RunOperatorOnce(conv_S1)
        workspace.RunOperatorOnce(conv_fusion)
        S1 = workspace.FetchBlob('S1')
        check_close(S0, S1, "manual ConvFusion")

        # Auto fusion for Conv + Sum + ReLU
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([on_device(op, dc[1]) for op in reference_ops])
        feed_inputs("0", dc[1])

        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertEqual(len(net.Proto().op), 2)
        self.assertEqual(net.Proto().op[1].type, "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob('S0')
        check_close(S0, S2, "optimizeForMKLDNN fusion")
    finally:
        # Restore the caller's workspace even when a check fails.
        workspace.SwitchWorkspace(old_ws_name)
def make(model='zoo_resnet50', create_dist_net=False, standardize=False,
         normalize=False, img_size=224, **kw):
    """Load the saved predict/init nets, strip them, and return a ModelHelper.

    Only ``model == 'zoo_resnet50'`` is accepted.  Both nets are read from a
    hard-coded directory, assigned a CUDA device option, have the ``gpu_0/``
    prefix removed from blob names and have ``FC``/``Softmax`` ops dropped.
    ``res4_5_branch2c_bn`` is appended as an external output of the predict
    net.  Before returning, the init net is run once, a random
    ``(1, 3, img_size, img_size)`` float32 input is fed to blob ``data`` and
    the predict net is created in the workspace.

    ``create_dist_net``, ``standardize``, ``normalize`` and ``**kw`` are
    accepted but not used here.
    """
    input_blob = 'data'
    remove_layers_type = ['FC', 'Softmax']  # By type.
    remove_layers_with = []  # By name.
    assert (model == 'zoo_resnet50'
            )  # TODO add the others. If statements to set other params...

    # Alternatives that were tried and left disabled:
    #   model = 'zoo_densenet121'; output_blob = 'pool5'; global_pool = False
    #   model = 'zoo_inception2'; output_blob = 'pool5/7x7_s1'; global_pool = False
    model = 'zoo_resnet50'
    output_blob = 'res4_5_branch2c_bn'
    # output_blob2 = 'res5_2_branch2c_bn'
    pooling = 'global'  # only referenced by the disabled pooling head below
    # pooling = 6
    # remove_layers_with = ['res5', 'pool5']

    DIR = '/home/slee/stuff/terrapixel/cpp_localization/'
    model = DIR + 'saves/raw/' + model

    dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
    # dev = core.DeviceOption(caffe2_pb2.CPU,0)

    def load_net(file_name):
        # Parse <model>/<file_name> into a NetDef and tag it with ``dev``.
        proto = caffe2_pb2.NetDef()
        with open('%s/%s' % (model, file_name), 'rb') as handle:
            proto.ParseFromString(handle.read())
        proto.device_option.CopyFrom(dev)  # GPU
        return proto

    def strip_gpu_prefix(names):
        # In-place removal of 'gpu_0/' from every entry of a repeated field.
        for pos, blob_name in enumerate(names):
            names[pos] = blob_name.replace('gpu_0/', '')

    with core.DeviceScope(dev):
        pred_net = load_net('predict_net.pb')
        print('pred do', pred_net.device_option)
        init_net = load_net('init_net.pb')

    # Do 3 things:
    #   1) Set device on all ops to gpu
    #   2) if blobs have 'gpu_0' prefix, remove it.
    #   3) Remove FC layers, softmaxes, etc.
    for net in (pred_net, init_net):
        remove_ops = []
        for op_idx, op in enumerate(net.op):
            matches_name = any(
                fragment in out
                for fragment in remove_layers_with
                for out in op.output)
            if op.type in remove_layers_type or matches_name:
                remove_ops.append(op_idx)
            op.device_option.CopyFrom(dev)
            strip_gpu_prefix(op.input)
            strip_gpu_prefix(op.output)

        strip_gpu_prefix(net.external_input)
        strip_gpu_prefix(net.external_output)

        dbg_removed = []
        # Pop from the back so earlier indices stay valid.
        for rop in reversed(remove_ops):
            for out in net.op[rop].output:
                for ext_idx, exout in enumerate(net.external_output):
                    if exout == out:
                        net.external_output.pop(ext_idx)
                        break
            net.op.pop(rop)
            # ``out`` is whatever the loop above left behind, i.e. the last
            # output of the op that was just removed.
            dbg_removed += [out]
        print('Removed Ops:', ' '.join(dbg_removed))

    pred_net = core.Net(pred_net)
    init_net = core.Net(init_net)

    # Flatten!  (disabled pooling head, kept for reference)
    # if pooling == 'global':
    #     pred_net.AveragePool2D([output_blob], ['output_code_1'], global_pooling=True)
    #     pred_net.AveragePool2D([output_blob2], ['output_code_2'], global_pooling=True)
    #     pred_net.Concat(['output_code_1','output_code_2'], ['output_code_','whocares'], axis=1)
    #     #pred_net.MaxPool2D([output_blob], ['output_code_'], global_pooling=True)
    #     pred_net.Flatten(['output_code_'], ['output_code'])
    # else:
    #     pred_net.AveragePool2D([output_blob], ['output_code_'], kernel=pooling, stride=pooling)
    #     pred_net.Flatten(['output_code_'], ['output_code'])
    # #pred_net.Copy([output_blob], ['output_code'])
    # output_blob = 'output_code'

    pred_net._net.external_output.append(output_blob)

    mh = model_helper.ModelHelper()
    mh.net = pred_net
    mh.param_init_net = init_net
    mh.RunAllOnGPU()

    # Feed a random square image so the predict net can be created.
    side = img_size
    fake_x = np.random.randn(1, 3, side, side).astype(np.float32)

    workspace.RunNetOnce(init_net)
    workspace.FeedBlob(input_blob, fake_x, device_option=dev)
    workspace.CreateNet(pred_net)

    # Disabled code-size probe, kept for reference:
    # workspace.RunNet(pred_net)
    # fake_code = workspace.FetchBlob(output_blob)
    # workspace.ResetWorkspace()
    # workspace.FeedBlob(input_blob, fake_x, device_option=dev)
    # assert((len(fake_code.shape) == 2 or all(fake_code.shape[2:]==1)) and 'code_size must be 1 dim')
    # code_size = fake_code.shape[1]

    return mh
# Convert HWC -> CHW
img = img.swapaxes(1, 2).swapaxes(0, 1)

# Convert CHW -> NCHW
img = np.array([img])

# Im info N x 3 tensor of (height, width, scale)
# Builtin float replaces np.float, which was only an alias for it and was
# removed in NumPy 1.24.
im_info = np.reshape(
    np.array([float(img.shape[2]), float(img.shape[3]), 1.0]), (1, -1))
im_info = im_info.astype('float32')

device_opts = core.DeviceOption(caffe2_pb2.CPU)

# Read the contents of the input protobufs into local variables
init_net = caffe2_pb2.NetDef()
with open(INIT_NET, 'rb') as f:
    init_net.ParseFromString(f.read())
    init_net.device_option.CopyFrom(device_opts)

predict_net = caffe2_pb2.NetDef()
with open(PREDICT_NET, "rb") as f:
    predict_net.ParseFromString(f.read())
    predict_net.device_option.CopyFrom(device_opts)

# Initialise the predictor from the input protobufs
p = workspace.Predictor(init_net, predict_net)

# Run the net and return predictions
results = p.run({'data': img, "im_info": im_info})
def test_slws_fused_8bit_rowwise_acc32_nnpi(self, seed):
    """Compare the onnxified fused 8-bit rowwise SLWS op with its FakeFP32NNPI reference.

    Random float data is quantized with FloatToFused8BitRowwiseQuantized, then
    the onnxified net and the reference net are run on the same blobs. The
    test fails if any output row shows a nonzero relative difference.
    """
    workspace.GlobalInit(
        [
            "caffe2",
            "--glow_global_fp16=0",
            "--glow_global_fused_scale_offset_fp16=0",
            "--glow_global_force_sls_fp16_accum=0",
        ]
    )
    np.random.seed(seed)
    workspace.ResetWorkspace()

    table_rows, row_width = 20000, 6
    segment_cap, segment_len_cap = 200, 200
    sls_inputs = ["quantized_data", "weights", "indices", "lengths"]

    # The random draws below stay in this order so that a given seed keeps
    # producing the same inputs.
    data = (4 * np.random.random_sample((table_rows, row_width)) + 1).astype(
        np.float32
    )
    # number of segments to run
    segment_count = np.random.randint(0, segment_cap + 1)
    lengths = np.random.randint(
        2, segment_len_cap + 1, size=segment_count
    ).astype(np.int32)
    indices = np.random.randint(
        low=0, high=table_rows, size=np.sum(lengths), dtype=np.int64
    )
    weights = np.random.uniform(
        low=0.01, high=0.5, size=[len(indices)]
    ).astype(np.float32)

    def single_op_net(net_name, op_type):
        # Build a one-operator NetDef reading the four SLWS inputs into "Y".
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(sls_inputs)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, sls_inputs, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI"
    )

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=segment_cap,
        max_seq_size=segment_cap * segment_len_cap,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    for blob_name, blob in (
        ("indices", indices),
        ("lengths", lengths),
        ("weights", weights),
    ):
        workspace.FeedBlob(blob_name, blob)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    rowwise_diff = np.max(diff, axis=1)
    if (rowwise_diff > 0).sum() <= 0:
        return

    print_test_debug_info(
        "test_slws_fused_8bit_rowwise_acc32_nnpi",
        {
            "indices": indices,
            "data": data.shape,
            "lengths": lengths,
            "weights": weights,
            "Y_glow": Y_glow,
            "Y_ref": Y_ref,
            "diff": diff,
            "rowwise_diff": np.max(diff, axis=1),
        },
    )
    assert 0
def Skip_test_tanhquantize(self, scale, zp, size, rand_seed):
    """Compare onnxified Tanh + Int8Quantize with the fused TanhQuantFakeFp16NNPI op.

    Args:
        scale: quantization scale passed as ``Y_scale`` to both nets.
        zp: quantization zero point passed as ``Y_zero_point`` to both nets.
        size: number of evenly spaced input samples taken from [-1, 1].
        rand_seed: seed applied to NumPy's global RNG.

    The test fails if the quantized data, scale or zero point of the two
    results differ.
    """
    np.random.seed(rand_seed)

    workspace.ResetWorkspace()

    # The net under test needs a name different from the reference net
    # ("ref"): both are registered with workspace.CreateNet, which does not
    # overwrite an existing net of the same name by default.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.append("X")
    pred_net.external_output.append("Y_q")

    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "Tanh", ["X"], ["Y"]
        )
    )

    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
        )
    )

    # Round-trip through float16 so every input value is exactly
    # representable in half precision.
    X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": X.shape},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # Both ops are expected to be lowered into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.CreateNet(pred_net_onnxified)
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchInt8Blob("Y_q")

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.append("X")
    ref_net.external_output.append("Y_q")

    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "TanhQuantFakeFp16NNPI", ["X"], ["Y_q"],
            Y_scale=scale, Y_zero_point=zp
        )
    )

    workspace.CreateNet(ref_net)
    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchInt8Blob("Y_q")

    if not np.array_equal(Y_ref.data, Y_glow.data) or \
       not Y_ref.scale == Y_glow.scale or \
       not Y_ref.zero_point == Y_glow.zero_point:
        print_test_debug_info(
            "tanhfusion",
            {
                "scale": scale,
                "zp": zp,
                "input": X,
                "ideal nonquant": np.tanh(X),
                "Y_glow": Y_glow,
                "Y_c2": Y_ref,
            }
        )
        assert(0)
def load_caffe2_net(file):
    """Read the file at ``file`` as bytes and parse it into a ``caffe2_pb2.NetDef``."""
    parsed = caffe2_pb2.NetDef()
    with open(file, "rb") as stream:
        serialized = stream.read()
        parsed.ParseFromString(serialized)
    return parsed
def Skip_test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine):
    """Compare onnxified LayerNorm with the LayerNormFakeFP16NNPI reference op.

    The reference net runs on the 2-D input; the onnxified net is then fed
    the same data reshaped with a leading axis of size 1. The "Y" outputs
    must agree under ``np.allclose``.
    """
    np.random.seed(seed)
    # Reset the workspace
    workspace.ResetWorkspace()

    axis = 1
    dims = np.array([batch_size, size])

    # Draw order (X, gamma, beta) is kept so seeds stay reproducible.
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    param_shape = X.shape[axis:]
    gamma = np.random.randn(*param_shape).astype(np.float32)
    beta = np.random.randn(*param_shape).astype(np.float32)

    all_inputs = ["X", "gamma", "beta"]
    all_outputs = ["Y", "mean", "rstd"]

    def layernorm_net(net_name, op_type):
        # One-operator NetDef; gamma and beta are only wired in when affine.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(["X", "gamma", "beta"])
        net.external_output.extend(["Y", "mean", "rstd"])
        net.op.add().CopyFrom(
            core.CreateOperator(
                op_type,
                all_inputs if elementwise_affine else ["X"],
                all_outputs,
                axis=axis,
                epsilon=epsilon,
                elementwise_affine=elementwise_affine,
            )
        )
        return net

    pred_net = layernorm_net("pred", "LayerNorm")
    pred_net_ref = layernorm_net("pred_ref", "LayerNormFakeFP16NNPI")

    shape_hints = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hints,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    onnxifi_op_count = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    for blob_name, blob in (("X", X), ("gamma", gamma), ("beta", beta)):
        workspace.FeedBlob(blob_name, blob)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")

    # The onnxified net gets the same data with an extra leading axis.
    X_glow = X.reshape(np.array([1, *dims]))
    workspace.FeedBlob("X", X_glow)

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow, Y_c2):
        return

    diff_Y = np.abs(Y_glow - Y_c2)
    print_test_debug_info(
        "layernorm",
        {
            "seed": seed,
            "size": size,
            "batch_size": batch_size,
            "epsilon": epsilon,
            "gamma": gamma,
            "beta": beta,
            "elementwise_affine": elementwise_affine,
            "X": X,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff_Y": diff_Y,
        })
    assert (0)
def prepare(cls, model, device='CPU', **kwargs):
    '''
    Build a runnable backend representation for an ONNX model.

    For Onnx Caffe2Backend, we require that init_graph doesn't initialize
    the actual input of the predict_graph. For example, if "img" is the
    input blob for the predict_net, we require that in init_graph and in the
    initializer of the predict_graph, "img" is not initialized. We don't
    have a check for this, since there is no way we can know which blob is
    the input of the predict_graph.

    Args:
        model: the ONNX model; its opset_import, ir_version and graph are
            read here.
        device: device string; 'CPU' (with no RNN nodes in the optimized
            predict graph) selects the C++ backend, anything else the
            Python path.
        **kwargs: forwarded unchanged to the parent class's ``prepare``.

    Returns:
        A ``Caffe2CppRep`` wrapping the C++ backend representation, or a
        ``Caffe2Rep`` built from the Python-translated nets.

    Raises:
        RuntimeError: if no default-domain opset is declared and the model's
            IR version is >= 3.
    '''
    super(Caffe2Backend, cls).prepare(model, device, **kwargs)

    # Pick the opset version from the default-domain import (missing or
    # empty domain); every other domain only triggers a warning.
    opset_version = None
    for imp in model.opset_import:
        if not imp.HasField("domain") or imp.domain == "":
            opset_version = imp.version
            if imp.version > cls._known_opset_version:
                warnings.warn("This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}.  We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail.".format(cls._known_opset_version, imp.version))
        else:
            warnings.warn("Unrecognized operator set {}".format(imp.domain))
    if opset_version is None:
        if model.ir_version >= 0x00000003:
            raise RuntimeError("Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)")
        else:
            # Models with IR version < 3 fall back to opset version 1.
            opset_version = 1

    # Check whether we have RNN related ops
    pred_model = ModelProto()
    pred_model.ParseFromString(cls.optimize_onnx(model.SerializeToString(), predict=True))
    cls._inplace_rewrite(pred_model.graph)
    rnn_nodes = []
    for node in pred_model.graph.node:
        if node.op_type in {'LSTM', 'GRU', 'RNN'}:
            rnn_nodes.append(node)

    # Build the C++ backend
    # TODO: build a predictor that supports GPU
    #       And for RNN nets, we need to avoid adding init_net
    if device == 'CPU' and not rnn_nodes:
        c2_rnn_ops = []
        # NOTE(review): the enclosing condition requires rnn_nodes to be
        # empty, so this branch cannot run as written -- confirm intent.
        if rnn_nodes:
            init_model = ModelProto()
            init_model.ParseFromString(cls.optimize_onnx(model.SerializeToString(), init=True))
            cls._inplace_rewrite(init_model.graph)
            for node in rnn_nodes:
                c2ops = cls._onnx_node_to_caffe2_op(
                    init_model, pred_model, node, opset_version)
                # The C++ side receives the ops in serialized form.
                init_ops = [x.SerializeToString() for x in c2ops.init_ops]
                ops = [x.SerializeToString() for x in c2ops.ops]
                external_inputs = c2ops.interface_blobs
                c2_rnn_ops.append(C.Caffe2Ops(init_ops, ops, external_inputs))
            del init_model

        cbackend = C.Caffe2Backend()
        rep = cbackend.prepare(model.SerializeToString(), device, c2_rnn_ops)
        # For testing
        # Dump the net descriptions to file for comparison with the Python ones
        if "ONNX_CAFFE2_DEBUG" in os.environ:
            pred_net_str = rep.pred_net()
            pn = caffe2_pb2.NetDef()
            pn.ParseFromString(pred_net_str)
            init_net_str = rep.init_net()
            inn = caffe2_pb2.NetDef()
            inn.ParseFromString(init_net_str)
            # Only the predict net is written; the init net is parsed but
            # not dumped.
            with open("cpp.txt", "w") as f:
                f.write("pred_net: \n{}".format(pn))

        rep_wrapper = Caffe2CppRep(rep)
        return rep_wrapper
    else:
        ws = Workspace()
        device_option = get_device_option(Device(device))

        # Directly load initializer data into blobs in workspace
        cls._direct_initialize_parameters(
            model.graph.initializer,
            ws,
            device_option,
        )

        initialized = {init.name for init in model.graph.initializer}

        cls._direct_initialize_inputs(
            model.graph.input,
            initialized,
            ws,
            device_option,
        )

        # Graph inputs without an initializer are handed to Caffe2Rep as
        # the uninitialized inputs.
        uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized]

        init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False)
        if "ONNX_CAFFE2_DEBUG" in os.environ:
            with open("python.txt", "w") as f:
                f.write("pred_net: \n{}".format(predict_net))
        retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
        return retval
def ExtractPredictorNet(
    net_proto,
    input_blobs,
    output_blobs,
    device=None,
    renames=None,
    disabled_inputs=None,
):
    '''
    Takes a model net for training and returns a net which can be
    used for prediction. For example, all gradient operators and
    input operators are removed.
    @param net_proto protobuf of the net you want to process (net.Proto())
    @param input_blobs list/set of blob names that are the inputs of predictor
    @param output_blobs list/set of blob names that are outputs of predictor
    @param device optional device option that is assigned
    @param renames dictionary of blob name to a new name (optional)
    @param disabled_inputs optional set of blobs that are 'switched off'. This
                will cause branches with those blobs as inputs to be removed
    @return tuple (predict_net, external_inputs) where predict_net is the new
                core.Net and external_inputs is the list of the net's external
                inputs other than the (renamed) input_blobs
    @raises Exception if no op consumes any of input_blobs, if no op produces
                any of output_blobs, or if an included op has is_test=0
    '''
    predict_net = core.Net(net_proto.name + "_predict")
    predict_proto = predict_net.Proto()

    orig_external_inputs = set(net_proto.external_input)
    orig_external_outputs = set(net_proto.external_output)
    input_blobs = {str(b) for b in input_blobs}
    # Blobs whose values are available so far: the original external inputs
    # plus the predictor inputs. Outputs of included ops are added below.
    known_blobs = set(orig_external_inputs).union(input_blobs)
    output_blobs = {str(b) for b in output_blobs}
    external_inputs = set(input_blobs)
    external_outputs = set(output_blobs)

    if renames is None:
        renames = {}

    if disabled_inputs is not None:
        # Treat disabled blobs as unknown so ops consuming them are skipped.
        known_blobs = known_blobs - set(disabled_inputs)

    ops = list(net_proto.op)

    # Find the range of ops that we should include
    try:
        first_op_with_input = min([
            j for j in range(len(ops))
            if input_blobs.intersection(ops[j].input) and
            ops[j].type != 'StopGradient'
        ])
    except ValueError:
        # min() of an empty list: no op reads any of the requested inputs.
        raise Exception("No ops with input={}".format(input_blobs))
    try:
        last_op_with_output = max([
            j for j in range(len(ops))
            if output_blobs.intersection(ops[j].output)
        ])
    except ValueError:
        # max() of an empty list: no op writes any of the requested outputs.
        raise Exception("No ops with output={}".format(output_blobs))

    def validate_op(op):
        # Check that the op does not have is_test = 0 set. This is a common
        # pitfall with SpatialBN op, at least.
        for arg in op.arg:
            if arg.name == "is_test" and arg.i == 0:
                raise Exception(
                    "An operator had is_test=0, did you try to extract a " +
                    "predictor from a train model (instead of test model)?" +
                    " Op was: {}".format(str(op)))

    # Iterate through the ops and only include those whose inputs
    # we can satisfy.
    for op in ops[first_op_with_input:(last_op_with_output + 1)]:
        if known_blobs.issuperset(op.input):

            # Special handling for recurrent nets
            # TODO: when standard argument type for "nets" is introduced,
            # this can be more general
            if op.type == 'RecurrentNetwork':
                import google.protobuf.text_format as protobuftx
                for arg in op.arg:
                    if arg.name == 'backward_step_net':
                        # The backward step net is blanked out.
                        arg.s = b""
                    elif arg.name == 'step_net':
                        # The step net is stored as text-format protobuf:
                        # parse it, patch the device options, write it back.
                        step_proto = caffe2_pb2.NetDef()
                        protobuftx.Merge(arg.s.decode("ascii"), step_proto)
                        for step_op in step_proto.op:
                            if device is not None:
                                step_op.device_option.device_type = device.device_type
                                step_op.device_option.cuda_gpu_id = device.cuda_gpu_id

                        # Add additional external inputs
                        external_inputs.update(
                            set(step_proto.external_input).intersection(
                                orig_external_inputs))
                        arg.s = str(step_proto).encode("ascii")

            if device is not None:
                op.device_option.device_type = device.device_type
                op.device_option.cuda_gpu_id = device.cuda_gpu_id
            validate_op(op)
            predict_proto.op.extend([op])
            # Outputs of an included op can feed later ops.
            known_blobs.update(op.output)
            external_inputs.update(
                set(op.input).intersection(orig_external_inputs))
            external_outputs.update(
                set(op.output).intersection(orig_external_outputs))

        else:
            logging.debug("Op {} had unknown inputs: {}".format(
                op.type, set(op.input).difference(known_blobs)))

    def rename_list(proto_list):
        # proto lists don't support assignments, so rename on a copy and
        # then replace the contents in place
        new_list = proto_list[:]
        for j, b in enumerate(new_list):
            if b in renames:
                new_list[j] = renames[b]

        del proto_list[:]
        proto_list.extend(new_list)

    # Predictor net's external inputs and outputs include only those
    # that are part of this net.
    predict_proto.external_input.extend(external_inputs)
    predict_proto.external_output.extend(external_outputs)

    rename_list(predict_proto.external_input)
    rename_list(predict_proto.external_output)

    # Apply the same renames to the requested inputs so they can be
    # subtracted from the returned external-input list.
    renamed_input_blobs = []
    for b in input_blobs:
        if b in renames:
            renamed_input_blobs.append(renames[b])
        else:
            renamed_input_blobs.append(b)

    for op in predict_proto.op:
        rename_list(op.input)
        rename_list(op.output)

    return predict_net, list(
        set(predict_proto.external_input) - set(renamed_input_blobs))
def test_batch_matmul(self, M, K, N, C, rand_seed, trans_a, trans_b, run_ints):
    """Compare onnxified BatchMatMul with the BatchMatMulFP16Acc32Fake reference op.

    Operands have batch size ``C`` and are either small integers stored as
    float32 (``run_ints``) or floats scaled to [-50, 50). ``trans_a`` and
    ``trans_b`` swap the last two axes of the matching operand and are also
    passed to both operators.
    """
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    batch_dims = [C]

    def make_operand(rows, cols, transposed):
        # X is drawn before Y by the callers below, so the RNG stream is
        # consumed in a fixed order.
        if run_ints:
            operand = np.random.randint(
                low=1, high=3, size=(C, rows, cols)
            ).astype(np.float32)
        else:
            operand = 100 * (
                np.random.rand(*(batch_dims + [rows, cols])).astype(np.float32)
                - 0.5
            )
        return operand.swapaxes(-1, -2) if transposed else operand

    X = make_operand(M, K, trans_a)
    Y = make_operand(K, N, trans_b)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "Y"])
    pred_net.external_output.append("out")
    matmul_op = core.CreateOperator(
        'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
    )
    pred_net.op.add().CopyFrom(matmul_op)

    pred_net_ref = core.Net("pred_net_ref")

    # Reference updated to fp16 with fp32 accumulation
    pred_net_ref.BatchMatMulFP16Acc32Fake(
        ["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b)

    print("dims", batch_dims, X.shape, Y.shape)
    shape_hints = {"X": X.shape, "Y": Y.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hints,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )

    onnxifi_op_count = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Run Glow net
    workspace.RunNet(pred_net_onnxified.name)
    out_glow = workspace.FetchBlob('out')

    # Run caffe2 net
    workspace.RunNet(pred_net_ref)
    out_c2_fakefp16 = workspace.FetchBlob('out')

    diff = np.abs(out_c2_fakefp16 - out_glow)

    if np.allclose(out_glow, out_c2_fakefp16):
        return

    print_test_debug_info("bmm", {
        "seed": rand_seed,
        "m": M, "k": K,
        "n": N, "X": X.shape, "Y": Y.shape,
        "trans_a": trans_a,
        "trans_b": trans_b,
        "run_ints": run_ints,
        "out_glow": out_glow,
        "out_c2_fakefp16": out_c2_fakefp16,
        "diff": diff
    })
    assert (0)
def test_convolution_relu_fusion(self, stride, pad, kernel, size,
                                 input_channels, output_channels,
                                 batch_size, use_bias, group, gc, dc):
    """Check Conv+ReLU fusion, both hand-built and produced by optimizeForMKLDNN.

    A Conv followed by an in-place Relu on ``dc[0]`` gives the baseline.
    It is compared first with a manually created ConvFusion op on ``dc[1]``
    and then with the single op that ``optimizeForMKLDNN`` leaves in a
    Conv + Relu net.
    """
    def blob_names(tag):
        # The bias blob is only an operator input when use_bias is set.
        names = ["X" + tag, "w" + tag]
        if use_bias:
            names.append("b" + tag)
        return names

    conv = core.CreateOperator(
        "Conv",
        blob_names("0"),
        ["Y0"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[0],
    )
    relu = core.CreateOperator(
        "Relu", ["Y0"], ["Y0"], device_option=dc[0]
    )

    # Manual fusion for Conv + ReLU
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        blob_names("1"),
        ["Y1"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        fusion_type=1,
        device_option=dc[1],
    )

    X = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    def feed_inputs(tag, device):
        # The bias is always fed, even when the operators do not read it.
        for prefix, value in (("X", X), ("w", w), ("b", b)):
            workspace.FeedBlob(prefix + tag, value, device)

    def check_against_baseline(candidate, baseline):
        if np.allclose(baseline, candidate, atol=0.01, rtol=0.01):
            return
        print(candidate.flatten())
        print(baseline.flatten())
        print(np.max(np.abs(candidate - baseline)))
        self.assertTrue(False)

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)

    feed_inputs("0", dc[0])
    workspace.RunOperatorOnce(conv)
    workspace.RunOperatorOnce(relu)
    Y0 = workspace.FetchBlob('Y0')

    workspace.ResetWorkspace()
    feed_inputs("1", dc[1])
    workspace.RunOperatorOnce(conv_fusion)
    Y1 = workspace.FetchBlob('Y1')
    check_against_baseline(Y1, Y0)

    # Auto fusion for Conv + ReLU
    workspace.ResetWorkspace()
    old_net = caffe2_pb2.NetDef()
    moved_ops = []
    for template in (conv, relu):
        # Same operators, retargeted to dc[1].
        moved = caffe2_pb2.OperatorDef()
        moved.CopyFrom(template)
        moved.device_option.CopyFrom(dc[1])
        moved_ops.append(moved)
    old_net.op.extend(moved_ops)

    feed_inputs("0", dc[1])
    net = core.Net("net")
    net.Proto().CopyFrom(old_net)
    optimizeForMKLDNN(net)
    self.assertTrue(len(net.Proto().op) == 1)
    self.assertTrue(net.Proto().op[0].type == "ConvFusion")
    workspace.RunOperatorOnce(net.Proto().op[0])
    Y2 = workspace.FetchBlob('Y0')
    check_against_baseline(Y2, Y0)

    workspace.SwitchWorkspace(old_ws_name)
def read_init_net_pbtxt(init_net_file):
    """Parse a text-format protobuf file into a ``caffe2_pb2.NetDef``.

    Args:
        init_net_file: path of the text-format (pbtxt) net definition.

    Returns:
        The ``NetDef`` populated by ``text_format.Merge``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(init_net_file, "r") as f:
        init_net_txt = f.read()
    init_net = caffe2_pb2.NetDef()
    text_format.Merge(init_net_txt, init_net)
    return init_net
def test_convolution_affch_folding(self, stride, pad, kernel, size,
                                   input_channels, output_channels,
                                   batch_size, use_bias, group, inplace,
                                   gc, dc):
    """Check that optimizeForMKLDNN folds Conv + AffineChannel into one Conv.

    Conv followed by AffineChannel is run op by op for the baseline. The
    same two ops are then placed in a net, optimized, and the single
    remaining op must have type "Conv" and reproduce the baseline output.
    """
    result_blob = 'X1' if inplace else "Y"

    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["X1"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[1],
    )
    affch = core.CreateOperator(
        "AffineChannel",
        ["X1", "scale", "bias"],
        ["X1" if inplace else "Y"],
        device_option=dc[1],
    )

    # The draw order (X, w, b, scale, bias) is part of the test's behavior.
    X = np.random.rand(
        batch_size, input_channels * group, size, size
    ).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    scale = np.random.rand(output_channels).astype(np.float32) + 0.5
    bias = np.random.rand(output_channels).astype(np.float32) - 0.5

    def feed_inputs():
        for blob_name, value in (
            ('X0', X), ('w0', w), ('b0', b), ('scale', scale), ('bias', bias)
        ):
            workspace.FeedBlob(blob_name, value, dc[1])

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)

    feed_inputs()
    workspace.RunOperatorOnce(conv)
    workspace.RunOperatorOnce(affch)
    Y = workspace.FetchBlob(result_blob)

    workspace.ResetWorkspace()
    old_net = caffe2_pb2.NetDef()
    copied_ops = []
    for template in (conv, affch):
        duplicate = caffe2_pb2.OperatorDef()
        duplicate.CopyFrom(template)
        duplicate.device_option.CopyFrom(dc[1])
        copied_ops.append(duplicate)
    old_net.op.extend(copied_ops)

    feed_inputs()
    net = core.Net("net")
    net.Proto().CopyFrom(old_net)
    optimizeForMKLDNN(net)
    self.assertTrue(len(net.Proto().op) == 1)
    self.assertTrue(net.Proto().op[0].type == "Conv")
    workspace.RunOperatorOnce(net.Proto().op[0])
    Y1 = workspace.FetchBlob(result_blob)

    outputs_match = np.allclose(Y, Y1, atol=0.01, rtol=0.01)
    if not outputs_match:
        print(Y.flatten())
        print(Y1.flatten())
        print(np.max(np.abs(Y - Y1)))
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
def read_init_net(init_net_file):
    """Parse a binary serialized protobuf file into a ``caffe2_pb2.NetDef``.

    Args:
        init_net_file: path of the binary-serialized net definition.

    Returns:
        The ``NetDef`` populated by ``ParseFromString``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(init_net_file, "rb") as f:
        init_net_pb = f.read()
    init_net = caffe2_pb2.NetDef()
    init_net.ParseFromString(init_net_pb)
    return init_net
## This is a helper script that generates simple Caffe2 models.

from caffe2.proto import caffe2_pb2
from caffe2.python import utils

# Define a weights network
weights = caffe2_pb2.NetDef()
weights.name = "init"

# First op: a "fake_data_provider" operator producing the "data" blob.
op = caffe2_pb2.OperatorDef()
op.type = "fake_data_provider"
op.output.extend(["data"])
weights.op.extend([op])
weights.external_output.extend(op.output)

# Two identical GivenTensorFill ops: a [1, 4] tensor of ones for each of
# "fc_w" and "fc_b". Each output is also registered as an external output.
for _fill_output in ("fc_w", "fc_b"):
    op = caffe2_pb2.OperatorDef()
    op.type = "GivenTensorFill"
    op.output.extend([_fill_output])
    op.arg.extend([
        utils.MakeArgument("shape", [1, 4]),
        utils.MakeArgument("values", [1.0] * 4),
    ])
    weights.op.extend([op])
    weights.external_output.extend(op.output)
def read_predict_net(predict_net_file):
    """Parse a text-format protobuf file into a ``caffe2_pb2.NetDef``.

    The net's name is set to "the_model" before the file contents are merged
    in.

    Args:
        predict_net_file: path of the text-format net definition.

    Returns:
        The ``NetDef`` populated by ``text_format.Merge``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(predict_net_file, "r") as f:
        predict_net_txt = f.read()
    predict_net = caffe2_pb2.NetDef()
    predict_net.name = "the_model"
    text_format.Merge(predict_net_txt, predict_net)
    return predict_net
def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight):
    """Compare the onnxified fused 8-bit rowwise SLWS op with its FakeFP16NNPI reference.

    Random data, segment lengths, indices and weights are generated from
    ``seed``. The data is quantized with FloatToFused8BitRowwiseQuantized
    and both nets are run on the same blobs. The test fails if any output
    row shows a nonzero relative difference.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    sls_inputs = ["quantized_data", "weights", "indices", "lengths"]
    candidate_ids = np.arange(1, num_rows)

    # Draw order: data, lengths, one index draw per segment, then weights.
    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    lengths = np.random.choice(candidate_ids, batch_size).astype(np.int32)
    indices = [
        row_id
        for length in lengths
        for row_id in np.random.choice(candidate_ids, length)
    ]
    indices = np.asarray(indices).astype(np.int64)

    weights = np.random.uniform(
        low=0,
        high=max_weight,
        size=[len(indices)],
    ).astype(np.float32)

    assert (len(weights) < 64000)

    def single_op_net(net_name, op_type):
        # One-operator NetDef reading the four SLWS inputs into "Y".
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(sls_inputs)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, sls_inputs, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI"
    )

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    onnxifi_op_count = sum(
        1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    for blob_name, blob in (
        ("indices", indices),
        ("lengths", lengths),
        ("weights", weights),
    ):
        workspace.FeedBlob(blob_name, blob)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    rowwise_diff = np.max(diff, axis=1)
    if (rowwise_diff > 0).sum() <= 0:
        return

    print_test_debug_info(
        "slws_fused_8bit_rowwise_inv_scale",
        {
            "seed": seed,
            "num_rows": num_rows,
            "embedding_dim": embedding_dim,
            "batch_size": batch_size,
            "max_weight": max_weight,
            "indices": indices,
            "data": data.shape,
            "lengths": lengths,
            "weights": weights,
            "Y_glow": Y_glow,
            "Y_ref": Y_ref,
            "diff": diff,
            "rowwise_diff": np.max(diff, axis=1),
        },
    )
    assert 0
def test_onnx_while_fibb(self, condition, max_trip_count, save_scopes,
                         disable_scopes, seed, gc, dc):
    """Check the ONNXWhile operator against a Python Fibonacci reference.

    The loop body adds the two loop-carried values ("first", "second") into
    "third", prints it, and sets the continuation condition to
    ``third < 100``. The operator result is compared with ``ref`` through
    ``assertReferenceChecks``.

    Args:
        condition: initial loop condition fed to the operator.
        max_trip_count: trip-count limit fed to the operator.
        save_scopes: forwarded to the operator; forced to False when
            ``disable_scopes`` is set.
        disable_scopes: forwarded to the operator.
        seed: seed applied to NumPy's global RNG.
        gc: device option passed to ``assertReferenceChecks``.
        dc: unused here.
    """
    np.random.seed(seed)
    if disable_scopes:
        save_scopes = False

    # Create body net
    body_net = caffe2_pb2.NetDef()
    # Two loop carried dependencies: first and second
    body_net.external_input.extend(['i', 'cond', 'first', 'second'])
    body_net.external_output.extend(
        ['cond_new', 'second', 'third', 'third'])
    add_op = core.CreateOperator(
        'Add',
        ['first', 'second'],
        ['third'],
    )
    print3 = core.CreateOperator(
        'Print',
        ['third'],
        [],
    )
    limit_const = core.CreateOperator(
        'ConstantFill',
        [],
        ['limit_const'],
        shape=[1],
        dtype=caffe2_pb2.TensorProto.FLOAT,
        value=100.0,
    )
    cond = core.CreateOperator(
        'LT',
        ['third', 'limit_const'],
        ['cond_new'],
    )
    body_net.op.extend([add_op, print3, limit_const, cond])

    while_op = core.CreateOperator(
        'ONNXWhile',
        ['max_trip_count', 'condition', 'first_init', 'second_init'],
        ['first_a', 'second_a', 'third_a'],
        body=body_net,
        has_cond=True,
        has_trip_count=True,
        save_scopes=save_scopes,
        disable_scopes=disable_scopes,
    )

    # The builtin bool is used as the dtype: the np.bool alias was removed
    # in NumPy 1.24 and was only ever an alias for the builtin.
    condition_arr = np.array(condition).astype(bool)
    max_trip_count_arr = np.array(max_trip_count).astype(np.int64)
    first_init = np.array([1]).astype(np.float32)
    second_init = np.array([1]).astype(np.float32)

    def ref(max_trip_count, condition, first_init, second_init):
        # Pure-Python model of the loop. It starts from 1, 1 (the same
        # values fed as first_init / second_init) and stops once the new
        # term exceeds 100.
        first = 1
        second = 1
        results = []
        if condition:
            for _ in range(max_trip_count):
                third = first + second
                first = second
                second = third
                results.append(third)
                if third > 100:
                    break
        return (first, second, np.array(results).astype(np.float32))

    self.assertReferenceChecks(
        gc,
        while_op,
        [max_trip_count_arr, condition_arr, first_init, second_init],
        ref,
    )
def test_small_sls(self, seed):
    """Run a two-row fused 8-bit rowwise SLWS case through both nets.

    A single segment covering both rows is summed by the onnxified net and
    by the FakeFP16NNPI reference net. Any nonzero relative difference
    prints both results and fails the test.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    row_count, row_width = 2, 3
    sls_inputs = ["quantized_data", "weights", "indices", "lengths"]

    data = 4 * (
        np.random.random_sample((row_count, row_width)) + 1
    ).astype(np.float32)
    lengths = np.array([row_count], dtype=np.int32)
    indices = np.arange(row_count, dtype=np.int64)
    weights = np.random.uniform(
        low=0.01, high=0.5, size=[row_count]
    ).astype(np.float32)

    def single_op_net(net_name, op_type):
        # One-operator NetDef reading the four SLWS inputs into "Y".
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(sls_inputs)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, sls_inputs, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI"
    )

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)

    quantized_data = workspace.FetchBlob("quantized_data")

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=1,
        max_seq_size=row_count,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    onnxifi_op_count = sum(
        1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    for blob_name, blob in (
        ("indices", indices),
        ("lengths", lengths),
        ("weights", weights),
    ):
        workspace.FeedBlob(blob_name, blob)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    rowwise_diff = np.max(diff, axis=1)
    if (rowwise_diff > 0).sum() <= 0:
        return

    np.set_printoptions(precision=12)
    print(
        "ref",
        Y_ref.astype(np.float16).astype(np.float32),
        "glow",
        Y_glow.astype(np.float16).astype(np.float32),
    )
    print_test_debug_info(
        "slws_fused_8bit_rowwise_inv_scale",
        {
            "seed": seed,
            "indices": indices,
            "data": data,
            "quantized_data": quantized_data,
            "lengths": lengths,
            "weights": weights,
            "Y_glow": Y_glow,
            "Y_ref": Y_ref,
            "diff": diff,
            "rowwise_diff": np.max(diff, axis=1),
        },
    )
    assert 0
# Add the data input layer to the model, pointing at the TEST_LMDB
data, _ = AddInputLayer(test_model, 1, TEST_LMDB, 'lmdb')


# ### Populate the Model Helper with Saved Model Params
#
# To format a model for testing, we do not need to create params in the model
# helper, nor do we need to add gradient operators as we will only be
# performing forward passes. All we really need to do is populate the *.net*
# and *.param_init_net* members of the model helper with the contents of the
# saved *predict_net.pb* and *init_net.pb*, respectively. To accomplish this,
# we construct *caffe2_pb* objects with the protobuf from the pb files, create
# *Net* objects with the *caffe2_pb* objects, then **append** the net objects
# to the *.net* and *.param_init_net* members of the model helper. Appending is
# very important here! If we do not append, we would wipe out the input data
# layer stuff that we just added.
#
# Recall from Part 1, the saved model expected an input named *data* and
# produced an output called *softmax*. Conveniently (but not accidentally), the
# *AddInputLayer* function reads from the lmdb and puts the information into
# the workspace in a blob called *data*. It is also important to remember what
# each of the saved nets that we are appending to our model contains. The
# *predict_net* contains the structure of the model, including the ops involved
# in the forward pass. It has the definitions of the convolutional, pooling,
# and fc layers in the model. The *init_net* contains the weight
# initializations for the parameters that the ops in the *predict_net* expect.
# For example, if there is an op in the *predict_net* named 'fc1', the
# *init_net* will contain the trained weights (*fc1_w*), and biases (*fc1_b*)
# for that layer.
#
# After we append the nets, we add an accuracy layer to the model which uses
# the *softmax* output from the saved model and the *label* input from the
# lmdb. Note, we could manually fetch the softmax blob from the workspace after
# every iteration and check whether or not the class with the highest softmax
# score is the true label, but instead we opt for the simpler accuracy layer.

# In[5]:


# Populate the model helper obj with the init net stuff, which provides the
#    weight initializations for the model.
# The .pb files hold binary serialized protobufs, so they are opened in binary
# mode: ParseFromString expects the raw bytes.
init_net_proto = caffe2_pb2.NetDef()
with open(INIT_NET, "rb") as f:
    init_net_proto.ParseFromString(f.read())
test_model.param_init_net = test_model.param_init_net.AppendNet(
    core.Net(init_net_proto))

# Populate the model helper obj with the predict net stuff, which defines
#    the structure of the model
predict_net_proto = caffe2_pb2.NetDef()
with open(PREDICT_NET, "rb") as f:
    predict_net_proto.ParseFromString(f.read())
test_model.net = test_model.net.AppendNet(core.Net(predict_net_proto))

# Add an accuracy feature to the model for convenient reporting during testing
accuracy = brew.accuracy(test_model, ['softmax', 'label'], 'accuracy')
def TranslateModel(
    cls,
    caffe_net,
    pretrained_net,
    is_test=False,
    input_mean=None,
    net_state=None,
):
    """Translate a Caffe network definition into a Caffe2 net plus its params.

    Args:
        caffe_net: Caffe net definition; only its new-style ``layer`` field
            is translated.
        pretrained_net: Caffe net carrying trained blobs. Layers are matched
            to ``caffe_net`` layers by name, looking in both the ``layer``
            and the ``layers`` fields.
        is_test: forwarded unchanged to ``cls.TranslateLayer``.
        input_mean: optional path to a serialized Caffe ``BlobProto`` mean
            file. When given, a ``Sub`` operator computing
            ``data = data_ - mean_`` is prepended and ``mean_`` is added to
            the returned params.
        net_state: Caffe ``NetState`` used to decide which layers to include;
            a default-constructed ``NetState`` is used when ``None``.

    Returns:
        A ``(net, net_params)`` tuple: the ``caffe2_pb2.NetDef`` holding the
        translated operators and the ``caffe2_pb2.TensorProtos`` holding the
        translated parameters.

    Raises:
        ValueError: if ``caffe_net`` has no new-style layers, or if more than
            one pretrained layer shares a name with a layer being translated.
        AssertionError: if a pretrained BatchNorm layer is not immediately
            followed by a Scale layer.
    """
    net_state = caffe_pb2.NetState() if net_state is None else net_state
    net = caffe2_pb2.NetDef()
    net.name = caffe_net.name
    net_params = caffe2_pb2.TensorProtos()
    if len(caffe_net.layer) == 0:
        raise ValueError(
            'I think something is wrong. This translation script '
            'only accepts new style layers that are stored in the '
            'layer field.')
    if input_mean:
        caffenet_mean = caffe_pb2.BlobProto()
        # Use a context manager so the mean file handle is always closed.
        with open(input_mean, 'rb') as mean_file:
            caffenet_mean.ParseFromString(mean_file.read())
        mean_ = utils.CaffeBlobToNumpyArray(caffenet_mean)
        mean_tensor = utils.NumpyArrayToCaffe2Tensor(mean_, 'mean_')
        net_params.protos.extend([mean_tensor])
        mean_op = caffe2_pb2.OperatorDef()
        mean_op.type = 'Sub'
        mean_op.input.extend(['data_', 'mean_'])
        # Assume that input blob's name is "data"
        mean_op.output.extend(['data'])
        net.op.extend([mean_op])

    i = 0
    num_layers = len(caffe_net.layer)
    while i < num_layers:
        layer = caffe_net.layer[i]
        if not _ShouldInclude(net_state, layer):
            log.info('Current net state does not need layer {}'.format(
                layer.name))
            # Advance past the excluded layer; without this the loop would
            # re-test the same layer forever.
            i += 1
            continue
        log.info('Translate layer {}'.format(layer.name))

        # Find the pretrained layer with the same name. Each match remembers
        # the repeated field it came from, so an index found in the legacy
        # `layers` field is never used to index into `layer`.
        matches = [
            (container, idx)
            for container in (pretrained_net.layer, pretrained_net.layers)
            for idx, candidate in enumerate(container)
            if candidate.name == layer.name
        ]

        is_bn = False
        if len(matches) > 1:
            raise ValueError(
                'huh? more than one pretrained layer of one name?')
        elif len(matches) == 1:
            container, idx = matches[0]
            pretrained_layer = container[idx]
            if pretrained_layer.type == "BatchNorm":
                # A Scale layer should follow BatchNorm layer
                # according to paper https://arxiv.org/abs/1502.03167.
                assert (idx + 1 < len(container) and
                        container[idx + 1].type == "Scale"), (
                    'BatchNorm layer {} must be followed by a Scale '
                    'layer'.format(layer.name))
                # BatchNorm blobs first, then the blobs of the Scale layer
                # that follows it; both are handed to a single translation.
                pretrained_blobs = [
                    utils.CaffeBlobToNumpyArray(blob)
                    for source in (pretrained_layer, container[idx + 1])
                    for blob in source.blobs
                ]
                is_bn = True
            else:
                pretrained_blobs = [
                    utils.CaffeBlobToNumpyArray(blob)
                    for blob in pretrained_layer.blobs
                ]
        else:
            # No pretrained layer for the given layer name. We'll just pass
            # no parameter blobs.
            pretrained_blobs = []

        operators, params = cls.TranslateLayer(layer, pretrained_blobs,
                                               is_test)
        net.op.extend(operators)
        net_params.protos.extend(params)
        # A BatchNorm consumed the blobs of the Scale layer after it, so the
        # next layer of caffe_net is skipped as well (assumes caffe_net has
        # the same BatchNorm/Scale ordering as pretrained_net).
        i += 2 if is_bn else 1
    return net, net_params
def _test_binary_op_graph(self, name):
    """Compare the Onnxifi-lowered binary op `name` with its FakeFp16 twin.

    Builds a one-op net for `name` and a reference net for
    `name + "FakeFp16"`, lowers the former with `onnxifi_caffe2_net`, checks
    that exactly one "Onnxifi" op results, then runs both nets on fresh
    random inputs for several iterations. On any mismatch it prints debug
    info and fails.
    """
    workspace.ResetWorkspace()

    # First dimension is the batch size
    dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))

    def draw_operand():
        # One float32 tensor of shape `dims`, uniform in [-100, 100).
        sample = np.random.uniform(low=-100.0, high=100.0, size=dims)
        return sample.astype(np.float32)

    A = draw_operand()
    B = draw_operand()
    print(A.shape, B.shape)

    # Net that gets lowered to the backend.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["A", "B"])
    pred_net.external_output.append("C")
    lowered_op = core.CreateOperator(name, ["A", "B"], ["C"])
    pred_net.op.add().CopyFrom(lowered_op)

    # Reference net running the fake-fp16 emulation op.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "ref"
    pred_net_ref.external_input.extend(["A", "B"])
    pred_net_ref.external_output.append("C_ref")
    reference_op = core.CreateOperator(name + "FakeFp16", ["A", "B"],
                                       ["C_ref"])
    pred_net_ref.op.add().CopyFrom(reference_op)

    shape_hints = {"A": A.shape, "B": B.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hints,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    print(pred_net_onnxified)
    onnxifi_ops = [
        op for op in pred_net_onnxified.op if op.type == "Onnxifi"
    ]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("A", A)
    workspace.FeedBlob("B", B)
    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    rounds = 10
    for _ in range(rounds):
        lhs = draw_operand()
        rhs = draw_operand()
        workspace.FeedBlob("A", lhs)
        workspace.FeedBlob("B", rhs)

        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        expected = workspace.FetchBlob("C_ref")

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        actual = workspace.FetchBlob("C")

        # Results should be identical since we are comparing with the C2
        # emulation
        if np.allclose(expected, actual):
            continue

        relative_error = np.abs((actual - expected) / (expected + kEpsilon))
        debug_payload = {
            "dims": dims,
            "A": lhs,
            "B": rhs,
            "Y_glow": actual,
            "Y_c2": expected,
            "diff": relative_error,
        }
        print_test_debug_info(name, debug_payload)
        assert 0