Example #1
    def test_int8_convolution(self, stride, pad, kernel, size, input_channels,
                              output_channels, batch_size, use_bias, gc, dc):
        X = np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) - 0.5
        w = np.random.rand(output_channels, input_channels, kernel,
                           kernel).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        conv_fp32 = core.CreateOperator(
            "Conv",
            ["X_fp32", "w_fp32", "b_fp32"]
            if use_bias else ["X_fp32", "w_fp32"],
            ["Y_fp32"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            training_mode=0,
            device_option=dc[0],
        )
        workspace.FeedBlob('X_fp32', X, dc[0])
        workspace.FeedBlob('w_fp32', w, dc[0])
        workspace.FeedBlob('b_fp32', b, dc[0])
        workspace.RunOperatorOnce(conv_fp32)
        Y = workspace.FetchBlob('Y_fp32')

        workspace.ResetWorkspace()

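        # Choose per-tensor scale/zero_point so the uint8 range covers the data:
        # non-negative tensors map [0, absmax] onto [0, 255] with zero_point 0;
        # signed tensors map [-absmax, absmax] onto [1, 255] around zero_point 128.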
        Y_absmax = np.array([np.absolute(Y).max()]).astype(np.float32)
        if Y.min() >= 0:
            Y_scale = Y_absmax / 0xFF
            Y_zero_point = 0
        else:
            Y_scale = Y_absmax / 0x7F
            Y_zero_point = 128

        X_absmax = np.array([np.absolute(X).max()]).astype(np.float32)
        if X.min() >= 0:
            X_scale = X_absmax / 0xFF
            X_zero_point = 0
        else:
            X_scale = X_absmax / 0x7F
            X_zero_point = 128

        w_absmax = np.array([
            np.absolute(w[i, ...]).max() for i in range(w.shape[0])
        ]).astype(np.float32)
        w_scale = w_absmax / 0x7F
        w_zero_point = 128
        w = np.transpose(w, (0, 2, 3, 1)).astype(np.float32)
        w_bytes = np.rint([w[i, ...] / w_scale[i] for i in range(w.shape[0])
                           ]).astype(np.int8) + w_zero_point

        w_filler = core.CreateOperator(
            "Int8GivenTensorFill",
            [],
            ["w"],
            shape=w.shape,
            values=w_bytes.astype(np.uint8).tobytes(),
            Y_zero_point=w_zero_point,
            Y_scales=w_scale,
            device_option=dc[1],
        )

        b_scale = w_scale * X_scale
        b_zero_point = 0
        b_bytes = np.rint([b[i] / b_scale[i]
                           for i in range(b.shape[0])]).astype(np.int32)
        b_filler = core.CreateOperator(
            "Int8GivenIntTensorFill",
            [],
            ["b"],
            shape=b.shape,
            values=b_bytes,
            Y_zero_point=b_zero_point,
            Y_scales=b_scale,
            device_option=dc[1],
        )

        sw2nhwc = core.CreateOperator("NCHW2NHWC", ["X"], ["X_nhwc"],
                                      device_option=dc[1])

        quantize_X = core.CreateOperator(
            "Int8Quantize",
            ["X_nhwc"],
            ["X_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=X_zero_point,
            Y_scale=X_scale[0],
        )

        conv = core.CreateOperator(
            "Int8Conv",
            ["X_quantized", "w", "b"] if use_bias else ["X_quantized", "w"],
            ["Y_quantized"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=Y_zero_point,
            Y_scale=Y_scale[0],
        )

        dequantize_Y = core.CreateOperator(
            "Int8Dequantize",
            ["Y_quantized"],
            ["Y_nhwc"],
            engine="DNNLOWP",
            device_option=dc[1],
        )

        sw2nchw = core.CreateOperator("NHWC2NCHW", ["Y_nhwc"], ["Y_out"],
                                      device_option=dc[1])

        net = caffe2_pb2.NetDef()
        net.op.extend([
            w_filler, b_filler, sw2nhwc, quantize_X, conv, dequantize_Y,
            sw2nchw
        ])

        workspace.FeedBlob("X", X, dc[1])
        workspace.RunNetOnce(net)
        Y_out = workspace.FetchBlob("Y_out")

        MSE = np.square(np.subtract(Y, Y_out)).mean()
        if MSE > 0.005:
            print(Y.flatten())
            print(Y_out.flatten())
            print(np.max(np.abs(Y_out - Y)))
            print("MSE", MSE)
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #2
    def Skip_test_SLS_NonQuantized_fp16(self):
        N = 20000
        DIM = 64
        D = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32)
        I = (np.random.randint(0, N, size=12)).astype(np.int64)
        L = np.asarray([4, 4, 4]).astype(np.int32)
        workspace.FeedBlob("D", D)

        ref_c2_net = core.Net("test_ref_c2")
        ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out")
        ref_c2_net.Proto().external_input.extend(["D", "I", "L"])
        ref_c2_net.Proto().external_output.extend(["ref_out"])

        fp16_c2_net = core.Net("test_fp16_c2")
        fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"],
                                                    "fp16_out")

        input_dict = {}

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["D", "I", "L"])
        pred_net.external_output.append("glow_out")
        pred_net.op.add().CopyFrom(
            core.CreateOperator("SparseLengthsSum", ["D", "I", "L"],
                                ["glow_out"]))

        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            input_dict,
            max_batch_size=3,
            max_seq_size=16,
            debug=True,
            adjust_batch=False,
            use_onnx=False,
        )

        num_onnxified_ops = sum(1 if op.type == "Onnxifi" else 0
                                for op in onnxified_net.op)
        print(onnxified_net)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("I", I)
        workspace.FeedBlob("L", L)

        workspace.RunNetOnce(ref_c2_net)
        ref_c2_out = workspace.FetchBlob("ref_out")

        workspace.RunNetOnce(fp16_c2_net)
        fp16_c2_out = workspace.FetchBlob("fp16_out")

        np.testing.assert_allclose(fp16_c2_out,
                                   ref_c2_out,
                                   atol=1e-3,
                                   rtol=1e-3)

        workspace.RunNetOnce(onnxified_net)
        fp16_glow_out = workspace.FetchBlob("glow_out")

        if not np.allclose(fp16_glow_out, fp16_c2_out):
            diff = np.abs(fp16_glow_out - fp16_c2_out)
            print_test_debug_info(
                "sls",
                {
                    "indices": I,
                    "data": D,
                    "lengths": L,
                    "Y_c2": fp16_c2_out,
                    "Y_glow": fp16_glow_out,
                    "diff": diff,
                    "rowwise_diff": diff[:, 0],
                },
            )
            assert 0
Example #3
def share_grad_blobs(
    net,
    losses,
    param_grads,
    namescope,
    dont_share_blobs=None,
    share_activations=False,
    blob_shapes=None,
):
    '''
    Implements similar optimization as Torch's shareGradInput():
    for the gradients that are passed between layers, share blobs between
    operators when possible. This yields significant memory savings with
    deep networks.

    Returns an optimized protobuf (assign to net._net)
    '''
    def is_grad_blob(b):
        name = str(b)
        # Note: also match the "_{namescope}" prefix to handle
        # auto-split gradients
        return name.endswith("_grad") and (
            name.startswith(namescope)
            or name.startswith("_" + namescope)) and name not in param_grads

    def is_grad_op(op):
        # TODO: something smarter
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b):
                return True
        return False

    log.warn("NOTE: Executing memonger to optimize gradient memory")

    # Collect ops that have something to do with gradients
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    activations = []
    external_output = set(net.Proto().external_output)

    # Hacky way to get activations, think of a better way
    for op in net.Proto().op:
        for b in op.output:
            if b + "_w" in op.input and b not in external_output:
                activations.append(b)

    # Remove last activations, as they are usually accessed externally
    activations = set(activations[:-2])

    # Gradient ops
    grad_op_indices = []
    for idx, op in enumerate(netproto.op):
        if is_grad_op(op):
            grad_op_indices.append(idx)

    shared_blobs = set()
    for op in net.Proto().op:
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b) or (share_activations and b in activations):
                shared_blobs.add(b)
    start_time = time.time()
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(), [str(s).encode('utf-8') for s in losses],
        grad_op_indices, set(str(s).encode('utf-8') for s in shared_blobs),
        namescope.encode('utf-8'),
        set() if dont_share_blobs is None else dont_share_blobs,
        {} if blob_shapes is None else blob_shapes)

    log.info("Memonger memory optimization took {} secs".format(
        time.time() - start_time))

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)
    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim
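A minimal usage sketch (hypothetical context: `train_model` is a ModelHelper whose backward pass was built over a `loss` blob; these names are assumptions, not part of the snippet above):

    grad_map = train_model.AddGradientOperators([loss])
    param_grads = set(str(grad_map[str(p)]) for p in train_model.params)
    optimized_proto = share_grad_blobs(
        train_model.net,
        [loss],
        param_grads,
        namescope="gpu_0",
    )
    train_model.net._net = optimized_proto  # per the docstring, assign to net._net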
Example #4
def gen_init_net_from_blobs(blobs):
    ''' Generate an initialization net based on a blob dict '''
    ret = caffe2_pb2.NetDef()
    for name, blob in blobs.items():
        add_tensor(ret, name, blob)
    return ret
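A sketch of one way to use this helper (assumed context: the current workspace holds the trained parameters, and add_tensor comes from the same module as the function above):

    blobs = {name: workspace.FetchBlob(name) for name in workspace.Blobs()}
    init_net = gen_init_net_from_blobs(blobs)
    init_net.name = "init"
    with open("init_net.pb", "wb") as f:
        f.write(init_net.SerializeToString())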
Example #5
    def test_slws_fused_8bit_rowwise_all_same(self, seed):
        # Comment out for predictable debugging
        np.random.seed(seed)
        workspace.ResetWorkspace()
        n = 1
        m = 2
        data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1

        max_segments = 5
        max_segment_length = 200
        num_lengths = np.random.randint(1, max_segments + 1)
        # number of segments to run
        lengths = np.random.randint(0,
                                    max_segment_length + 1,
                                    size=num_lengths).astype(np.int32)
        num_indices = np.sum(lengths)
        indices = np.zeros(num_indices, dtype=np.int64)
        weights = np.random.uniform(low=-0.5, high=0.5,
                                    size=[len(indices)]).astype(np.float32)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                                ["quantized_data"]))
        pred_net_onnxified = onnxifi_caffe2_net(
            pred_net,
            {},
            max_batch_size=max_segments,
            max_seq_size=max_segment_length,
            debug=True,
            adjust_batch=True,
            use_onnx=False,
        )

        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(ref_net)

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        workspace.RunNet(ref_net.name)
        Y_c2 = workspace.FetchBlob("Y")

        if not np.allclose(Y_c2, Y_glow):
            print_test_debug_info(
                "slws_fused_8bit_rowwise",
                {
                    "seed": seed,
                    "indices": indices,
                    "data": data,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_c2": Y_c2,
                    "Y_glow": Y_glow,
                    "diff": Y_glow - Y_c2,
                    "rowwise_diff": (Y_glow - Y_c2)[:, 0],
                },
            )
            assert 0
Example #6
    def test_convolution_sum_fusion(self, stride, pad, kernel, size,
                                    input_channels, output_channels,
                                    batch_size, use_bias, group, sum_add, gc,
                                    dc):
        pool_S0 = core.CreateOperator("MaxPool", ["SX0"], ["S0"],
                                      stride=2,
                                      pad=0,
                                      kernel=2,
                                      device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        sum = core.CreateOperator(sum_add, ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])

        # Manual fusion for Conv + Sum
        pool_S1 = core.CreateOperator("MaxPool", ["SX1"], ["S1"],
                                      stride=2,
                                      pad=0,
                                      kernel=2,
                                      group=group,
                                      device_option=dc[1])
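        # fusion_type=2 requests Conv + Sum fusion here; Example #9 below uses
        # fusion_type=3 for Conv + Sum + ReLU.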
        conv_fusion = core.CreateOperator(
            "ConvFusion",
            ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
            ["S1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=2,
            device_option=dc[1])
        pool_input_size = int(
            math.ceil(float(size + 2 * pad - kernel + 1) / stride)) * 2
        SX = np.random.rand(batch_size, output_channels * group,
                            pool_input_size, pool_input_size).astype(
                                np.float32) - 0.5
        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('SX0', SX, dc[0])
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(pool_S0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(sum)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX1', SX, dc[1])
        workspace.FeedBlob('X1', X, dc[1])
        workspace.FeedBlob('w1', w, dc[1])
        workspace.FeedBlob('b1', b, dc[1])
        workspace.RunOperatorOnce(pool_S1)
        workspace.RunOperatorOnce(conv_fusion)
        S1 = workspace.FetchBlob('S1')

        if not np.allclose(S0, S1, atol=0.01, rtol=0.01):
            print(S1.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S1 - S0)))
            self.assertTrue(False)

        # Auto fusion for Conv + Sum
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        pool_S0_old = caffe2_pb2.OperatorDef()
        pool_S0_old.CopyFrom(pool_S0)
        pool_S0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([pool_S0_old, conv_old, sum_old])

        # Conv + Sum should be fused case: [PreNode, Conv, Sum]
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 2)
        self.assertTrue(net.Proto().op[1].type == "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should be fused case: [Conv, PreNode, Sum]
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        old_net.op.extend([conv_old, pool_S0_old, sum_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 2)
        self.assertTrue(net.Proto().op[1].type == "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum] Conv output is used by midOp
        dropout = core.CreateOperator("Dropout", ["Y0"], ["Y_dropout"],
                                      ratio=0.5,
                                      is_test=True,
                                      device_option=dc[1])

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([conv_old, dropout, pool_S0_old, sum_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 4)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should not be fused case: [Conv, preNode, Sum, midOp] preNode output is used by midOp
        sum1 = core.CreateOperator(sum_add, ["S0", "Y0"], ["S3"],
                                   device_option=dc[1])
        dropout = core.CreateOperator("Dropout", ["S0"], ["Y_dropout"],
                                      ratio=0.5,
                                      is_test=True,
                                      device_option=dc[1])

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([conv_old, pool_S0_old, sum1, dropout])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        print("net={}\n".format(net.Proto()))
        self.assertTrue(len(net.Proto().op) == 4)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-2].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum]
        # midOp output has the same name with that of the Conv input
        relu_0 = core.CreateOperator("Relu", ["X0"], ["X1"],
                                     device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X1", "w0", "b0"] if use_bias else ["X1", "w0"], ["Y0"],
            stride=1,
            pad=0,
            kernel=1,
            device_option=dc[0])
        relu_1 = core.CreateOperator("Relu", ["X1"], ["X1"],
                                     device_option=dc[0])
        pool = core.CreateOperator("MaxPool", ["X1"], ["S0"],
                                   stride=1,
                                   pad=0,
                                   kernel=1,
                                   device_option=dc[0])
        sum = core.CreateOperator("Sum", ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])

        X = np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) - 0.5
        w = np.random.rand(input_channels, input_channels, 1, 1).astype(
            np.float32) - 0.5
        b = np.random.rand(input_channels).astype(np.float32) - 0.5

        workspace.SwitchWorkspace(old_ws_name)
        workspace.ResetWorkspace()
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(relu_0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(relu_1)
        workspace.RunOperatorOnce(pool)
        workspace.RunOperatorOnce(sum)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        relu_0_old = caffe2_pb2.OperatorDef()
        relu_0_old.CopyFrom(relu_0)
        relu_0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        relu_1_old = caffe2_pb2.OperatorDef()
        relu_1_old.CopyFrom(relu_1)
        relu_1_old.device_option.CopyFrom(dc[1])
        pool_old = caffe2_pb2.OperatorDef()
        pool_old.CopyFrom(pool)
        pool_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])

        old_net = caffe2_pb2.NetDef()
        old_net.op.extend(
            [relu_0_old, conv_old, relu_1_old, pool_old, sum_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 5)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)
Example #7
    def test_convolution_grouped_sum_relu_fusion(self, stride, pad, kernel,
                                                 size, input_channels,
                                                 output_channels, batch_size,
                                                 use_bias, group, gc, dc):
        conv_S0 = core.CreateOperator(
            "Conv", ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"],
            ["S0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        sum = core.CreateOperator("Sum", ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])
        relu = core.CreateOperator("Relu", ["S0"], ["S0"], device_option=dc[0])

        SX = np.random.rand(batch_size, input_channels * group, size,
                            size).astype(np.float32) - 0.5
        Sw = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('SX0', SX, dc[0])
        workspace.FeedBlob('Sw0', Sw, dc[0])
        workspace.FeedBlob('Sb0', Sb, dc[0])
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(conv_S0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(sum)
        workspace.RunOperatorOnce(relu)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        conv_S0_old = caffe2_pb2.OperatorDef()
        conv_S0_old.CopyFrom(conv_S0)
        conv_S0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])
        relu_old = caffe2_pb2.OperatorDef()
        relu_old.CopyFrom(relu)
        relu_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([conv_S0_old, conv_old, sum_old, relu_old])
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('Sw0', Sw, dc[1])
        workspace.FeedBlob('Sb0', Sb, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #8
    def test_in_place(self, stride, pad, kernel, size,
                             input_channels, output_channels,
                             batch_size, use_bias, gc, dc):
        # To expose a potential in-place issue in the fallback path, the
        # fallback op following the ideep op must run for at least two
        # iterations.
        conv = core.CreateOperator(
            "Conv",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            device_option=dc[0]
        )
        X = np.random.rand(
            batch_size, input_channels, size, size).astype(np.float32) - 0.5
        w = np.random.rand(output_channels, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('X', X, dc[0])
        workspace.FeedBlob('w', w, dc[0])
        workspace.FeedBlob('b', b, dc[0])
        workspace.RunOperatorOnce(conv)
        Y = workspace.FetchBlob('Y')

        scale = np.random.randn(Y.shape[1]).astype(np.float32)
        bias = np.random.randn(Y.shape[1]).astype(np.float32)
        ac = core.CreateOperator(
            "AffineChannel",
            ["Y", "scale", "bias"],
            ["Y"],
            is_learnable=False,
            device_option=dc[0]
        )
        workspace.FeedBlob('scale', scale, dc[0])
        workspace.FeedBlob('bias', bias, dc[0])
        workspace.RunOperatorOnce(ac)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(ac)
        Y0 = workspace.FetchBlob('Y')

        workspace.ResetWorkspace()
        dev_net = caffe2_pb2.NetDef()
        conv_dev = caffe2_pb2.OperatorDef()
        conv_dev.CopyFrom(conv)
        conv_dev.device_option.CopyFrom(dc[1])
        ac_dev = caffe2_pb2.OperatorDef()
        ac_dev.CopyFrom(ac)
        ac_dev.device_option.CopyFrom(dc[1])
        dev_net.op.extend([conv_dev, ac_dev])
        workspace.FeedBlob('X', X, dc[1])
        workspace.FeedBlob('w', w, dc[1])
        workspace.FeedBlob('b', b, dc[1])
        workspace.FeedBlob('scale', scale, dc[1])
        workspace.FeedBlob('bias', bias, dc[1])
        workspace.RunNetOnce(dev_net)
        workspace.RunNetOnce(dev_net)
        Y1 = workspace.FetchBlob('Y')

        if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
            print(Y1.flatten())
            print(Y0.flatten())
            print(np.max(np.abs(Y1 - Y0)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #9
    def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size,
                                         input_channels, output_channels,
                                         batch_size, use_bias, group, sum_add,
                                         gc, dc):
        conv_S0 = core.CreateOperator(
            "Conv", ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"],
            ["S0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        sum = core.CreateOperator(sum_add, ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])
        relu = core.CreateOperator("Relu", ["S0"], ["S0"], device_option=dc[0])

        # Manual fusion for Conv + Sum + ReLU
        conv_S1 = core.CreateOperator(
            "Conv", ["SX1", "Sw1", "Sb1"] if use_bias else ["SX1", "Sw1"],
            ["S1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[1])
        conv_fusion = core.CreateOperator(
            "ConvFusion",
            ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
            ["S1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=3,
            device_option=dc[1])
        SX = np.random.rand(batch_size, input_channels * group, size,
                            size).astype(np.float32) - 0.5
        Sw = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('SX0', SX, dc[0])
        workspace.FeedBlob('Sw0', Sw, dc[0])
        workspace.FeedBlob('Sb0', Sb, dc[0])
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(conv_S0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(sum)
        workspace.RunOperatorOnce(relu)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX1', SX, dc[1])
        workspace.FeedBlob('Sw1', Sw, dc[1])
        workspace.FeedBlob('Sb1', Sb, dc[1])
        workspace.FeedBlob('X1', X, dc[1])
        workspace.FeedBlob('w1', w, dc[1])
        workspace.FeedBlob('b1', b, dc[1])
        workspace.RunOperatorOnce(conv_S1)
        workspace.RunOperatorOnce(conv_fusion)
        S1 = workspace.FetchBlob('S1')

        if not np.allclose(S0, S1, atol=0.01, rtol=0.01):
            print(S1.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S1 - S0)))
            self.assertTrue(False)

        # Auto fusion for Conv + Sum + ReLU
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        conv_S0_old = caffe2_pb2.OperatorDef()
        conv_S0_old.CopyFrom(conv_S0)
        conv_S0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])
        relu_old = caffe2_pb2.OperatorDef()
        relu_old.CopyFrom(relu)
        relu_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([conv_S0_old, conv_old, sum_old, relu_old])
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('Sw0', Sw, dc[1])
        workspace.FeedBlob('Sb0', Sb, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 2)
        self.assertTrue(net.Proto().op[1].type == "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob('S0')
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #10
def make(model='zoo_resnet50',
         create_dist_net=False,
         standardize=False,
         normalize=False,
         img_size=224,
         **kw):
    input_blob = 'data'
    remove_layers_type = ['FC', 'Softmax']  # By type.
    remove_layers_with = []  # By name.

    assert model == 'zoo_resnet50'  # TODO: add the others; if-statements to set other params.

    #model = 'zoo_densenet121'
    #output_blob = 'pool5'
    #global_pool = False

    #model = 'zoo_inception2'
    #output_blob = 'pool5/7x7_s1'
    #global_pool = False

    model = 'zoo_resnet50'
    output_blob = 'res4_5_branch2c_bn'
    #output_blob2 = 'res5_2_branch2c_bn'
    pooling = 'global'
    #pooling = 6
    #remove_layers_with = ['res5', 'pool5']

    DIR = '/home/slee/stuff/terrapixel/cpp_localization/'
    model = DIR + 'saves/raw/' + model

    dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
    #dev = core.DeviceOption(caffe2_pb2.CPU,0)
    with core.DeviceScope(dev):
        path = '%s/predict_net.pb' % model
        pred_net = caffe2_pb2.NetDef()
        with open(path, 'rb') as f:
            pred_net.ParseFromString(f.read())
        pred_net.device_option.CopyFrom(dev)  # GPU
        print('pred do', pred_net.device_option)

        path = '%s/init_net.pb' % model
        init_net = caffe2_pb2.NetDef()
        with open(path, 'rb') as f:
            init_net.ParseFromString(f.read())
        init_net.device_option.CopyFrom(dev)  # GPU

        # Do 3 things:
        #   1) Set device on all ops to gpu
        #   2) if blobs have 'gpu_0' prefix, remove it.
        #   3) Remove FC layers, softmaxes, etc.
        for net in (pred_net, init_net):
            remove_ops = []
            for j, op in enumerate(net.op):
                if op.type in remove_layers_type or any(
                        any(n in out for out in op.output)
                        for n in remove_layers_with):
                    remove_ops.append(j)
                op.device_option.CopyFrom(dev)
                for i, f in enumerate(op.input):
                    op.input[i] = f.replace('gpu_0/', '')
                for i, f in enumerate(op.output):
                    op.output[i] = f.replace('gpu_0/', '')
            for i, f in enumerate(net.external_input):
                net.external_input[i] = f.replace('gpu_0/', '')
            for i, f in enumerate(net.external_output):
                net.external_output[i] = f.replace('gpu_0/', '')

            dbg_removed = []
            for rop in remove_ops[::-1]:
                for out in net.op[rop].output:
                    for j, exout in enumerate(net.external_output):
                        if exout == out:
                            net.external_output.pop(j)
                            break
                # Record the removed op's outputs before popping it.
                dbg_removed.extend(net.op[rop].output)
                net.op.pop(rop)
            print('Removed Ops:', ' '.join(dbg_removed))

        pred_net = core.Net(pred_net)
        init_net = core.Net(init_net)

        # Flatten!
        '''
        if pooling == 'global':
            pred_net.AveragePool2D([output_blob], ['output_code_1'], global_pooling=True)
            pred_net.AveragePool2D([output_blob2], ['output_code_2'], global_pooling=True)
            pred_net.Concat(['output_code_1','output_code_2'], ['output_code_','whocares'], axis=1)
            #pred_net.MaxPool2D([output_blob], ['output_code_'], global_pooling=True)
            pred_net.Flatten(['output_code_'], ['output_code'])
        else:
            pred_net.AveragePool2D([output_blob], ['output_code_'], kernel=pooling, stride=pooling)
            pred_net.Flatten(['output_code_'], ['output_code'])
            #pred_net.Copy([output_blob], ['output_code'])
        output_blob = 'output_code'
        '''

        pred_net._net.external_output.append(output_blob)

        mh = model_helper.ModelHelper()
        mh.net = pred_net
        mh.param_init_net = init_net
        mh.RunAllOnGPU()

    #pdb.set_trace()

    #print('model proto', mh.net.Proto())

    # Determine code_size. Reset workspace afterwards.
    SZ = img_size
    img_size = (SZ, SZ)
    fake_x = np.random.randn(*[1, 3, *img_size]).astype(np.float32)
    #print('running param init.')
    workspace.RunNetOnce(init_net)
    #print('running net.')
    workspace.FeedBlob(input_blob, fake_x, device_option=dev)
    workspace.CreateNet(pred_net)
    #workspace.RunNet(pred_net)
    #workspace.RunNetOnce(pred_net)
    #print('fetching.')
    #fake_code = workspace.FetchBlob(output_blob)
    #workspace.ResetWorkspace()
    #workspace.FeedBlob(input_blob, fake_x, device_option=dev)

    #print(fake_code.shape)
    #assert((len(fake_code.shape) == 2 or all(fake_code.shape[2:]==1)) and 'code_size must be 1 dim')
    #code_size = fake_code.shape[1]

    return mh
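A sketch of calling the factory above (hypothetical input; assumes a CUDA device, matching the DeviceOption hard-coded in make):

    mh = make('zoo_resnet50', img_size=224)
    dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
    x = np.random.randn(1, 3, 224, 224).astype(np.float32)
    workspace.FeedBlob('data', x, device_option=dev)
    workspace.RunNet(mh.net)
    features = workspace.FetchBlob('res4_5_branch2c_bn')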
Example #11
# Convert HWC -> CHW
img = img.swapaxes(1, 2).swapaxes(0, 1)

# Convert CHW -> NCHW
img = np.array([img])

# Im info N x 3 tensor of (height, width, scale)
im_info = np.reshape(
    np.array([float(img.shape[2]),
              float(img.shape[3]), 1.0]), (1, -1))
im_info = im_info.astype('float32')

device_opts = core.DeviceOption(caffe2_pb2.CPU)

# Read the contents of the input protobufs into local variables
init_net = caffe2_pb2.NetDef()
with open(INIT_NET, 'rb') as f:
    init_net.ParseFromString(f.read())
    init_net.device_option.CopyFrom(device_opts)

predict_net = caffe2_pb2.NetDef()
with open(PREDICT_NET, "rb") as f:
    predict_net.ParseFromString(f.read())
    predict_net.device_option.CopyFrom(device_opts)

# Initialise the predictor from the input protobufs
p = workspace.Predictor(init_net, predict_net)

# Run the net and return predictions
results = p.run({'data': img, "im_info": im_info})
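Predictor.run returns the predict net's outputs as a list of numpy arrays; which outputs come back, and their shapes, depend on the PREDICT_NET being loaded:

    for i, out in enumerate(results):
        print("output", i, out.shape)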
Example #12
    def test_slws_fused_8bit_rowwise_acc32_nnpi(self, seed):
        workspace.GlobalInit(
            [
                "caffe2",
                "--glow_global_fp16=0",
                "--glow_global_fused_scale_offset_fp16=0",
                "--glow_global_force_sls_fp16_accum=0",
            ]
        )
        np.random.seed(seed)
        workspace.ResetWorkspace()

        n = 20000
        DIM = 6
        data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32)

        max_segments = 200
        max_segment_length = 200
        num_lengths = np.random.randint(0, max_segments + 1)
        # number of segments to run
        lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype(
            np.int32
        )
        num_indices = np.sum(lengths)
        indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64)
        weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype(
            np.float32
        )

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"]
        )
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            )
        )

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"]
        )
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            )
        )

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator(
                "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
            )
        )
        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            {},
            max_batch_size=max_segments,
            max_seq_size=max_segments * max_segment_length,
            debug=True,
            adjust_batch=True,
            use_onnx=False,
        )
        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(onnxified_net)
        workspace.CreateNet(ref_net)

        workspace.RunNet(onnxified_net.name)
        Y_glow = workspace.FetchBlob("Y")

        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchBlob("Y")

        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
        max_err = np.max(diff, axis=1)
        num_offenders = (max_err > 0).sum()
        if num_offenders > 0:
            print_test_debug_info(
                "test_slws_fused_8bit_rowwise_acc32_nnpi",
                {
                    "indices": indices,
                    "data": data.shape,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_glow": Y_glow,
                    "Y_ref": Y_ref,
                    "diff": diff,
                    "rowwise_diff": np.max(diff, axis=1),
                },
            )
            assert 0
Example #13
    def Skip_test_tanhquantize(self, scale, zp, size, rand_seed):
        np.random.seed(rand_seed)

        workspace.ResetWorkspace()

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "ref"
        pred_net.external_input.append("X")
        pred_net.external_output.append("Y_q")

        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "Tanh", ["X"], ["Y"]
            )
        )

        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
            )
        )

        X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)

        pred_net_onnxified = onnxifi_caffe2_net(
            pred_net,
            {"X": X.shape},
            debug=True,
            adjust_batch=False,
            use_onnx=False,
        )
        num_onnxified_ops = sum(
            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
        )
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.FeedBlob("X", X)
        workspace.CreateNet(pred_net_onnxified)
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchInt8Blob("Y_q")

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.append("X")
        ref_net.external_output.append("Y_q")

        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "TanhQuantFakeFp16NNPI", ["X"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
            )
        )

        workspace.CreateNet(ref_net)
        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchInt8Blob("Y_q")

        if not np.array_equal(Y_ref.data, Y_glow.data) or \
           not Y_ref.scale == Y_glow.scale or \
           not Y_ref.zero_point == Y_glow.zero_point:
            print_test_debug_info(
                "tanhfusion",
                {
                    "scale": scale,
                    "zp": zp,
                    "input": X,
                    "ideal nonquant": np.tanh(X),
                    "Y_glow": Y_glow,
                    "Y_c2": Y_ref,
                }
            )
            assert(0)
Example #14
def load_caffe2_net(file):
    net = caffe2_pb2.NetDef()
    with open(file, "rb") as f:
        net.ParseFromString(f.read())
    return net
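A short usage sketch (hypothetical file names), mirroring the Predictor setup in Example #11:

    init_net = load_caffe2_net("init_net.pb")
    predict_net = load_caffe2_net("predict_net.pb")
    p = workspace.Predictor(init_net, predict_net)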
Example #15
    def Skip_test_layernorm(self, seed, batch_size, size, epsilon,
                            elementwise_affine):
        np.random.seed(seed)
        # Reset the workspace
        workspace.ResetWorkspace()
        axis = 1

        dims = np.array(([batch_size, size]))
        X = np.random.uniform(size=dims).astype(np.float32) - 0.5
        gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
        beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", "gamma", "beta"])
        pred_net.external_output.extend(["Y", "mean", "rstd"])
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNorm",
                ["X", "gamma", "beta"] if elementwise_affine else ["X"],
                ["Y", "mean", "rstd"],
                axis=axis,
                epsilon=epsilon,
                elementwise_affine=elementwise_affine))

        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred_ref"
        pred_net_ref.external_input.extend(["X", "gamma", "beta"])
        pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNormFakeFP16NNPI",
                ["X", "gamma", "beta"] if elementwise_affine else ["X"],
                ["Y", "mean", "rstd"],
                axis=axis,
                epsilon=epsilon,
                elementwise_affine=elementwise_affine))

        shape_hints = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hints,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)
        workspace.FeedBlob("gamma", gamma)
        workspace.FeedBlob("beta", beta)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)

        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        dims1 = np.array(([1, *dims]))
        X_glow = X.reshape(dims1)
        workspace.FeedBlob("X", X_glow)

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if not np.allclose(Y_glow, Y_c2):
            diff_Y = np.abs(Y_glow - Y_c2)
            print_test_debug_info(
                "layernorm", {
                    "seed": seed,
                    "size": size,
                    "batch_size": batch_size,
                    "epsilon": epsilon,
                    "gamma": gamma,
                    "beta": beta,
                    "elementwise_affine": elementwise_affine,
                    "X": X,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff_Y": diff_Y,
                })
            assert (0)
Example #16
    def prepare(cls, model, device='CPU', **kwargs):
        '''
        For the ONNX Caffe2Backend, we require that init_graph does not
        initialize the actual input of the predict_graph.

        For example, if "img" is the input blob for the predict_net, we
        require that "img" is not initialized in init_graph or in the
        initializer of the predict_graph. We don't check for this, since
        there is no way to know which blob is the input of the predict_graph.
        '''
        super(Caffe2Backend, cls).prepare(model, device, **kwargs)
        opset_version = None
        for imp in model.opset_import:
            if not imp.HasField("domain") or imp.domain == "":
                opset_version = imp.version
                if imp.version > cls._known_opset_version:
                    warnings.warn("This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}.  We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail.".format(cls._known_opset_version, imp.version))
            else:
                warnings.warn("Unrecognized operator set {}".format(imp.domain))
        if opset_version is None:
            if model.ir_version >= 0x00000003:
                raise RuntimeError("Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)")
            else:
                opset_version = 1

        # Check whether we have RNN related ops
        pred_model = ModelProto()
        pred_model.ParseFromString(cls.optimize_onnx(model.SerializeToString(), predict=True))
        cls._inplace_rewrite(pred_model.graph)
        rnn_nodes = []
        for node in pred_model.graph.node:
            if node.op_type in {'LSTM', 'GRU', 'RNN'}:
                rnn_nodes.append(node)

        # Build the C++ backend
        # TODO: build a predictor that supports GPU
        #       And for RNN nets, we need to avoid adding init_net
        if device == 'CPU' and not rnn_nodes:
            c2_rnn_ops = []
            if rnn_nodes:
                init_model = ModelProto()
                init_model.ParseFromString(cls.optimize_onnx(model.SerializeToString(), init=True))
                cls._inplace_rewrite(init_model.graph)
                for node in rnn_nodes:
                    c2ops = cls._onnx_node_to_caffe2_op(
                        init_model, pred_model, node, opset_version)
                    init_ops = [x.SerializeToString() for x in c2ops.init_ops]
                    ops = [x.SerializeToString() for x in c2ops.ops]
                    external_inputs = c2ops.interface_blobs
                    c2_rnn_ops.append(C.Caffe2Ops(init_ops, ops, external_inputs))
                del init_model

            cbackend = C.Caffe2Backend()
            rep = cbackend.prepare(model.SerializeToString(), device, c2_rnn_ops)
            # For testing
            # Dump the net descriptions to file for comparison with the Python ones
            if "ONNX_CAFFE2_DEBUG" in os.environ:
                pred_net_str = rep.pred_net()
                pn = caffe2_pb2.NetDef()
                pn.ParseFromString(pred_net_str)
                init_net_str = rep.init_net()
                inn = caffe2_pb2.NetDef()
                inn.ParseFromString(init_net_str)
                with open("cpp.txt", "w") as f:
                    f.write("pred_net: \n{}".format(pn))

            rep_wrapper = Caffe2CppRep(rep)
            return rep_wrapper
        else:
            ws = Workspace()
            device_option = get_device_option(Device(device))

            # Directly load initializer data into blobs in workspace
            cls._direct_initialize_parameters(
                model.graph.initializer,
                ws,
                device_option,
            )

            initialized = {init.name for init in model.graph.initializer}

            cls._direct_initialize_inputs(
                model.graph.input,
                initialized,
                ws,
                device_option,
            )

            uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized]

            init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False)
            if "ONNX_CAFFE2_DEBUG" in os.environ:
                with open("python.txt", "w") as f:
                    f.write("pred_net: \n{}".format(predict_net))
            retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
            return retval
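A minimal sketch of driving this backend (assumes the onnx package is installed; "model.onnx" and the input shape are placeholders):

    import onnx
    model = onnx.load("model.onnx")
    rep = Caffe2Backend.prepare(model, device='CPU')
    outputs = rep.run((np.random.randn(1, 3, 224, 224).astype(np.float32),))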
Example #17
def ExtractPredictorNet(
    net_proto,
    input_blobs,
    output_blobs,
    device=None,
    renames=None,
    disabled_inputs=None,
):
    '''
    Takes a model net for training and returns a net which can be
    used for prediction. For example, all gradient operators and
    input operators are removed.
    @param net_proto protobuf of the net you want to process (net.Proto())
    @param input_blobs list/set of blob names that are the inputs of predictor
    @param output_blobs list/set of blob names that are outputs of predictor
    @param device optional device option that is assigned
    @param renames dictionary of blob name to a new name (optional)
    @param disabled_inputs optional set of blobs that are 'switched off'. This
                will cause branches with those blobs as inputs to be removed
    '''
    predict_net = core.Net(net_proto.name + "_predict")
    predict_proto = predict_net.Proto()

    orig_external_inputs = set(net_proto.external_input)
    orig_external_outputs = set(net_proto.external_output)
    input_blobs = {str(b) for b in input_blobs}
    known_blobs = set(orig_external_inputs).union(input_blobs)
    output_blobs = {str(b) for b in output_blobs}
    external_inputs = set(input_blobs)
    external_outputs = set(output_blobs)

    if renames is None:
        renames = {}

    if disabled_inputs is not None:
        known_blobs = known_blobs - set(disabled_inputs)

    ops = list(net_proto.op)

    # Find the range of ops that we should include
    try:
        first_op_with_input = min([
            j for j in range(len(ops))
            if input_blobs.intersection(ops[j].input)
            and ops[j].type != 'StopGradient'
        ])
    except ValueError:
        raise Exception("No ops with input={}".format(input_blobs))
    try:
        last_op_with_output = max([
            j for j in range(len(ops))
            if output_blobs.intersection(ops[j].output)
        ])
    except ValueError:
        raise Exception("No ops with output={}".format(output_blobs))

    def validate_op(op):
        # Check that the op does not have is_test=0 set. This is a common
        # pitfall with the SpatialBN op, at least.
        for arg in op.arg:
            if arg.name == "is_test" and arg.i == 0:
                raise Exception(
                    "An operator had is_test=0, did you try to extract a " +
                    "predictor from a train model (instead of test model)?" +
                    " Op was: {}".format(str(op)))

    # Iterate through the ops and only include those whose inputs
    # we can satisfy.
    for op in ops[first_op_with_input:(last_op_with_output + 1)]:
        if known_blobs.issuperset(op.input):

            # Special handling for recurrent nets
            # TODO: when standard argument type for "nets" is introduced,
            # this can be more general
            if op.type == 'RecurrentNetwork':
                import google.protobuf.text_format as protobuftx
                for arg in op.arg:
                    if arg.name == 'backward_step_net':
                        arg.s = b""
                    elif arg.name == 'step_net':
                        step_proto = caffe2_pb2.NetDef()
                        protobuftx.Merge(arg.s.decode("ascii"), step_proto)
                        for step_op in step_proto.op:
                            if device is not None:
                                step_op.device_option.device_type = device.device_type
                                step_op.device_option.cuda_gpu_id = device.cuda_gpu_id

                        # Add additional external inputs
                        external_inputs.update(
                            set(step_proto.external_input).intersection(
                                orig_external_inputs))
                        arg.s = str(step_proto).encode("ascii")

            if device is not None:
                op.device_option.device_type = device.device_type
                op.device_option.cuda_gpu_id = device.cuda_gpu_id
            validate_op(op)
            predict_proto.op.extend([op])
            known_blobs.update(op.output)
            external_inputs.update(
                set(op.input).intersection(orig_external_inputs))
            external_outputs.update(
                set(op.output).intersection(orig_external_outputs))

        else:
            logging.debug("Op {} had unknown inputs: {}".format(
                op.type,
                set(op.input).difference(known_blobs)))

    def rename_list(proto_list):
        # proto lists don't support assignments
        new_list = proto_list[:]
        for j, b in enumerate(new_list):
            if b in renames:
                new_list[j] = renames[b]

        del proto_list[:]
        proto_list.extend(new_list)

    # Predictor net's external inputs and outputs include only those
    # that are part of this net.
    predict_proto.external_input.extend(external_inputs)
    predict_proto.external_output.extend(external_outputs)

    rename_list(predict_proto.external_input)
    rename_list(predict_proto.external_output)

    renamed_input_blobs = []
    for b in input_blobs:
        if b in renames:
            renamed_input_blobs.append(renames[b])
        else:
            renamed_input_blobs.append(b)

    for op in predict_proto.op:
        rename_list(op.input)
        rename_list(op.output)

    return predict_net, list(
        set(predict_proto.external_input) - set(renamed_input_blobs))
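# Usage sketch for ExtractPredictorNet, with hypothetical blob names for a
# trained model whose forward pass maps "data" -> "softmax"; train_model is
# assumed to be an existing model helper.
predict_net, extra_inputs = ExtractPredictorNet(
    net_proto=train_model.net.Proto(),
    input_blobs=["data"],
    output_blobs=["softmax"],
    renames={"data": "input"},  # optional blob renaming
)
# extra_inputs lists the external inputs (e.g. trained weights) that must
# already be present in the workspace before running predict_net.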
Example #18
    def test_batch_matmul(self, M, K, N, C, rand_seed, trans_a, trans_b,
                          run_ints):
        np.random.seed(rand_seed)
        workspace.ResetWorkspace()

        batch_dims = [C]

        if run_ints:
            X = np.random.randint(low=1, high=3,
                                  size=(C, M, K)).astype(np.float32)
        else:
            X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(
                np.float32) - 0.5)
        if trans_a:
            X = X.swapaxes(-1, -2)

        if run_ints:
            Y = np.random.randint(low=1, high=3,
                                  size=(C, K, N)).astype(np.float32)
        else:
            Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(
                np.float32) - 0.5)
        if trans_b:
            Y = Y.swapaxes(-1, -2)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", "Y"])
        pred_net.external_output.append("out")
        pred_net.op.add().CopyFrom(
            core.CreateOperator('BatchMatMul', ['X', 'Y'],
                                'out',
                                trans_a=trans_a,
                                trans_b=trans_b))

        pred_net_ref = core.Net("pred_net_ref")

        # Reference updated to fp16 with fp32 accumulation
        pred_net_ref.BatchMatMulFP16Acc32Fake(["X", "Y"], ['out'],
                                              trans_a=trans_a,
                                              trans_b=trans_b)

        print("dims", batch_dims, X.shape, Y.shape)
        pred_net_onnxified = onnxifi_caffe2_net(pred_net, {
            "X": X.shape,
            "Y": Y.shape
        },
                                                debug=True,
                                                adjust_batch=False,
                                                use_onnx=False)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(pred_net_ref)

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        out_glow = workspace.FetchBlob('out')

        # Run caffe2 net
        workspace.RunNet(pred_net_ref)
        out_c2_fakefp16 = workspace.FetchBlob('out')

        diff = np.abs(out_c2_fakefp16 - out_glow)

        if not np.allclose(out_glow, out_c2_fakefp16):
            print_test_debug_info(
                "bmm", {
                    "seed": rand_seed,
                    "m": M,
                    "k": K,
                    "n": N,
                    "X": X.shape,
                    "Y": Y.shape,
                    "trans_a": trans_a,
                    "trans_b": trans_b,
                    "run_ints": run_ints,
                    "out_glow": out_glow,
                    "out_c2_fakefp16": out_c2_fakefp16,
                    "diff": diff
                })
            assert (0)
Example #19
    def test_convolution_relu_fusion(self, stride, pad, kernel, size,
                                     input_channels, output_channels,
                                     batch_size, use_bias, group, gc, dc):
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        relu = core.CreateOperator("Relu", ["Y0"], ["Y0"], device_option=dc[0])

        # Manual fusion for Conv + ReLU
        conv_fusion = core.CreateOperator(
            "ConvFusion", ["X1", "w1", "b1"] if use_bias else ["X1", "w1"],
            ["Y1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=1,
            device_option=dc[1])

        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(relu)
        Y0 = workspace.FetchBlob('Y0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('X1', X, dc[1])
        workspace.FeedBlob('w1', w, dc[1])
        workspace.FeedBlob('b1', b, dc[1])
        workspace.RunOperatorOnce(conv_fusion)
        Y1 = workspace.FetchBlob('Y1')
        if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
            print(Y1.flatten())
            print(Y0.flatten())
            print(np.max(np.abs(Y1 - Y0)))
            self.assertTrue(False)

        # Auto fusion for Conv + ReLU
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        relu_old = caffe2_pb2.OperatorDef()
        relu_old.CopyFrom(relu)
        relu_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([conv_old, relu_old])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 1)
        self.assertTrue(net.Proto().op[0].type == "ConvFusion")
        workspace.RunOperatorOnce(net.Proto().op[0])
        Y2 = workspace.FetchBlob('Y0')
        if not np.allclose(Y0, Y2, atol=0.01, rtol=0.01):
            print(Y2.flatten())
            print(Y0.flatten())
            print(np.max(np.abs(Y2 - Y0)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #20
def read_init_net_pbtxt(init_net_file):
    with open(init_net_file, "r") as f:
        init_net_txt = f.read()
    init_net = caffe2_pb2.NetDef()
    text_format.Merge(init_net_txt, init_net)
    return init_net
Example #21
    def test_convolution_affch_folding(self, stride, pad, kernel, size,
                                       input_channels, output_channels,
                                       batch_size, use_bias, group, inplace,
                                       gc, dc):
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["X1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[1])
        affch = core.CreateOperator("AffineChannel", ["X1", "scale", "bias"],
                                    ["X1" if inplace else "Y"],
                                    device_option=dc[1])

        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5
        scale = np.random.rand(output_channels).astype(np.float32) + 0.5
        bias = np.random.rand(output_channels).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        workspace.FeedBlob('scale', scale, dc[1])
        workspace.FeedBlob('bias', bias, dc[1])
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(affch)
        Y = workspace.FetchBlob('X1' if inplace else "Y")

        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        affch_old = caffe2_pb2.OperatorDef()
        affch_old.CopyFrom(affch)
        affch_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([conv_old, affch_old])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        workspace.FeedBlob('scale', scale, dc[1])
        workspace.FeedBlob('bias', bias, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 1)
        self.assertTrue(net.Proto().op[0].type == "Conv")
        workspace.RunOperatorOnce(net.Proto().op[0])
        Y1 = workspace.FetchBlob('X1' if inplace else "Y")
        if not np.allclose(Y, Y1, atol=0.01, rtol=0.01):
            print(Y.flatten())
            print(Y1.flatten())
            print(np.max(np.abs(Y - Y1)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #22
def read_init_net(init_net_file):
    with open(init_net_file, "rb") as f:
        init_net_pb = f.read()
    init_net = caffe2_pb2.NetDef()
    init_net.ParseFromString(init_net_pb)
    return init_net
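# The two readers differ only in serialization format: text_format.Merge
# parses the human-readable pbtxt dump, while ParseFromString expects the
# binary wire format. Usage sketch with hypothetical file names:
init_net_bin = read_init_net("init_net.pb")
init_net_txt = read_init_net_pbtxt("init_net.pbtxt")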
Example #23
## This is a helper script that generates simple Caffe2 models.

from caffe2.proto import caffe2_pb2
from caffe2.python import utils

# Define a weights network
weights = caffe2_pb2.NetDef()
weights.name = "init"

op = caffe2_pb2.OperatorDef()
op.type = "fake_data_provider"
op.output.extend(["data"])
weights.op.extend([op])
weights.external_output.extend(op.output)

op = caffe2_pb2.OperatorDef()
op.type = "GivenTensorFill"
op.output.extend(["fc_w"])
op.arg.extend([utils.MakeArgument("shape", [1,4])])
op.arg.extend([utils.MakeArgument("values", [1.0 for i in range(4)])])
weights.op.extend([op])
weights.external_output.extend(op.output)

op = caffe2_pb2.OperatorDef()
op.type = "GivenTensorFill"
op.output.extend(["fc_b"])
op.arg.extend([utils.MakeArgument("shape", [1,4])])
op.arg.extend([utils.MakeArgument("values", [1.0 for i in range(4)])])
weights.op.extend([op])
weights.external_output.extend(op.output)
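# The script builds the NetDef in memory but stops short of saving it; a
# minimal sketch (the output file names are assumptions) for serializing it
# in both binary and text form:
from google.protobuf import text_format

with open("init_net.pb", "wb") as f:
    f.write(weights.SerializeToString())
with open("init_net.pbtxt", "w") as f:
    f.write(text_format.MessageToString(weights))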
Example #24
def read_predict_net(predict_net_file):
    with open(predict_net_file, "r") as f:
        predict_net_txt = f.read()
    predict_net = caffe2_pb2.NetDef()
    predict_net.name = "the_model"
    text_format.Merge(predict_net_txt, predict_net)
    return predict_net
Example #25
    def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim,
                                     batch_size, max_weight):
        np.random.seed(seed)
        workspace.ResetWorkspace()

        data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
        lengths = np.random.choice(np.arange(1, num_rows),
                                   batch_size).astype(np.int32)

        indices = []
        for length in lengths:
            indices.extend(np.random.choice(np.arange(1, num_rows), length))
        indices = np.asarray(indices).astype(np.int64)

        weights = np.random.uniform(low=0,
                                    high=max_weight,
                                    size=[len(indices)]).astype(np.float32)

        assert (len(weights) < 64000)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                                ["quantized_data"]))
        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            {},
            max_batch_size=batch_size,
            max_seq_size=np.max(lengths),
            debug=True,
            adjust_batch=True,
            use_onnx=False,
        )
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in onnxified_net.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(onnxified_net)
        workspace.CreateNet(ref_net)

        workspace.RunNet(onnxified_net.name)
        Y_glow = workspace.FetchBlob("Y")

        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchBlob("Y")

        # Per-element relative error; the small epsilon avoids division by zero
        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
        max_err = np.max(diff, axis=1)
        num_offenders = (max_err > 0).sum()
        if num_offenders > 0:
            print_test_debug_info(
                "slws_fused_8bit_rowwise_inv_scale",
                {
                    "seed": seed,
                    "num_rows": num_rows,
                    "embedding_dim": embedding_dim,
                    "batch_size": batch_size,
                    "max_weight": max_weight,
                    "indices": indices,
                    "data": data.shape,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_glow": Y_glow,
                    "Y_ref": Y_ref,
                    "diff": diff,
                    "rowwise_diff": np.max(diff, axis=1),
                },
            )
            assert 0
Example #26
    def test_onnx_while_fibb(self, condition, max_trip_count, save_scopes,
                             disable_scopes, seed, gc, dc):
        np.random.seed(seed)
        if disable_scopes:
            save_scopes = False

        # Create body net
        body_net = caffe2_pb2.NetDef()
        # Two loop carried dependencies: first and second
        body_net.external_input.extend(['i', 'cond', 'first', 'second'])
        body_net.external_output.extend(
            ['cond_new', 'second', 'third', 'third'])
        add_op = core.CreateOperator(
            'Add',
            ['first', 'second'],
            ['third'],
        )
        print3 = core.CreateOperator(
            'Print',
            ['third'],
            [],
        )
        limit_const = core.CreateOperator(
            'ConstantFill',
            [],
            ['limit_const'],
            shape=[1],
            dtype=caffe2_pb2.TensorProto.FLOAT,
            value=100.0,
        )
        cond = core.CreateOperator(
            'LT',
            ['third', 'limit_const'],
            ['cond_new'],
        )
        body_net.op.extend([add_op, print3, limit_const, cond])

        while_op = core.CreateOperator(
            'ONNXWhile',
            ['max_trip_count', 'condition', 'first_init', 'second_init'],
            ['first_a', 'second_a', 'third_a'],
            body=body_net,
            has_cond=True,
            has_trip_count=True,
            save_scopes=save_scopes,
            disable_scopes=disable_scopes,
        )

        condition_arr = np.array(condition).astype(bool)
        max_trip_count_arr = np.array(max_trip_count).astype(np.int64)
        first_init = np.array([1]).astype(np.float32)
        second_init = np.array([1]).astype(np.float32)

        def ref(max_trip_count, condition, first_init, second_init):
            first = 1
            second = 1
            results = []
            if condition:
                for _ in range(max_trip_count):
                    third = first + second
                    first = second
                    second = third
                    results.append(third)
                    if third > 100:
                        break
            return (first, second, np.array(results).astype(np.float32))

        self.assertReferenceChecks(
            gc,
            while_op,
            [max_trip_count_arr, condition_arr, first_init, second_init],
            ref,
        )
Example #27
    def test_small_sls(self, seed):
        np.random.seed(seed)
        workspace.ResetWorkspace()

        n = 2
        DIM = 3
        data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)

        lengths = np.array([n], dtype=np.int32)
        indices = np.array(range(n), dtype=np.int64)
        weights = np.random.uniform(low=0.01, high=0.5,
                                    size=[n]).astype(np.float32)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                                ["quantized_data"]))

        quantized_data = workspace.FetchBlob("quantized_data")

        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            {},
            max_batch_size=1,
            max_seq_size=n,
            debug=True,
            adjust_batch=True,
            use_onnx=False,
        )
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in onnxified_net.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(onnxified_net)
        workspace.CreateNet(ref_net)

        workspace.RunNet(onnxified_net.name)
        Y_glow = workspace.FetchBlob("Y")

        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchBlob("Y")

        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
        max_err = np.max(diff, axis=1)
        num_offenders = (max_err > 0).sum()
        if num_offenders > 0:
            np.set_printoptions(precision=12)
            print(
                "ref",
                Y_ref.astype(np.float16).astype(np.float32),
                "glow",
                Y_glow.astype(np.float16).astype(np.float32),
            )
            print_test_debug_info(
                "slws_fused_8bit_rowwise_inv_scale",
                {
                    "seed": seed,
                    "indices": indices,
                    "data": data,
                    "quantized_data": quantized_data,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_glow": Y_glow,
                    "Y_ref": Y_ref,
                    "diff": diff,
                    "rowwise_diff": np.max(diff, axis=1),
                },
            )
            assert 0
Example #28
# Add the data input layer to the model, pointing at the TEST_LMDB
data, _ = AddInputLayer(test_model, 1, TEST_LMDB, 'lmdb')

# ### Populate the Model Helper with Saved Model Params
#
# To format a model for testing, we do not need to create params in the model helper, nor do we need to add gradient operators as we will only be performing forward passes. All we really need to do is populate the *.net* and *.param_init_net* members of the model helper with the contents of the saved *predict_net.pb* and *init_net.pb*, respectively. To accomplish this, we construct *caffe2_pb* objects with the protobuf from the pb files, create *Net* objects with the *caffe2_pb* objects, then **append** the net objects to the *.net* and *.param_init_net* members of the model helper. Appending is very important here! If we do not append, we would wipe out the input data layer stuff that we just added.
#
# Recall from Part 1, the saved model expected an input named *data* and produced an output called *softmax*. Conveniently (but not accidentally), the *AddInputLayer* function reads from the lmdb and puts the information into the workspace in a blob called *data*. It is also important to remember what each of the saved nets that we are appending to our model contains. The *predict_net* contains the structure of the model, including the ops involved in the forward pass. It has the definitions of the convolutional, pooling, and fc layers in the model. The *init_net* contains the weight initializations for the parameters that the ops in the *predict_net* expect. For example, if there is an op in the *predict_net* named 'fc1', the *init_net* will contain the trained weights (*fc1_w*), and biases (*fc1_b*) for that layer.
#
# After we append the nets, we add an accuracy layer to the model which uses the *softmax* output from the saved model and the *label* input from the lmdb. Note, we could manually fetch the softmax blob from the workspace after every iteration and check whether or not the class with the highest softmax score is the true label, but instead we opt for the simpler accuracy layer.

# In[5]:

# Populate the model helper obj with the init net stuff, which provides the
#    weight initializations for the model
init_net_proto = caffe2_pb2.NetDef()
with open(INIT_NET, "r") as f:
    init_net_proto.ParseFromString(f.read())
test_model.param_init_net = test_model.param_init_net.AppendNet(
    core.Net(init_net_proto))

# Populate the model helper obj with the predict net stuff, which defines
#    the structure of the model
predict_net_proto = caffe2_pb2.NetDef()
with open(PREDICT_NET, "r") as f:
    predict_net_proto.ParseFromString(f.read())
test_model.net = test_model.net.AppendNet(core.Net(predict_net_proto))

# Add an accuracy feature to the model for convenient reporting during testing
accuracy = brew.accuracy(test_model, ['softmax', 'label'], 'accuracy')
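# With the nets appended and the accuracy op in place, a test pass is just
# running the net repeatedly and averaging the accuracy blob. A minimal
# sketch (the iteration count is arbitrary; numpy is assumed imported as np):
workspace.RunNetOnce(test_model.param_init_net)
workspace.CreateNet(test_model.net, overwrite=True)

accuracies = []
for _ in range(100):
    workspace.RunNet(test_model.net)
    accuracies.append(workspace.FetchBlob('accuracy'))
print("Average accuracy: {:.4f}".format(np.mean(accuracies)))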
Example #29
    def TranslateModel(
        cls,
        caffe_net,
        pretrained_net,
        is_test=False,
        input_mean=None,
        net_state=None,
    ):
        net_state = caffe_pb2.NetState() if net_state is None else net_state
        net = caffe2_pb2.NetDef()
        net.name = caffe_net.name
        net_params = caffe2_pb2.TensorProtos()
        if len(caffe_net.layer) == 0:
            raise ValueError(
                'I think something is wrong. This translation script '
                'only accepts new style layers that are stored in the '
                'layer field.')
        if input_mean:
            caffenet_mean = caffe_pb2.BlobProto()
            caffenet_mean.ParseFromString(open(input_mean, 'rb').read())
            mean_ = utils.CaffeBlobToNumpyArray(caffenet_mean)
            mean_tensor = utils.NumpyArrayToCaffe2Tensor(mean_, 'mean_')
            net_params.protos.extend([mean_tensor])
            mean_op = caffe2_pb2.OperatorDef()
            mean_op.type = 'Sub'
            mean_op.input.extend(['data_', 'mean_'])
            # Assume that input blob's name is "data"
            mean_op.output.extend(['data'])
            net.op.extend([mean_op])
        i = 0
        while i < len(caffe_net.layer):
            if not _ShouldInclude(net_state, caffe_net.layer[i]):
                log.info('Current net state does not need layer {}'.format(
                    caffe_net.layer[i].name))
                i += 1
                continue
            log.info('Translate layer {}'.format(caffe_net.layer[i].name))
            # Get pretrained one
            pretrained_layers_index = ([
                l for l in range(len(pretrained_net.layer))
                if pretrained_net.layer[l].name == caffe_net.layer[i].name
            ] + [
                l for l in range(len(pretrained_net.layers))
                if pretrained_net.layers[l].name == caffe_net.layer[i].name
            ])
            is_bn = False
            if len(pretrained_layers_index) > 1:
                raise ValueError(
                    'huh? more than one pretrained layer of one name?')
            elif len(pretrained_layers_index) == 1:
                if pretrained_net.layer[
                        pretrained_layers_index[0]].type == "BatchNorm":
                    # A Scale layer should follow BatchNorm layer
                    # according to paper https://arxiv.org/abs/1502.03167.
                    assert pretrained_net.layer[pretrained_layers_index[0] +
                                                1].type == "Scale"
                    pretrained_blobs = [utils.CaffeBlobToNumpyArray(blob)
                    for blob in pretrained_net.layer[pretrained_layers_index[0]].blobs] + \
                        [utils.CaffeBlobToNumpyArray(blob)
                    for blob in pretrained_net.layer[pretrained_layers_index[0] + 1].blobs]
                    is_bn = True
                else:
                    pretrained_blobs = [
                        utils.CaffeBlobToNumpyArray(blob) for blob in
                        pretrained_net.layer[pretrained_layers_index[0]].blobs
                    ]
            else:
                # No pretrained layer for the given layer name. We'll just pass
                # no parameter blobs.
                # print 'No pretrained layer for layer', layer.name
                pretrained_blobs = []

            operators, params = cls.TranslateLayer(caffe_net.layer[i],
                                                   pretrained_blobs, is_test)
            net.op.extend(operators)
            net_params.protos.extend(params)
            if is_bn:
                i += 2
            else:
                i += 1
        return net, net_params
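# Caller-side sketch for TranslateModel (hedged: the file paths are
# hypothetical, and TranslateModel is assumed to be reachable at module
# level as in caffe2.python.caffe_translator):
from google.protobuf import text_format
from caffe.proto import caffe_pb2

caffe_net = caffe_pb2.NetParameter()
with open("deploy.prototxt", "r") as f:
    text_format.Merge(f.read(), caffe_net)

pretrained_net = caffe_pb2.NetParameter()
with open("weights.caffemodel", "rb") as f:
    pretrained_net.ParseFromString(f.read())

net, net_params = TranslateModel(caffe_net, pretrained_net, is_test=True)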
Example #30
    def _test_binary_op_graph(self, name):
        workspace.ResetWorkspace()
        # First dimension is the batch size
        dims = np.concatenate((np.array([1]), np.random.randint(1, 20,
                                                                size=3)))
        A = np.random.uniform(low=-100.0, high=100.0,
                              size=dims).astype(np.float32)
        B = np.random.uniform(low=-100.0, high=100.0,
                              size=dims).astype(np.float32)
        print(A.shape, B.shape)
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["A", "B"])
        pred_net.external_output.append("C")
        pred_net.op.add().CopyFrom(core.CreateOperator(name, ["A", "B"],
                                                       ["C"]))
        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "ref"
        pred_net_ref.external_input.extend(["A", "B"])
        pred_net_ref.external_output.append("C_ref")
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                name + "FakeFp16",
                ["A", "B"],
                ["C_ref"],
            ))

        shape_hints = {"A": A.shape, "B": B.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hints,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        print(pred_net_onnxified)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.SwitchWorkspace("glow_test_ws", True)
        workspace.FeedBlob("A", A)
        workspace.FeedBlob("B", B)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)
        num_iterations = 10
        for _ in range(num_iterations):
            A = np.random.uniform(low=-100.0, high=100.0,
                                  size=dims).astype(np.float32)
            B = np.random.uniform(low=-100.0, high=100.0,
                                  size=dims).astype(np.float32)
            workspace.FeedBlob("A", A)
            workspace.FeedBlob("B", B)
            # Run caffe2 net
            workspace.RunNet(pred_net_ref.name)
            Y_c2 = workspace.FetchBlob("C_ref")

            # Run Glow net
            workspace.RunNet(pred_net_onnxified.name)
            Y_glow = workspace.FetchBlob("C")

            # Results should be identical since we are comparing with the C2 emulation
            if not np.allclose(Y_c2, Y_glow):
                diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
                print_test_debug_info(
                    name, {
                        "dims": dims,
                        "A": A,
                        "B": B,
                        "Y_glow": Y_glow,
                        "Y_c2": Y_c2,
                        "diff": diff
                    })
                assert (0)