Code example #1
File: conv_test.py  Project: vbvg2008/pytorch
    def test_1x1_conv(self, op_type, N, G, DX, DY, H, W, use_bias, order,
                      force_algo_fwd, force_algo_dgrad,
                      force_algo_wgrad, gc, dc):
        if hiputl.run_in_hip(gc, dc):
            assume(order == "NCHW")
        if order == "NHWC":
            G = 1

        C = G * DX
        M = G * DY

        op = core.CreateOperator(
            op_type,
            ["X", "filter", "bias"] if use_bias else ["X", "filter"],
            ["Y"],
            stride_h=1,
            stride_w=1,
            pad_t=0,
            pad_l=0,
            pad_b=0,
            pad_r=0,
            kernel=1,
            order=order,
            group=G,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        if order == "NCHW":
            X = np.random.randn(N, C, H, W).astype(np.float32)
            filter = np.random.randn(M, DX, 1, 1).astype(np.float32)
        else:
            X = np.random.randn(N, H, W, C).astype(np.float32)
            filter = np.random.randn(M, 1, 1, DX).astype(np.float32)
        bias = np.random.randn(M).astype(np.float32)
        inputs = [X, filter, bias] if use_bias else [X, filter]

        def conv_1x1_nchw_ref(X, filter, bias=None):
            X = X.reshape(N, G, DX, -1)
            filter = filter.reshape(G, DY, DX)
            Y = np.zeros(shape=(N, G, DY, H * W), dtype=np.float32)
            for i in range(N):
                for j in range(G):
                    Y[i, j, :, :] = np.dot(filter[j, :, :], X[i, j, :, :])
            Y = Y.reshape(N, M, H, W)
            if bias is not None:
                bias = bias.reshape(1, M, 1, 1)
                Y = np.add(Y, bias)
            return [Y]

        def conv_1x1_nhwc_ref(X, filter, bias=None):
            X = X.reshape(N, -1, G, DX)
            filter = filter.reshape(G, DY, DX)
            Y = np.zeros(shape=(N, H * W, G, DY), dtype=np.float32)
            for i in range(N):
                for j in range(G):
                    Y[i, :, j, :] = np.dot(
                        X[i, :, j, :], filter[j, :, :].transpose())
            Y = Y.reshape(N, H, W, M)
            if bias is not None:
                bias = bias.reshape(1, 1, 1, M)
                Y = np.add(Y, bias)
            return [Y]

        if order == "NCHW":
            conv_1x1_ref = conv_1x1_nchw_ref
        else:
            conv_1x1_ref = conv_1x1_nhwc_ref
        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            reference=conv_1x1_ref,
        )
        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
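Note: both reference implementations above rely on the fact that a 1x1 grouped convolution is just a per-group matrix multiply. A minimal NumPy sketch of that identity (sizes and names below are illustrative, not taken from the test):

import numpy as np

N, G, DX, DY, H, W = 2, 3, 4, 5, 6, 7  # illustrative sizes
X = np.random.randn(N, G * DX, H, W).astype(np.float32)
F = np.random.randn(G * DY, DX, 1, 1).astype(np.float32)

# Same computation as conv_1x1_nchw_ref: one (DY x DX) matmul per group
Xg = X.reshape(N, G, DX, H * W)
Fg = F.reshape(G, DY, DX)
Y = np.einsum("gij,ngjk->ngik", Fg, Xg).reshape(N, G * DY, H, W)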
Code example #2
 def testEnforce(self):
     op = core.CreateOperator("Relu", ["X"], ["Y"])
     with self.assertRaises(RuntimeError):
         workspace.RunOperatorOnce(op)
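The RuntimeError here comes from the enforce failure: the input blob "X" was never fed into the workspace. A minimal follow-up sketch (assuming numpy is imported as np) in which the same op runs successfully:

workspace.FeedBlob("X", np.random.randn(2, 3).astype(np.float32))
workspace.RunOperatorOnce(op)  # no longer raises; blob "Y" now exists in the workspace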
Code example #3
    def test_cross_entropy_and_unjoied_cross_entropy_relation(
        self, log_D_trick, gc, dc
    ):
        logits = np.array([1.4720, 0.3500, -0.6529, -1.1908, 0.8357,
                    -1.0774, -0.3395, -0.2469, 0.6708, -1.8332], dtype='f')
        targets = np.array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0.], dtype='f')
        lr_size = targets.size
        unjoined_lr_loss = False

        def sigmoid_xentr_logit_ref(logits, targets):
            if unjoined_lr_loss:
                s = unjoined_sigmoid_cross_entropy(logits, targets)
            else:
                s = sigmoid_cross_entropy_with_logits(logits, targets)
            m = np.mean(s, axis=len(logits.shape) - 1)
            return (m, )

        def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs):
            fwd_logits, fwd_targets = fwd_inputs
            inner_size = fwd_logits.shape[-1]
            if unjoined_lr_loss:
                m = unjoined_sigmoid_cross_entropy_grad(logits, targets)
            else:
                m = sigmoid_cross_entropy_with_logits_grad(
                    fwd_logits, fwd_targets)

            # m = fwd_targets - sigmoid(fwd_logits)
            g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size
            return (g_in, None)

        op = core.CreateOperator(
            'SigmoidCrossEntropyWithLogits', ['logits', 'targets'],
            ['xentropy'],
            log_D_trick=log_D_trick,
            unjoined_lr_loss=unjoined_lr_loss
        )
        output_lr = self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[logits, targets],
            reference=sigmoid_xentr_logit_ref,
            output_to_grad='xentropy',
            grad_reference=sigmoid_xentr_logit_grad_ref)

        # Unjoined dataset where labels change later
        logits = np.array([1.4720, 0.3500, -0.6529, -1.1908, 0.8357,
                    -1.0774, -0.3395, -0.2469, 0.6708, -1.8332, 1.4720, 0.3500,
                    -0.6529, -1.1908, 0.8357, -1.0774], dtype='f')
        targets = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0.,
                            0., 1., 1., 1., 1., 1., 1.], dtype='f')
        unjoined_lr_loss = True
        unjoined_lr_size = targets.size

        op = core.CreateOperator(
            'SigmoidCrossEntropyWithLogits', ['logits', 'targets'],
            ['xentropy'],
            log_D_trick=log_D_trick,
            unjoined_lr_loss=unjoined_lr_loss
        )
        outputs_unjoined_lr = self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[logits, targets],
            reference=sigmoid_xentr_logit_ref,
            output_to_grad='xentropy',
            grad_reference=sigmoid_xentr_logit_grad_ref)

        self.assertAlmostEqual(
            output_lr[0].item(0) * lr_size / unjoined_lr_size,
            outputs_unjoined_lr[0].item(0),
            delta=0.0001)
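The helpers sigmoid_cross_entropy_with_logits and sigmoid_cross_entropy_with_logits_grad are defined elsewhere in the test module. A sketch of a standard, numerically stable formulation that is consistent with the in-line comment above ("m = fwd_targets - sigmoid(fwd_logits)"); this is an assumption about those helpers, not a quote of them:

import numpy as np

def sigmoid_cross_entropy_with_logits_sketch(logits, targets):
    # -(t*log(sigmoid(x)) + (1-t)*log(1-sigmoid(x))), rewritten to avoid overflow
    return np.maximum(logits, 0) - logits * targets + np.log1p(np.exp(-np.abs(logits)))

def sigmoid_cross_entropy_with_logits_grad_sketch(logits, targets):
    # derivative w.r.t. logits, up to the sign and averaging applied in the test
    return targets - 1.0 / (1.0 + np.exp(-logits))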
Code example #4
    def test_groupwise_dnnlowp_conv_relu_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            True,  # group-wise
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("ConvRelu", "DNNLOWP"),
            ("ConvRelu", "DNNLOWP_16"),
            ("Int8ConvRelu", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            if "DNNLOWP" in engine:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

                conv = core.CreateOperator(
                    op_type,
                    ["X_q", "W", "b"],
                    ["Y_q"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    order=order,
                    engine=engine,
                    group=group,
                    quantize_groupwise=1,
                    device_option=gc,
                )
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
                net.Proto().op.extend([conv])

                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])
            else:
                conv = core.CreateOperator(
                    op_type,
                    ["X", "W", "b"],
                    ["Y"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    order=order,
                    engine=engine,
                    group=group,
                    device_option=gc,
                )
                net.Proto().op.extend([conv])

                relu = core.CreateOperator("Relu", ["Y"], ["Y"],
                                           engine=engine,
                                           device_option=gc)
                net.Proto().op.extend([relu])

            run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order,
                           gc, outputs)

        check_quantized_results_close(outputs)
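The DNNLOWP examples in this collection all build the same three-stage net: Quantize the fp32 input, run the low-precision op with an engine argument, then Dequantize back to fp32 before comparing against a plain fp32 run. A distilled sketch of that pattern, assuming the DNNLOWP engine is compiled in and registers a Relu kernel (blob and net names are illustrative):

from caffe2.python import core, workspace
import numpy as np

net = core.Net("dnnlowp_pattern")
net.Proto().op.extend([
    core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP"),
    core.CreateOperator("Relu", ["X_q"], ["Y_q"], engine="DNNLOWP"),
    core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP"),
])

workspace.FeedBlob("X", (np.random.rand(16) * 10).astype(np.float32))
workspace.RunNetOnce(net)
Y = workspace.FetchBlob("Y")  # fp32 values again, with int8 rounding error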
Code example #5
    def test_layernorm(self, seed):
        np.random.seed(seed)
        # Reset the workspace
        size = 4
        input_channels = 4
        batch_size = 1
        axis = 1
        epsilon = 1e-4
        workspace.ResetWorkspace()

        dims = np.array(([batch_size, input_channels, size, size]))
        X = np.random.uniform(size=dims).astype(np.float32) - 0.5
        gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
        beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", "gamma", "beta"])
        pred_net.external_output.extend(["Y", "mean", "rstd"])
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNorm",
                ["X", "gamma", "beta"],
                ["Y", "mean", "rstd"],
                axis=1,
                epsilon=epsilon,
                elementwise_affine=True
            )
        )

        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred_ref"
        pred_net_ref.external_input.extend(["X", "gamma", "beta"])
        pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNormFakeFP16NNPI",
                ["X", "gamma", "beta"],
                ["Y", "mean", "rstd"],
                axis=1,
                epsilon=epsilon,
                elementwise_affine=True
            )
        )

        shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
        pred_net_onnxified = onnxifi_caffe2_net(
            pred_net,
            shape_hits,
            debug=True,
            adjust_batch=True,
            use_onnx=False
        )
        num_onnxified_ops = sum(
            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)
        workspace.FeedBlob("gamma", gamma)
        workspace.FeedBlob("beta", beta)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)

        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
            diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
            print_test_debug_info(
                "layernorm",
                {
                    "seed": seed,
                    "size": size,
                    "input_channels": input_channels,
                    "batch_size": batch_size,
                    "epsilon": epsilon,
                    "axis": axis,
                    "X": X,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff_Y": diff_Y,
                }
            )
            assert(0)
Code example #6
    def test_convert_end2end(self):
        predict_net_f = tempfile.NamedTemporaryFile()
        init_net_f = tempfile.NamedTemporaryFile()
        onnx_model_f = tempfile.NamedTemporaryFile()

        x = 'X'
        w = 'W'
        b = 'b'
        y = 'Y'

        predict_net = caffe2_pb2.NetDef()
        predict_net.name = 'test-convert-end2end'
        predict_net.external_input[:] = [x, w, b]
        predict_net.external_output[:] = [y]
        predict_net.op.extend([
            core.CreateOperator(
                'FC',
                inputs=[x, w, b],
                outputs=[y],
                axis=2,
            ),
        ])
        predict_net_f.write(predict_net.SerializeToString())
        predict_net_f.flush()

        init_net = caffe2_pb2.NetDef()
        init_net.name = 'test-convert-end2end-init'
        init_net.external_output[:] = [w, b]
        x_val = np.random.randn(1, 3, 2).astype(np.float32)
        w_val = np.random.randn(4, 2).astype(np.float32)
        b_val = np.random.randn(4).astype(np.float32)
        init_net.op.extend([
            core.CreateOperator(
                'GivenTensorFill',
                [],
                [w],
                values=w_val,
                shape=w_val.shape,
            ),
            core.CreateOperator(
                'GivenTensorFill',
                [],
                [b],
                values=b_val,
                shape=b_val.shape,
            ),
        ])
        init_net_f.write(init_net.SerializeToString())
        init_net_f.flush()

        y_val = np.matmul(x_val, w_val.transpose()) + b_val
        for _ in range(5):
            self._run_command(
                caffe2_to_onnx,
                [
                    predict_net_f.name,
                    '--caffe2-init-net',
                    init_net_f.name,
                    '--output',
                    onnx_model_f.name,
                    '--value-info',
                    json.dumps({
                        x: (TensorProto.FLOAT, (1, 3, 2)),
                    }),
                ],
                catch_exceptions=False,
            )

            onnx_model_f.seek(0)
            onnx_model = ModelProto()
            onnx_model.ParseFromString(onnx_model_f.read())
            np.testing.assert_almost_equal(
                c2.run_model(onnx_model,
                             {onnx_model.graph.input[0].name: x_val}), [y_val])

            self._run_command(onnx_to_caffe2, [
                onnx_model_f.name,
                '--output',
                predict_net_f.name,
                '--init-net-output',
                init_net_f.name,
            ])
            predict_net_f.seek(0)
            predict_net = caffe2_pb2.NetDef()
            predict_net.ParseFromString(predict_net_f.read())
            init_net_f.seek(0)
            init_net = caffe2_pb2.NetDef()
            init_net.ParseFromString(init_net_f.read())
            x = predict_net.external_input[0]
            np.testing.assert_almost_equal(
                c2_native_run_net(init_net=init_net,
                                  predict_net=predict_net,
                                  inputs={x: x_val})[1], [y_val])
Code example #7
    def test_dnnlowp_average_pool(
        self,
        ndim,
        stride,
        pad,
        kernel,
        size,
        input_channels,
        batch_size,
        order,
        in_quantized,
        gc,
        dc,
    ):
        kernel = 2  # Only kernel size 2 is supported
        assume(kernel <= size)
        assume(pad < kernel)

        C = input_channels
        N = batch_size

        strides = (stride, ) * ndim
        pads = (pad, ) * (ndim * 2)
        kernels = (kernel, ) * ndim
        sizes = (size, ) * ndim

        # X has scale 1, so no input quantization error
        min_ = -100
        max_ = min_ + 255
        if order == "NCHW":
            X = np.round(
                np.random.rand(*((N, C) + sizes)) * (max_ - min_) + min_)
            X = X.astype(np.float32)
            X[(0, ) * (ndim + 2)] = min_
            X[(0, ) * (ndim + 1) + (1, )] = max_
        elif order == "NHWC":
            X = np.round(
                np.random.rand(*((N, ) + sizes + (C, ))) * (max_ - min_) +
                min_)
            X = X.astype(np.float32)
            X[(0, ) * (ndim + 2)] = min_
            X[(0, 1) + (0, ) * ndim] = max_

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("AveragePool", ""),
            ("AveragePool", "DNNLOWP"),
            ("Int8AveragePool", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            max_pool = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X"],
                ["Y_q" if engine == "DNNLOWP" else "Y"],
                strides=strides,
                kernels=kernels,
                pads=pads,
                order=order,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([max_pool])

            if engine == "DNNLOWP":
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
Code example #8
 def testCreateWithNoneKwarg(self):
     with self.assertRaises(ValueError):
         core.CreateOperator("Ludicrous", "x", "y", arg1=None)
Code example #9
File: layer_model_helper.py  Project: ssyd/caffe2
 def initializer(blob_name):
     return core.CreateOperator(op_name, [],
                                blob_name,
                                shape=array.shape,
                                values=array.flatten().tolist())
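initializer closes over op_name and array from its enclosing scope in layer_model_helper.py. A hypothetical standalone equivalent, purely for illustration (make_initializer is not a real Caffe2 helper):

import numpy as np
from caffe2.python import core

def make_initializer(op_name, array):
    def initializer(blob_name):
        return core.CreateOperator(op_name, [],
                                   blob_name,
                                   shape=array.shape,
                                   values=array.flatten().tolist())
    return initializer

init = make_initializer("GivenTensorFill", np.ones((2, 3), dtype=np.float32))
fill_op = init("w")  # an OperatorDef that fills blob "w" with the given values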
Code example #10
def hard_sigmoid(x):
    x = (x + 1.) / 2.
    workspace.FeedBlob("x", x)
    Clip = core.CreateOperator("Clip", ["x"], ["x"], min=0., max=1.)
    workspace.RunOperatorOnce(Clip)
    return workspace.FetchBlob("x")
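A quick check of the arithmetic (assuming numpy as np and the workspace/core imports used elsewhere on this page): (x + 1) / 2 followed by Clip to [0, 1] maps -1 to 0, 0 to 0.5, and anything >= 1 to 1:

x = np.array([-2.0, -1.0, 0.0, 1.0, 2.0], dtype=np.float32)
print(hard_sigmoid(x))  # expected: [0., 0., 0.5, 1., 1.]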
Code example #11
File: wsl_memonger.py  Project: zlapp/DRN-WSOD
def share_freeze_blobs(
    net,
    namescope,
):

    log.warn("NOTE: Executing memonger to optimize gradient memory")

    # Collect ops that have something to do with gradients
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    new_net = copy.deepcopy(net)
    activations = []
    external_input = set(new_net.Proto().external_input)
    external_output = set(new_net.Proto().external_output)

    start_idx = -1
    end_idx = -1

    # ops
    for idx, op in enumerate(new_net._net.op):
        # print(op)
        if namescope not in op.input[0]:
            continue
        if op.type == 'Conv' and start_idx < 0:
            start_idx = idx
        if op.type == 'StopGradient':
            end_idx = idx

    # print(namescope, 'start_idx: ', start_idx, ' end_idx: ', end_idx)

    # Hacky way to get activations, think of a better way
    for idx, op in enumerate(new_net._net.op[start_idx:end_idx]):
        if namescope not in op.input[0]:
            continue
        for b in op.output:
            if b not in external_output:
                activations.append(b)

    # print('activations: ', activations)

    used_activations = []
    for a in activations:
        if a in used_activations:
            continue
        share_pool = [
            namescope + '_shared_' + str(i) for i in range(1000, 10000)
        ]
        # print(a)
        first_idx = -1
        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            if a in list(op.input) + list(op.output):
                first_idx = idx
                break

        assert first_idx >= 0, first_idx

        for idx, op in enumerate(new_net._net.op[first_idx:]):
            if namescope not in op.input[0]:
                continue
            for b in list(op.input) + list(op.output):
                if b in share_pool:
                    share_pool.remove(b)

        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            op_input = copy.deepcopy(op.input)
            is_found = False
            for i, b in enumerate(op_input):
                if a == b:
                    op_input[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].input[:]
                new_net._net.op[idx].input.extend(op_input)

            op_output = copy.deepcopy(op.output)
            is_found = False
            for i, b in enumerate(op_output):
                if a == b:
                    op_output[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].output[:]
                new_net._net.op[idx].output.extend(op_output)

        used_activations.append(a)

    assert verify_graph_equality(net.Proto(), new_net.Proto()), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), new_net.Proto()), \
        "Inplace assignments differ in memonger net."

    share_pool = [namescope + '_shared_' + str(i) for i in range(1000, 10000)]
    share_pool_used = {}
    for idx, op in enumerate(new_net._net.op):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool:
                share_pool_used[b] = idx

    for idx, op in enumerate(new_net._net.op[end_idx:]):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool_used.keys():
                share_pool_used.pop(b)

    ops = list(new_net._net.op)
    for inp in share_pool_used.keys():
        # print('free: ', inp)
        # new_net.Free([inp], [inp])

        ops.insert(share_pool_used[inp] + 1,
                   core.CreateOperator("Free", [inp], [inp]))
    del new_net._net.op[:]
    new_net._net.op.extend(ops)

    return new_net.Proto()
Code example #12
def tt(x):
    workspace.FeedBlob("x", x)
    Clip = core.CreateOperator("Clip", ["x"], ["x"], min=0., max=1.)
    workspace.RunOperatorOnce(Clip)
    return workspace.FetchBlob("x")
Code example #13
    def test_dnnlowp_elementwise_add_int(self, N, is_empty, in_quantized,
                                         out_quantized, in_place, gc, dc):
        if is_empty:
            N = 0
        # FIXME: DNNLOWP Add doesn't support inplace operation and
        # dequantize_output=1 at the same time
        if in_place[0] or in_place[1]:
            in_quantized = True
            out_quantized = True

        # A has scale 1, so exactly represented after quantization
        min_ = -100
        max_ = min_ + 255
        A = np.round(np.random.rand(N) * (max_ - min_) + min_)
        A = A.astype(np.float32)
        if N != 0:
            A[0] = min_
            A[1] = max_

        # B has scale 1/2, so exactly represented after quantization
        B = np.round(np.random.rand(N) * 255 / 2 - 64).astype(np.float32)
        if N != 0:
            B[0] = -64
            B[1] = 127.0 / 2

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [("Add", ""), ("Add", "DNNLOWP"),
                          ("Int8Add", "DNNLOWP")]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize_A = core.CreateOperator("Quantize", ["A"], ["A_q"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([quantize_A])

                quantize_B = core.CreateOperator("Quantize", ["B"], ["B_q"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([quantize_B])

            out = "Y"
            if in_place[0]:
                out = "A"
            elif in_place[1]:
                out = "B"

            add = core.CreateOperator(
                op_type,
                ["A_q", "B_q"] if do_quantize else ["A", "B"],
                [(out + "_q") if do_dequantize else out],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([add])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", [out + "_q"],
                                                 [out],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("A").feed(A, device_option=gc)
            self.ws.create_blob("B").feed(B, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs[out].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
Code example #14
 def test_mkl_sigmoid(self, n, m, inplace, gc, dc):
     X = np.random.rand(m, n).astype(np.float32)
     op = core.CreateOperator("Sigmoid", ["X"],
                              ["Y" if not inplace else "X"])
     self.assertDeviceChecks(dc, op, [X], [0])
Code example #15
    def test_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        prepack_weight,
        nbits_in_non_outlier,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        if nbits_in_non_outlier == 0:
            X, W, b = generate_conv_inputs(
                stride,
                pad,
                kernel,
                dilation,
                size,
                group,
                input_channels_per_group,
                output_channels_per_group,
                batch_size,
                order,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
            )
        else:
            X_min = 0 if preserve_activation_sparsity else -77
            X_max = X_min + 255
            X = np.random.rand(batch_size, size, size,
                               input_channels) * 4 + X_min
            X = np.round(X).astype(np.float32)
            X[..., 0] = X_min
            X[0, 0, 0, 1] = X_max

            if preserve_weight_sparsity:
                W_min = -128
                W_max = 100
            else:
                W_min = -100
                W_max = W_min + 255
            W = (np.random.rand(output_channels, kernel, kernel,
                                input_channels_per_group) * 4 - 2 + W_min +
                 128)
            W = np.round(W).astype(np.float32)
            W[0, 0, 0, 0] = W_min
            W[1, 0, 0, 0] = W_max
            W[..., 1] = W_min + 128

            # No input quantization error in bias
            b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = "DNNLOWP" in engine and weight_quantized
            do_prepack_weight = "DNNLOWP" in engine and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity)
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity)
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    nbits_in_non_outlier=nbits_in_non_outlier,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else
                    ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
Code example #16
    def test_upsample_grad(self, height_scale, width_scale, height, width,
                          num_channels, batch_size, seed, gc, dc):

        np.random.seed(seed)

        output_height = np.int32(height * height_scale)
        output_width = np.int32(width * width_scale)
        X = np.random.rand(batch_size,
                           num_channels,
                           height,
                           width).astype(np.float32)
        dY = np.random.rand(batch_size,
                            num_channels,
                            output_height,
                            output_width).astype(np.float32)
        scales = np.array([height_scale, width_scale]).astype(np.float32)

        ops = [
            (
                core.CreateOperator(
                    "UpsampleBilinearGradient",
                    ["dY", "X"],
                    ["dX"],
                    width_scale=width_scale,
                    height_scale=height_scale,
                ),
                [dY, X],
            ),
            (
                core.CreateOperator(
                    "UpsampleBilinearGradient",
                    ["dY", "X", "scales"],
                    ["dX"],
                ),
                [dY, X, scales],
            ),
        ]

        for op, inputs in ops:
            def ref(dY, X, scales=None):
                dX = np.zeros_like(X)

                rheight = ((height - 1) / (output_height - 1)
                        if output_height > 1
                        else float(0))
                rwidth = ((width - 1) / (output_width - 1)
                        if output_width > 1
                        else float(0))

                for i in range(output_height):
                    h1r = rheight * i
                    h1 = int(h1r)
                    h1p = 1 if h1 < height - 1 else 0
                    h1lambda = h1r - h1
                    h0lambda = float(1) - h1lambda
                    for j in range(output_width):
                        w1r = rwidth * j
                        w1 = int(w1r)
                        w1p = 1 if w1 < width - 1 else 0
                        w1lambda = w1r - w1
                        w0lambda = float(1) - w1lambda
                        dX[:, :, h1, w1] += (
                            h0lambda * w0lambda * dY[:, :, i, j])
                        dX[:, :, h1, w1 + w1p] += (
                            h0lambda * w1lambda * dY[:, :, i, j])
                        dX[:, :, h1 + h1p, w1] += (
                            h1lambda * w0lambda * dY[:, :, i, j])
                        dX[:, :, h1 + h1p, w1 + w1p] += (
                            h1lambda * w1lambda * dY[:, :, i, j])

                return dX,

            self.assertDeviceChecks(dc, op, inputs, [0])
            self.assertReferenceChecks(gc, op, inputs, ref)
Code example #17
    def test_dnnlowp_conv_acc16_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        # X and W have scale 1, so exactly represented after quantization
        # This is ensured by having at least one 0 and one 255 in unsigned
        # 8-bit tensors, and at least one -128 and one 127 in signed 8-bit
        # tensors.
        # Since fbgemm_acc16 accumulates to 16-bit, we use small numbers in this
        # test to avoid overflow, except for those 0, 255, -128, and 127 values.
        # We also make sure 255, -128, and 127 are never multiplied together, by
        # putting them in different input channels and making the corresponding
        # input channel of the other matrix all zeros.
        # For example, we put 255 in input channel 1 in X, so we make the
        # corresponding input channel in W all zeros.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2 + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128  # "zeros"

        if order == "NCHW":
            X = nhwc2nchw(X)
            W = nhwc2nchw(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = ("DNNLOWP" in engine and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity)
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X.min(), X.max(), preserve_activation_sparsity)
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
Code example #18
    def test_upsample(self, height_scale, width_scale, height, width,
                     num_channels, batch_size, seed,
                     gc, dc):

        np.random.seed(seed)

        X = np.random.rand(
            batch_size, num_channels, height, width).astype(np.float32)
        scales = np.array([height_scale, width_scale]).astype(np.float32)

        ops = [
            (
                core.CreateOperator(
                    "UpsampleBilinear",
                    ["X"],
                    ["Y"],
                    width_scale=width_scale,
                    height_scale=height_scale,
                ),
                [X],
            ),
            (
                core.CreateOperator(
                    "UpsampleBilinear",
                    ["X", "scales"],
                    ["Y"],
                ),
                [X, scales],
            ),
        ]

        for op, inputs in ops:
            def ref(X, scales=None):
                output_height = np.int32(height * height_scale)
                output_width = np.int32(width * width_scale)

                Y = np.random.rand(
                    batch_size, num_channels, output_height,
                    output_width).astype(np.float32)

                rheight = ((height - 1) / (output_height - 1)
                        if output_height > 1
                        else float(0))
                rwidth = ((width - 1) / (output_width - 1)
                        if output_width > 1
                        else float(0))

                for i in range(output_height):
                    h1r = rheight * i
                    h1 = int(h1r)
                    h1p = 1 if h1 < height - 1 else 0
                    h1lambda = h1r - h1
                    h0lambda = float(1) - h1lambda
                    for j in range(output_width):
                        w1r = rwidth * j
                        w1 = int(w1r)
                        w1p = 1 if w1 < width - 1 else 0
                        w1lambda = w1r - w1
                        w0lambda = float(1) - w1lambda
                        Y[:, :, i, j] = (h0lambda * (
                            w0lambda * X[:, :, h1, w1] +
                            w1lambda * X[:, :, h1, w1 + w1p]) +
                            h1lambda * (w0lambda * X[:, :, h1 + h1p, w1] +
                            w1lambda * X[:, :, h1 + h1p, w1 + w1p]))

                return Y,

            self.assertReferenceChecks(gc, op, inputs, ref)
            self.assertDeviceChecks(dc, op, inputs, [0])
            self.assertGradientChecks(gc, op, inputs, 0, [0], stepsize=0.1,
                                      threshold=1e-2)
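Both this reference and the gradient reference in code example #16 implement standard bilinear interpolation: for output pixel (i, j), h1r = i * (height - 1) / (output_height - 1), h1 = floor(h1r), h1lambda = h1r - h1 (and likewise for the width), so

Y[n, c, i, j] = (1 - h1lambda) * (1 - w1lambda) * X[n, c, h1, w1]
              + (1 - h1lambda) * w1lambda       * X[n, c, h1, w1 + w1p]
              + h1lambda * (1 - w1lambda)       * X[n, c, h1 + h1p, w1]
              + h1lambda * w1lambda             * X[n, c, h1 + h1p, w1 + w1p]

where h1p and w1p clamp the +1 offsets at the border. The gradient test in example #16 scatters dY back into dX with the same four weights.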
Code example #19
from caffe2.python import core, workspace
import numpy as np

# Let's print the current workspace. Note that there is nothing in the
# workspace yet.
print("Current workspace: {}".format(workspace.CurrentWorkspace()))
print("Current blobs in the workspace: {}".format(workspace.Blobs()))

workspace.SwitchWorkspace("default")
print("Current workspace: {}".format(workspace.CurrentWorkspace()))
print("Current blobs in the workspace: {}".format(workspace.Blobs()))

workspace.ResetWorkspace()

# Create an operator.
op = core.CreateOperator(
    "Relu",  # The type of operator that we want to run
    ["X"],  # A list of input blobs by their names
    ["Y"],  # A list of output blobs by their names
)
# and we are done!

print("Type of the created op is: {}".format(type(op)))
print("Content:\n")
print(str(op))

workspace.FeedBlob("X", np.random.randn(2, 3).astype(np.float32))
workspace.RunOperatorOnce(op)

print("Current blobs in the workspace: {}\n".format(workspace.Blobs()))
print("X:\n{}\n".format(workspace.FetchBlob("X")))
print("Y:\n{}\n".format(workspace.FetchBlob("Y")))
print("Expected:\n{}\n".format(np.maximum(workspace.FetchBlob("X"), 0)))
Code example #20
    def test_broadcast_powt(self, gc, dc):
        np.random.seed(101)

        #operator
        def powt_op(X, Y):
            return [np.power(X, Y)]

        #two gradients Y*X^(Y-1) and X^Y * ln(X)
        def powt_grad(g_out, outputs, fwd_inputs):
            [X, Y] = fwd_inputs
            Z = outputs[0]
            return ([Y * np.power(X, Y - 1), Z * np.log(X)] * g_out)

        #1. Set broadcast and no axis, i.e. broadcasting last dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
        Y = np.random.rand(4, 5).astype(np.float32) + 2.0

        #two gradients Y*X^(Y-1) and X^Y * ln(X)
        #latter gradient is summed over 1 and 0 dims to account for broadcast
        def powt_grad_broadcast(g_out, outputs, fwd_inputs):
            [GX, GY] = powt_grad(g_out, outputs, fwd_inputs)
            return ([GX, np.sum(np.sum(GY, 1), 0)])

        op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1)
        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=powt_op,
                                   output_to_grad="Z",
                                   grad_reference=powt_grad_broadcast)

        #2. broadcasting intermediate dimensions
        X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
        Y = np.random.rand(3, 4).astype(np.float32) + 2.0

        #pow op with the latter array increased by one dim
        def powt_op_axis1(X, Y):
            return powt_op(X, Y[:, :, np.newaxis])

        #two gradients Y*X^(Y-1) and X^Y * ln(X)
        #latter gradient is summed over 3 and 0 dims to account for broadcast
        def powt_grad_axis1(g_out, outputs, fwd_inputs):
            [X, Y] = fwd_inputs
            [GX, GY] = powt_grad(g_out, outputs, [X, Y[:, :, np.newaxis]])
            return ([GX, np.sum(np.sum(GY, 3), 0)])

        op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=1)
        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=powt_op_axis1,
                                   output_to_grad="Z",
                                   grad_reference=powt_grad_axis1)

        #3. broadcasting the first dimension
        X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
        Y = np.random.rand(2).astype(np.float32) + 2.0

        #pow op with the latter array increased by one dim
        def powt_op_axis0(X, Y):
            return powt_op(X, Y[:, np.newaxis, np.newaxis, np.newaxis])

        #two gradients Y*X^(Y-1) and X^Y * ln(X)
        #latter gradient is summed over 3, 2 and 1 dims to account for broadcast
        def powt_grad_axis0(g_out, outputs, fwd_inputs):
            [X, Y] = fwd_inputs
            [GX, GY] = powt_grad(g_out, outputs,
                                 [X, Y[:, np.newaxis, np.newaxis, np.newaxis]])
            return ([GX, np.sum(np.sum(np.sum(GY, 3), 2), 1)])

        op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=0)
        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=powt_op_axis0,
                                   output_to_grad="Z",
                                   grad_reference=powt_grad_axis0)

        #4. broadcasting with single elem dimensions at both ends
        X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0
        Y = np.random.rand(1, 4, 1).astype(np.float32) + 2.0

        #pow op with the latter array increased by one dim
        def powt_op_mixed(X, Y):
            return powt_op(X, Y[np.newaxis, :, :, :])

        #two gradients Y*X^(Y-1) and X^Y * ln(X)
        #latter gradient is summed over 3, 1 and 0 dims and reshaped to account for broadcast
        def powt_grad_mixed(g_out, outputs, fwd_inputs):
            [X, Y] = fwd_inputs
            [GX, GY] = powt_grad(g_out, outputs, [X, Y[np.newaxis, :, :, :]])
            return ([
                GX,
                np.reshape(np.sum(np.sum(np.sum(GY, 3), 1), 0), (1, 4, 1))
            ])

        op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=1)
        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=powt_op_mixed,
                                   output_to_grad="Z",
                                   grad_reference=powt_grad_mixed)
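As a quick sanity check of the two gradients used throughout this test: for Z = X^Y with X = 2 and Y = 3, Z = 8, dZ/dX = Y * X^(Y-1) = 3 * 4 = 12 and dZ/dY = Z * ln(X) = 8 * ln 2 ≈ 5.545. The broadcast variants differ only in which axes the second gradient is summed over before comparison.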
Code example #21
    def test_dnnlowp_max_pool(
        self,
        stride,
        pad,
        kernel,
        size,
        input_channels,
        batch_size,
        order,
        in_quantized,
        gc,
        dc,
    ):
        assume(kernel <= size)
        assume(pad < kernel)

        C = input_channels
        N = batch_size
        H = W = size

        min_ = -10
        max_ = 20
        if order == "NCHW":
            X = np.round(np.random.rand(N, C, H, W) * (max_ - min_) + min_)
        elif order == "NHWC":
            X = np.round(np.random.rand(N, H, W, C) * (max_ - min_) + min_)
        X = X.astype(np.float32)
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("MaxPool", ""),
            ("MaxPool", "DNNLOWP"),
            ("Int8MaxPool", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            max_pool = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X"],
                ["Y_q" if engine == "DNNLOWP" else "Y"],
                stride=stride,
                kernel=kernel,
                pad=pad,
                order=order,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([max_pool])

            if engine == "DNNLOWP":
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        # Y_i = max(X_j) so the only error is in quantization of inputs
        check_quantized_results_close(outputs, ref=X)
Code example #22
    def test_sum_reduce(self, gc, dc):
        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(4, 5).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=0)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # Set broadcast with axis=0, i.e. broadcasting the first dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(2, 3).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=0)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=3)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res, decimal=3)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting intermediate dimensions
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(3, 4).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting with a single-element Y, i.e. reducing X to a scalar
        X = np.random.rand(2, 3, 4, 500).astype(np.float64)
        Y = np.random.rand(1).astype(np.float64)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.array(np.sum(X))
        np.testing.assert_array_almost_equal(out, res, decimal=0)

        # broadcasting with single elem dimensions at both ends
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(1, 3, 4, 1).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2).reshape(Y.shape)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # fp64 is not supported with the CUDA op
        dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
        self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
Code example #23
    def test_groupwise_dnnlowp_conv_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        prepack_weight,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume((not prepack_weight) or order == "NHWC")

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            groupwise_quantization=True,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Conv", "DNNLOWP_16"),
            ("Int8Conv", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_prepack_weight:
                X_min = 0 if X.size == 0 else X.min()
                X_max = 0 if X.size == 0 else X.max()
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X_min, X_max)
                inputs = ["W"]
                if do_dequantize:
                    inputs += ["b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    quantize_groupwise=1,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else "W",
                    "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize or do_prepack_weight:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize",
                    ["Y_q"],
                    ["Y"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(self, init_net, net, X, W, b, op_type, engine,
                           order, gc, outputs)

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
Code example #24
    def test_sum_reduce_fp16(self, gc, dc):
        assume(core.IsGPUDeviceType(gc.device_type))

        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float16)
        Y = np.random.rand(4, 5).astype(np.float16)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 device_option=gc)

        def ref_op(X, Y):
            res = np.sum(X, axis=0)
            res = np.sum(res, axis=0)
            return [res]

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=ref_op,
                                   threshold=1e-3)

        # Set broadcast with axis=0, i.e. broadcasting the first dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float16)
        Y = np.random.rand(2, 3).astype(np.float16)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=0)

        def ref_op(X, Y):
            res = np.sum(X, axis=3)
            res = np.sum(res, axis=2)
            return [res]

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=ref_op,
                                   threshold=1e-3)

        # broadcasting intermediate dimensions
        X = np.random.rand(2, 3, 4, 5).astype(np.float16)
        Y = np.random.rand(3, 4).astype(np.float16)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=1)

        def ref_op(X, Y):
            res = np.sum(X, axis=0)
            res = np.sum(res, axis=2)
            return [res]

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=ref_op,
                                   threshold=1e-3)

        # broadcasting with single elem dimensions at both ends
        X = np.random.rand(2, 3, 4, 5).astype(np.float16)
        Y = np.random.rand(1, 3, 4, 1).astype(np.float16)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)

        def ref_op(X, Y):
            res = np.sum(X, axis=0)
            res = np.sum(res, axis=2)
            return [res.reshape(Y.shape)]

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X, Y],
                                   reference=ref_op,
                                   threshold=1e-3)
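Both SumReduceLike tests above encode the same rule: X is summed down to the shape of Y, with Y aligned to X's trailing axes by default or to the axes starting at `axis` when it is given, and size-1 axes of Y summed out but kept. Below is a small numpy-only sketch of that bookkeeping, written as an assumption about the semantics the tests exercise rather than as the op's actual implementation:

import numpy as np


def sum_reduce_like_ref(X, Y, axis=None):
    """Sum X down to Y's shape, mimicking SumReduceLike with broadcast=1.

    Y is aligned with X's trailing axes by default, or with the axes
    starting at `axis` when it is given; size-1 axes of Y are summed out
    but kept as size-1 dimensions.
    """
    if axis is None:
        axis = X.ndim - Y.ndim
    outer = tuple(a for a in range(X.ndim)
                  if a < axis or a >= axis + Y.ndim)
    res = X.sum(axis=outer) if outer else X
    ones = tuple(i for i, d in enumerate(Y.shape) if d == 1)
    if ones:
        res = res.sum(axis=ones, keepdims=True)
    return res


X = np.random.rand(2, 3, 4, 5).astype(np.float32)
print(sum_reduce_like_ref(X, np.zeros((4, 5))).shape)          # (4, 5)
print(sum_reduce_like_ref(X, np.zeros((2, 3)), axis=0).shape)  # (2, 3)
print(sum_reduce_like_ref(X, np.zeros((3, 4)), axis=1).shape)  # (3, 4)
print(sum_reduce_like_ref(X, np.zeros((1, 3, 4, 1))).shape)    # (1, 3, 4, 1)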
Code example #25
    def test_hsm_search(self):
        samples = 10
        dim_in = 5
        X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
        w = np.random.rand(hierarchy_proto.size, dim_in) \
            .astype(np.float32) - 0.5
        b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
        labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
            .astype(np.int32)

        workspace.GlobalInit(['caffe2'])
        workspace.FeedBlob("data", X)
        workspace.FeedBlob("weights", w)
        workspace.FeedBlob("bias", b)
        workspace.FeedBlob("labels", labels)
        op = core.CreateOperator('HSoftmaxSearch', ['data', 'weights', 'bias'],
                                 ['names', 'scores'],
                                 'HSoftmaxSearch',
                                 arg=args_search)
        workspace.RunOperatorOnce(op)
        names = workspace.FetchBlob('names')
        scores = workspace.FetchBlob('scores')

        def simulation_hsm_search():
            names = []
            scores = []
            for line in struct:
                s, e = line[0], line[0] + line[1]
                score = np.dot(X, w[s:e].transpose()) + b[s:e]
                score = np.exp(score - np.max(score, axis=1, keepdims=True))
                score /= score.sum(axis=1, keepdims=True)
                score = -np.log(score)

                score = score.transpose()
                idx = -1
                for j, n in enumerate(names):
                    if n == line[3]:
                        idx = j
                        score += scores[j]
                if idx == -1:
                    score[score > beam] = np.inf
                else:
                    score[score - scores[idx] > beam] = np.inf

                for i, name in enumerate(line[2]):
                    scores.append(score[i])
                    names.append(name)
            scores = np.vstack(scores)
            return names, scores.transpose()

        p_names, p_scores = simulation_hsm_search()
        idx = np.argsort(p_scores, axis=1)
        p_scores = np.sort(p_scores, axis=1)
        p_names = np.array(p_names)[idx]
        for i in range(names.shape[0]):
            for j in range(names.shape[1]):
                if names[i][j]:
                    self.assertEqual(names[i][j],
                                     p_names[i][j].item().encode('utf-8'))
                    self.assertAlmostEqual(scores[i][j],
                                           p_scores[i][j],
                                           delta=0.001)
Code example #26
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round the epoch size down to the nearest multiple of the global batch
    # size (the batch size summed across all shards)
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)
        else:
            # A single MPI process does not need a distributed rendezvous.
            rendezvous = None

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with the multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
        return opt

    # Define the add_image_input function, which depends on the "train_data"
    # argument. Note that the reader is shared between all GPUs.
    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
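The batch and epoch arithmetic near the top of Train() is the part most often mis-configured in multi-host runs, so here is a worked example with made-up numbers (the 1,281,167 figure is the usual ImageNet-1k training-set size):

# A worked example of the batch/epoch arithmetic above (numbers are made up).
batch_size = 256        # per-host batch size (args.batch_size)
num_gpus = 8            # devices on this host
num_shards = 4          # number of hosts (args.num_shards)
epoch_size = 1281167    # e.g. ImageNet-1k training images (args.epoch_size)

assert batch_size % num_gpus == 0
batch_per_device = batch_size // num_gpus          # 32 images per GPU
global_batch_size = batch_size * num_shards        # 1024 images per iteration
epoch_iters = epoch_size // global_batch_size      # 1251 iterations per epoch
epoch_size = epoch_iters * global_batch_size       # rounded down to 1281024

print(batch_per_device, global_batch_size, epoch_iters, epoch_size)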
Code example #27
File: backend.py  Project: lcskrishna/caffe2
    def _create_slice(cls, init_model, pred_model, n, opset_version):
        op = cls._common_onnx_node_to_caffe2_op(init_model, pred_model, n, opset_version)
        args = {arg.name: arg for arg in op.arg}
        starts_vals = np.array(
            args.pop('starts').ints, dtype=np.int64).tolist()
        ends_vals = np.array(
            [i - 1 if i < 0 else i for i in args.pop('ends').ints],
            dtype=np.int64).tolist()
        if 'axes' in args:
            axes_vals = np.array(
                args.pop('axes').ints, dtype=np.int32).tolist()
        else:
            ndims = len(starts_vals)
            axes_vals = np.array(range(ndims), dtype=np.int32).tolist()

        data, = op.input
        ops = []

        shape_tensor = dummy_name()
        ops.append(core.CreateOperator(
            'Shape',
            [data],
            [shape_tensor]
        ))

        axes_tensor = dummy_name()
        ops.extend([
            core.CreateOperator(
                'GivenTensorIntFill',
                [],
                [axes_tensor],
                shape=[len(axes_vals)],
                values=axes_vals,
            ),
        ])

        starts_vals_tensor = dummy_name()
        starts_tensor = dummy_name()
        casted_starts_tensor = dummy_name()
        ops.extend([
            core.CreateOperator(
                'GivenTensorInt64Fill',
                [],
                [starts_vals_tensor],
                shape=[len(starts_vals)],
                values=starts_vals,
            ),
            core.CreateOperator(
                'ConstantFill',
                [shape_tensor],
                [starts_tensor],
                dtype=caffe2_pb2.TensorProto.INT64,
                value=0,
            ),
            core.CreateOperator(
                'ScatterAssign',
                [starts_tensor, axes_tensor, starts_vals_tensor],
                [starts_tensor],
            ),
            # Slice only accepts starts as int
            core.CreateOperator(
                'Cast',
                [starts_tensor],
                [casted_starts_tensor],
                to=caffe2_pb2.TensorProto.INT32,
            ),
        ])

        ends_vals_tensor = dummy_name()
        ends_tensor = dummy_name()
        casted_ends_tensor = dummy_name()
        ops.extend([
            core.CreateOperator(
                'GivenTensorInt64Fill',
                [],
                [ends_vals_tensor],
                shape=[len(ends_vals)],
                values=ends_vals,
            ),
            core.CreateOperator(
                'ConstantFill',
                [shape_tensor],
                [ends_tensor],
                dtype=caffe2_pb2.TensorProto.INT64,
                value=-1,
            ),
            core.CreateOperator(
                'ScatterAssign',
                [ends_tensor, axes_tensor, ends_vals_tensor],
                [ends_tensor],
            ),
            # Slice only accepts ends as int
            core.CreateOperator(
                'Cast',
                [ends_tensor],
                [casted_ends_tensor],
                to=caffe2_pb2.TensorProto.INT32,
            ),
        ])

        op.input[:] = [data, casted_starts_tensor, casted_ends_tensor]
        del op.arg[:]
        op.arg.extend(args.values())
        ops.append(op)

        return ops
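The chain of Shape, GivenTensor*Fill, ConstantFill, ScatterAssign and Cast ops above exists because ONNX Slice supplies starts/ends only for the listed axes, while the Caffe2 Slice op wants one start and one end per input dimension. Outside the graph, the same expansion can be written in a few lines of numpy; the helper below is only an illustration of that bookkeeping, not part of the converter:

import numpy as np


def expand_slice_args(data_shape, starts, ends, axes=None):
    """Expand per-axis ONNX-style starts/ends to one entry per dimension.

    Dimensions not listed in `axes` are left untouched (start=0, end=-1,
    where -1 means "through the last element" for the Caffe2 Slice op).
    """
    ndim = len(data_shape)
    if axes is None:
        axes = list(range(len(starts)))
    full_starts = np.zeros(ndim, dtype=np.int64)    # like ConstantFill value=0
    full_ends = np.full(ndim, -1, dtype=np.int64)   # like ConstantFill value=-1
    full_starts[axes] = starts                      # like ScatterAssign
    # Negative ONNX ends are shifted by one, mirroring the `i - 1 if i < 0`
    # adjustment in the converter above.
    full_ends[axes] = [e - 1 if e < 0 else e for e in ends]
    return full_starts.astype(np.int32), full_ends.astype(np.int32)


starts, ends = expand_slice_args((8, 16, 32), starts=[2], ends=[-1], axes=[1])
print(starts.tolist(), ends.tolist())  # [0, 2, 0] [-1, -2, -1]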
Code example #28
File: adam_test.py  Project: narain1/pytorch
    def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2,
                                              epsilon, data_strategy, gc, dc):
        param, mom1, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create a 1D row-wise average 2nd moment tensor.
        mom2 = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32))
        )
        mom2 = np.absolute(mom2)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # Verify that the generated indices are unique
        hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2", "output_grad"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER,
                                            beta1, beta2, epsilon, output_grad):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            grad_out = np.copy(grad)

            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \
                    self.ref_row_wise_adam(param[index], mom1[index], mom2[index],
                                           grad[i], LR, ITER,
                                           beta1, beta2, epsilon, output_grad)
            return (param_out, mom1_out, mom2_out, grad_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertDeviceChecks(
            dc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            [0, 1, 2, 3],
            input_device_options=input_device_options)

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            functools.partial(
                ref_row_wise_sparse_output_grad,
                beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True),
            input_device_options=input_device_options)
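self.ref_row_wise_adam is defined elsewhere in this test class. For orientation, the sketch below shows the general shape of a row-wise Adam step: the distinguishing feature is that the second moment is a single scalar per parameter row, updated from the mean of that row's squared gradient. Bias-correction and sign conventions in the actual Caffe2 kernel may differ, so treat this only as a hedged illustration:

import numpy as np


def row_wise_adam_row(w_row, m1_row, m2_scalar, g_row, lr, t,
                      beta1=0.9, beta2=0.999, eps=1e-8):
    """One Adam step for a single parameter row with a shared 2nd moment.

    Illustrative formulation only: the defining property is that
    `m2_scalar` is one number per row, updated from the mean of the
    squared gradient of that row.
    """
    m1_out = beta1 * m1_row + (1.0 - beta1) * g_row
    m2_out = beta2 * m2_scalar + (1.0 - beta2) * np.mean(g_row * g_row)
    correction = np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    step = lr * correction * m1_out / (np.sqrt(m2_out) + eps)
    return w_row + step, m1_out, m2_out


w = np.zeros(4, dtype=np.float32)
m1 = np.zeros(4, dtype=np.float32)
g = np.random.randn(4).astype(np.float32)
w, m1, m2 = row_wise_adam_row(w, m1, 0.0, g, lr=-0.01, t=1)
# lr is negative here because the step above is *added* to the row;
# flip the sign if you prefer the usual "subtract the step" convention.
print(w, m1, m2)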
Code example #29
    def test_sparse_normalize(
        self, inputs, use_max_norm, norm, data_strategy, use_fp16, gc, dc
    ):
        param, grad = inputs
        param += 0.02 * np.sign(param)
        param[param == 0.0] += 0.02

        if use_fp16:
            param = param.astype(np.float16)
            grad = grad.astype(np.float16)

        # Create an indexing array containing values which index into param
        indices = data_strategy.draw(
            hu.tensor(
                dtype=np.int64,
                min_dim=1,
                max_dim=1,
                elements=st.sampled_from(np.arange(param.shape[0])),
            )
        )
        hypothesis.note("indices.shape: %s" % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()), np.sort(indices.flatten()))
        )

        op1 = core.CreateOperator(
            "Float16SparseNormalize" if use_fp16 else "SparseNormalize",
            ["param", "indices"],
            ["param"],
            use_max_norm=use_max_norm,
            norm=norm,
        )

        # Sparsify grad
        grad = grad[indices]

        op2 = core.CreateOperator(
            "Float16SparseNormalize" if use_fp16 else "SparseNormalize",
            ["param", "indices", "grad"],
            ["param"],
            use_max_norm=use_max_norm,
            norm=norm,
        )

        def ref_sparse_normalize(param, indices, grad=None):
            param_out = np.copy(param)
            for _, index in enumerate(indices):
                param_out[index] = self.ref_normalize(param[index], use_max_norm, norm)
            return (param_out,)

        # self.assertDeviceChecks(dc, op, [param, indices], [0])
        self.assertReferenceChecks(
            gc,
            op1,
            [param, indices],
            ref_sparse_normalize,
            threshold=1e-2 if use_fp16 else 1e-4,
        )

        self.assertReferenceChecks(
            gc,
            op2,
            [param, indices, grad],
            ref_sparse_normalize,
            threshold=1e-2 if use_fp16 else 1e-4,
        )
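Similarly, self.ref_normalize is defined elsewhere in the test class. A typical max-norm reference of the kind this test exercises rescales each touched row so its L2 norm does not exceed `norm` when use_max_norm is set, and to exactly `norm` otherwise; the sketch below states that assumption explicitly rather than reproducing the verified kernel:

import numpy as np


def normalize_row(row, use_max_norm, norm, eps=1e-12):
    """Rescale a single row by its L2 norm (illustrative reference only)."""
    l2 = np.linalg.norm(row)
    if use_max_norm:
        scale = min(1.0, norm / (l2 + eps))   # shrink only rows that are too long
    else:
        scale = norm / (l2 + eps)             # project onto the norm sphere
    return row * scale


param = np.random.randn(5, 3).astype(np.float32)
indices = np.array([0, 3], dtype=np.int64)
for idx in indices:
    param[idx] = normalize_row(param[idx], use_max_norm=True, norm=1.0)
print(np.linalg.norm(param, axis=1))  # rows 0 and 3 now have norm <= 1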
Code example #30
File: conv_test.py  Project: vbvg2008/pytorch
    def test_convolution_layout(self, op_type, stride, pad, kernel, dilation,
                                size, input_channels, output_channels,
                                batch_size, use_bias, gc, dc):
        assume(size >= dilation * (kernel - 1) + 1)

        X = np.random.rand(
            batch_size, size, size, input_channels).astype(np.float32) - 0.5
        w = np.random.rand(
            output_channels, kernel, kernel, input_channels
        ).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        Output = collections.namedtuple("Output", ["Y", "engine", "order"])
        outputs = []

        for order in ["NCHW", "NHWC"]:
            engine_list = ['']
            if hiputl.run_in_hip(gc, dc):
                if order == 'NCHW':
                    engine_list.append('MIOPEN')
            else:
                if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')):
                    engine_list.append('CUDNN')

            for engine in engine_list:
                op = core.CreateOperator(
                    op_type,
                    ["X", "w", "b"] if use_bias else ["X", "w"],
                    ["Y"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    order=order,
                    engine=engine,
                    device_option=gc,
                    exhaustive_search=True,
                )
                if order == "NCHW":
                    X_f = X.transpose((0, 3, 1, 2))
                    w_f = w.transpose((0, 3, 1, 2))
                else:
                    X_f = X
                    w_f = w
                self.assertDeviceChecks(
                    dc,
                    op,
                    [X_f, w_f, b] if use_bias else [X_f, w_f],
                    [0])
                self.ws.create_blob("X").feed(X_f, device_option=gc)
                self.ws.create_blob("w").feed(w_f, device_option=gc)
                self.ws.create_blob("b").feed(b, device_option=gc)
                self.ws.run(op)
                outputs.append(Output(
                    Y=self.ws.blobs["Y"].fetch(), engine=engine, order=order))

        def canonical(o):
            if o.order == "NHWC":
                return o.Y.transpose((0, 3, 1, 2))
            else:
                return o.Y

        for o in outputs:
            np.testing.assert_allclose(
                canonical(outputs[0]),
                canonical(o),
                atol=1e-4,
                rtol=1e-4)