Example #1
    def _check_subgraph_exe3(sym, subgraph_backend, op_names):
        """Use the partitioned sym to bind an executor and compare the outputs
        with those of the original executor"""
        out = SymbolHandle()
        check_call(_LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend), mx_uint(len(op_names)),
                                                  c_str_array(op_names), ctypes.byref(out)))

        partitioned_sym = Symbol(out)
        input_names = sym.list_inputs()
        arg_names = sym.list_arguments()
        aux_names = sym.list_auxiliary_states()
        assert partitioned_sym.list_inputs() == input_names
        assert partitioned_sym.list_arguments() == arg_names
        assert partitioned_sym.list_auxiliary_states() == aux_names
        arg_shapes, _, aux_shapes = sym.infer_shape()
        arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
        aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
        exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
        partitioned_exe = partitioned_sym.bind(ctx=mx.current_context(), args=arg_array,
                                               aux_states=aux_array, grad_req='null')
        exe.forward()
        partitioned_exe.forward()
        assert len(exe.outputs) == len(partitioned_exe.outputs)
        for i in range(len(exe.outputs)):
            assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                                np.zeros(shape=(1,)))
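
A minimal driver for the subgraph check above, as a sketch: the toy network, the 'default' backend name, and the op list are illustrative assumptions rather than values from the original test.

    # hypothetical toy network; any symbol with inferable shapes works here
    data = mx.sym.Variable('data', shape=(1, 3, 32, 32))
    conv = mx.sym.Convolution(data=data, kernel=(3, 3), num_filter=8, name='conv')
    bn = mx.sym.BatchNorm(data=conv, name='bn')
    net = mx.sym.Activation(data=bn, act_type='relu', name='relu')
    _check_subgraph_exe3(net, 'default', ['Convolution', 'BatchNorm'])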
    def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool):
        with mx.Context('gpu', 0):
            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
            pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
                                          pool_type=pool_type, global_pool=global_pool, cudnn_off=False)
            arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
            arg_names = pooling_fp32.list_arguments()
            pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
            pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                              shape=data_shape).astype('int32')
            output = pooling_fp32_exe.forward()[0]

            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
            min_data = mx.sym.Variable(name='min_data')
            max_data = mx.sym.Variable(name='max_data')
            quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
                                                                 max_data=max_data, kernel=kernel,
                                                                 pad=pad, stride=stride, pool_type=pool_type,
                                                                 global_pool=global_pool)
            pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
            qarg_names = quantized_pooling.list_arguments()
            pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype('int8')
            quantized_range = 127.0
            pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
            pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
            qoutput, min_range, max_range = pooling_int8_exe.forward()

            if pool_type == 'max':
                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
            elif pool_type == 'avg':  # for avg pooling, fp32 and int8 may be different due to rounding errors
                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
                cond = mx.nd.lesser(2, diff).sum().asscalar()
                assert cond == 0
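
A usage sketch for check_quantized_pooling above; the shapes, kernels, and strides are representative values, not necessarily the ones used in the original test suite.

    check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False)
    check_quantized_pooling((3, 4, 56, 56), (3, 3), 'avg', (0, 0), (2, 2), True)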
Example #3
    def _check_subgraph_exe1(sym, subgraph_backend, op_names):
        """Use the partitioned sym to simple_bind an executor and compare the outputs
        with those of the original executor"""
        out = SymbolHandle()
        check_call(_LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend), mx_uint(len(op_names)),
                                                  c_str_array(op_names), ctypes.byref(out)))

        partitioned_sym = Symbol(out)
        assert partitioned_sym.list_inputs() == sym.list_inputs()
        assert partitioned_sym.list_arguments() == sym.list_arguments()
        assert partitioned_sym.list_auxiliary_states() == sym.list_auxiliary_states()
        exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
        partitioned_exe = partitioned_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
        input_names = sym.list_inputs()
        for name in input_names:
            if name in exe.arg_dict:
                exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)
                partitioned_exe.arg_dict[name][:] = exe.arg_dict[name]
            else:
                assert name in exe.aux_dict
                exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)
                partitioned_exe.aux_dict[name][:] = exe.aux_dict[name]
        exe.forward()
        partitioned_exe.forward()
        assert len(exe.outputs) == len(partitioned_exe.outputs)
        for i in range(len(exe.outputs)):
            assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                                np.zeros(shape=(1,)))
    def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias):
        with mx.Context('gpu', 0):
            # run fp32 conv
            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
            conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride,
                                        no_bias=no_bias, cudnn_off=False, name='conv2d')
            arg_shapes, _, _ = conv2d.infer_shape(data=data_shape)
            arg_names = conv2d.list_arguments()
            conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null')
            conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                           shape=data_shape).astype('int32')
            conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                           shape=arg_shapes[1]).astype('int32')
            if not no_bias:
                conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                               shape=arg_shapes[2]).astype('int32')
            output = conv_exe_fp32.forward()[0]

            # run quantized conv
            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
            qweight = mx.sym.Variable(name='qweight', dtype='int8')
            min_data = mx.sym.Variable(name='min_data')
            max_data = mx.sym.Variable(name='max_data')
            min_weight = mx.sym.Variable(name='min_weight')
            max_weight = mx.sym.Variable(name='max_weight')
            quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data,
                                                             max_data=max_data, min_weight=min_weight,
                                                             max_weight=max_weight, kernel=kernel,
                                                             num_filter=num_filter, pad=pad, stride=stride,
                                                             no_bias=no_bias)
            qarg_names = quantized_conv2d.list_arguments()
            type_dict = None
            if not no_bias:
                type_dict = {qarg_names[2]: 'int8'}
            conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
            conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype('int8')
            conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8')
            quantized_range = 127.0
            if no_bias:
                conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range
                conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range
                conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range
                conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range
            else:
                conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8')
                conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range
                conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range
                conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range
                conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range
                conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range
                conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range
            qoutput, min_range, max_range = conv_exe_int8.forward()

            if no_bias:
                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
            else:
                # with bias added, the accuracy loss should not exceed one
                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
                cond = mx.nd.lesser(2, diff).sum().asscalar()
                assert cond == 0
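
A usage sketch for check_quantized_conv above, exercising both the bias and no-bias paths; the concrete shapes are illustrative assumptions.

    check_quantized_conv((4, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True)
    check_quantized_conv((4, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False)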
    def check_quantize_model(qdtype):
        def check_params(params, qparams, qsym=None):
            if qsym is None:
                assert len(params) == len(qparams)
                for k, v in params.items():
                    assert k in qparams
                    assert same(v.asnumpy(), qparams[k].asnumpy())
            else:
                qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
                assert len(qparams) == len(qparams_ground_truth)
                for k, v in qparams_ground_truth.items():
                    assert k in qparams
                    assert same(v.asnumpy(), qparams[k].asnumpy())

        def check_qsym_calibrated(qsym):
            attrs = qsym.attr_dict()
            for k, v in attrs.items():
                if k.find('requantize_') != -1:
                    assert 'min_calib_range' in v
                    assert 'max_calib_range' in v

        def check_qsym_qdtype(qsym, qdtype):
            attrs = qsym.attr_dict()
            for k, v in attrs.items():
                if k.find('_quantize') != -1:
                    assert 'out_type' in v
                    assert v['out_type'] == qdtype

        sym = get_fp32_sym()
        mod = Module(symbol=sym)
        batch_size = 4
        data_shape = (batch_size, 4, 10, 10)
        label_shape = (batch_size, 10)
        mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
        mod.init_params()
        arg_params, aux_params = mod.get_params()
        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                         arg_params=arg_params,
                                                                         aux_params=aux_params,
                                                                         ctx=mx.current_context(),
                                                                         quantized_dtype=qdtype,
                                                                         calib_mode='none')
        check_params(arg_params, qarg_params, qsym)
        check_params(aux_params, qaux_params)

        calib_data = mx.nd.random.uniform(shape=data_shape)
        calib_data = NDArrayIter(data=calib_data)
        calib_data = DummyIter(calib_data)
        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                         arg_params=arg_params,
                                                                         aux_params=aux_params,
                                                                         ctx=mx.current_context(),
                                                                         quantized_dtype=qdtype,
                                                                         calib_mode='naive',
                                                                         calib_data=calib_data,
                                                                         num_calib_examples=20)
        check_params(arg_params, qarg_params, qsym)
        check_params(aux_params, qaux_params)
        check_qsym_calibrated(qsym)
        check_qsym_qdtype(qsym, qdtype)
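
check_quantize_model takes only the quantized dtype; a sketch of the driver loop (the dtype list is an assumption):

    for qdtype in ['int8', 'uint8']:
        check_quantize_model(qdtype)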
    def check_quantized_fc(data_shape, num_hidden, no_bias, flatten=True):
        with mx.Context('gpu', 0):
            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
            fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
            arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
            arg_names = fc_fp32.list_arguments()
            fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
            fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                         shape=data_shape).astype('int32')
            fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                         shape=arg_shapes[1]).astype('int32')
            if not no_bias:
                fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                             shape=arg_shapes[2]).astype('int32')
            output = fc_fp32_exe.forward()[0]

            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
            fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
                                                               no_bias=no_bias, flatten=flatten)
            qarg_names = fc_int8.list_arguments()
            type_dict = {qarg_names[1]: 'int8'}
            if not no_bias:
                type_dict.update({qarg_names[2]: 'int8'})
            fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
            fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype('int8')
            fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
            quantized_range = 127.0
            if no_bias:
                fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
                fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
                fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
                fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
            else:
                fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
                fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
                fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
                fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
                fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
                fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
                fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
            qoutput, min_range, max_range = fc_int8_exe.forward()

            if no_bias:
                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
            else:
                # with bias added, the accuracy loss should not exceed one
                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
                cond = mx.nd.lesser(2, diff).sum().asscalar()
                assert cond == 0
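
A usage sketch for check_quantized_fc; the shapes and hidden sizes are illustrative.

    check_quantized_fc((32, 512, 2, 2), 100, True)
    check_quantized_fc((32, 111, 2, 2), 100, False)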
 def testSmall(data, repeat1, repeat2):
     # Check that the shuffling is along the first axis.
     # The order of the elements in each subarray must not change.
     # This takes a long time, so `repeat1` needs to be small.
     for i in range(repeat1):
         ret = mx.nd.random.shuffle(data)
         check_first_axis_shuffle(ret)
     # Count the number of each different outcome.
     # The sequence composed of the first elements of the subarrays is enough to discriminate
     # the outcomes as long as the order of the elements in each subarray does not change.
     count = {}
     stride = int(data.size / data.shape[0])
     for i in range(repeat2):
         ret = mx.nd.random.shuffle(data)
         h = str(ret.reshape((ret.size,))[::stride])
         c = count.get(h, 0)
         count[h] = c + 1
     # Check the total number of possible outcomes.
     # If `repeat2` is not large enough, this could fail with high probability.
     assert len(count) == math.factorial(data.shape[0])
     # The outcomes must be uniformly distributed.
     # If `repeat2` is not large enough, this could fail with high probability.
     for p in itertools.permutations(range(0, data.size - stride + 1, stride)):
         err = abs(1. * count[str(mx.nd.array(p))] / repeat2 - 1. / math.factorial(data.shape[0]))
         assert err < 0.01, "The absolute error {} is larger than the tolerance.".format(err)
     # Check symbol interface
     a = mx.sym.Variable('a')
     b = mx.sym.random.shuffle(a)
     c = mx.sym.random.shuffle(data=b, name='c')
     d = mx.sym.sort(c, axis=0)
     assert (d.eval(a=data, ctx=mx.current_context())[0] == data).prod() == 1
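
testSmall relies on a check_first_axis_shuffle helper defined elsewhere in the test; a sketch of how it might be invoked, with repeat counts chosen for illustration (they must be large enough for the uniformity check to be meaningful):

    testSmall(mx.nd.arange(0, 3), 100, 40000)
    testSmall(mx.nd.arange(0, 9).reshape((3, 3)), 100, 40000)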
    def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype, convention='valid'):
        if is_test_for_native_cpu():
            print('skipped testing quantized_pooling for native cpu since it is not supported yet')
            return
        elif qdtype == 'uint8' and is_test_for_gpu():
            print('skipped testing quantized_pooling for gpu uint8 since it is not supported yet')
            return

        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
        pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
                                      pool_type=pool_type, global_pool=global_pool, cudnn_off=False,
                                      pooling_convention=convention)
        arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
        arg_names = pooling_fp32.list_arguments()
        pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
        if qdtype == 'uint8':
            data_low = 0.0
            data_high = 127.0
        else:
            data_low = -127.0
            data_high = 127.0
        pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                            shape=data_shape).astype('int32')
        output = pooling_fp32_exe.forward()[0]

        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
        min_data = mx.sym.Variable(name='min_data')
        max_data = mx.sym.Variable(name='max_data')
        quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
                                                             max_data=max_data, kernel=kernel,
                                                             pad=pad, stride=stride, pool_type=pool_type,
                                                             global_pool=global_pool,
                                                             pooling_convention=convention)
        pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
        qarg_names = quantized_pooling.list_arguments()
        pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
        quantized_range = 127.0
        pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
        pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
        qoutput, min_range, max_range = pooling_int8_exe.forward()

        if pool_type == 'max':
            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
        elif pool_type == 'avg':  # for avg pooling, fp32 and int8 may be different due to rounding errors
            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
            cond = mx.nd.lesser(2, diff).sum().asscalar()
            assert cond == 0
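
This variant also takes the quantized dtype and pooling convention; a sketch of a driver loop over both supported dtypes (the argument values are assumptions):

    for qdtype in ['int8', 'uint8']:
        check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False, qdtype)
        check_quantized_pooling((3, 4, 56, 56), (3, 3), 'avg', (0, 0), (2, 2), True, qdtype, 'full')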
def check_quantize(sym, data_shape, check_conv=True):
  fc = mx.sym.FullyConnected(data=sym, num_hidden=10, flatten=True, name='fc')
  sym = mx.sym.SoftmaxOutput(data=fc, name='softmax')
  sym_sg = sym.get_backend_symbol("MKLDNN")
  label_shape = (data_shape[0], 10)
  mod = Module(symbol=sym)
  mod.bind(for_training=False,
           data_shapes=[('data', data_shape)],
           label_shapes=[('softmax_label', label_shape)])
  mod.init_params(mx.init.Normal(0.5))
  arg_params, aux_params = mod.get_params()

  data = [mx.random.uniform(-1, 1, shape=shape, ctx=mx.current_context()) for _, shape in mod.data_shapes]
  batch = mx.io.DataBatch(data, [])

  mod.forward(batch, is_train=False)
  for output in mod.get_outputs():
      output.wait_to_read()
  ref_out = mod.get_outputs()

  excluded_sym_names = []
  if mx.current_context() == mx.cpu():
    excluded_sym_names += ['fc']

  calib_data = mx.nd.random.uniform(shape=data_shape)
  calib_data = NDArrayIter(data=calib_data)
  calib_data = DummyIter(calib_data)
  calib_layer = lambda name: name.endswith('_output')
  qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg,
                                                                   arg_params=arg_params,
                                                                   aux_params=aux_params,
                                                                   ctx=mx.current_context(),
                                                                   excluded_sym_names=excluded_sym_names,
                                                                   quantized_dtype='uint8',
                                                                   calib_mode='naive',
                                                                   calib_data=calib_data,
                                                                   calib_layer=calib_layer,
                                                                   calib_quantize_op=True,
                                                                   num_calib_examples=5)
  qsym = qsym.get_backend_symbol("MKLDNN_POST_QUANTIZE")
  if check_conv:
    check_qsym_calibrated(qsym)
  quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, label_shape)
  for i in range(len(ref_out)):
    assert_almost_equal(ref_out[i].asnumpy(), quantized_out[i].asnumpy(), atol = 1)
  check_qsym_dummy_forward(qsym, batch, data_shape, label_shape)
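
check_quantize expects a symbol whose output is fed into the FC/softmax head it builds; a minimal sketch of calling it with a toy convolution block (the network and shape are assumptions):

    data = mx.sym.Variable('data')
    conv = mx.sym.Convolution(data=data, kernel=(3, 3), num_filter=16, name='conv')
    relu = mx.sym.Activation(data=conv, act_type='relu', name='relu')
    check_quantize(relu, (4, 3, 32, 32))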
Example #10
def check_qsym_dummy_forward(qsym, batch, data_shape, label_shape):
  mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
  mod.bind(for_training=False,
           data_shapes=[('data', data_shape)],
           label_shapes=[('softmax_label', label_shape)])
  mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
  mod.forward(batch, is_train=False)
  for output in mod.get_outputs():
    output.wait_to_read()
  return mod.get_outputs()
Example #11
def check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, label_shape):
  mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
  mod.bind(for_training=False,
           data_shapes=[('data', data_shape)],
           label_shapes=[('softmax_label', label_shape)])
  mod.set_params(qarg_params, qaux_params)
  mod.forward(batch, is_train=False)
  for output in mod.get_outputs():
    output.wait_to_read()
  return mod.get_outputs()
 def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape):
     mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
     mod.bind(for_training=False,
              data_shapes=[('data', data_shape)],
              label_shapes=[('softmax_label', label_shape)])
     mod.set_params(qarg_params, qaux_params)
     data = [mx.random.uniform(-1.0, 1.0, shape=shape) for _, shape in mod.data_shapes]
     batch = mx.io.DataBatch(data, [])
     mod.forward(batch, is_train=False)
     for output in mod.get_outputs():
         output.wait_to_read()
Example #13
def check_fusion(sym, data_shape, attrs_op):
  sym_sg = sym.get_backend_symbol("MKLDNN")
  assert ''.join(sym_sg.get_internals().list_outputs()).find('sg_mkldnn_conv') != -1
  for k, v in sym_sg.attr_dict().items():
    if k.find('sg_mkldnn_conv') != -1:
      for attr_op in attrs_op:
        assert v[attr_op] == 'true'

  arg_shapes, _, aux_shapes = sym.infer_shape()
  arg_array = [mx.nd.random.uniform(-1, 1, shape=shape) for shape in arg_shapes]
  aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
  exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
  exe.forward()
  os.environ['MXNET_SUBGRAPH_BACKEND'] = 'MKLDNN'
  exe_sg = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
  exe_sg.forward()
  del os.environ['MXNET_SUBGRAPH_BACKEND']
  for i in range(len(exe.outputs)):
    assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(), rtol=1e-3, atol=1e-3)

  # fp32 to uint8
  check_quantize(sym, data_shape)
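
A sketch of driving check_fusion with a conv + BN pattern; the 'with_bn' attribute name is an assumption about what the fused sg_mkldnn_conv node exposes and may differ between MXNet versions:

    data = mx.sym.Variable('data', shape=(4, 3, 32, 32))
    conv = mx.sym.Convolution(data=data, kernel=(3, 3), num_filter=16, name='conv')
    bn = mx.sym.BatchNorm(data=conv, name='bn')
    check_fusion(bn, (4, 3, 32, 32), ['with_bn'])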
 def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape,
                        label_shape):
     mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
     mod.bind(for_training=False,
              data_shapes=[('data', data_shape)],
              label_shapes=[('softmax_label', label_shape)])
     mod.set_params(qarg_params, qaux_params)
     data = [
         mx.random.uniform(-1.0, 1.0, shape=shape)
         for _, shape in mod.data_shapes
     ]
     batch = mx.io.DataBatch(data, [])
     mod.forward(batch, is_train=False)
     for output in mod.get_outputs():
         output.wait_to_read()
def test_sdml_loss():

    N = 5  # number of samples
    DIM = 10  # Dimensionality
    EPOCHS = 20

    # Generate randomized data and 'positive' samples
    data = mx.random.uniform(-1, 1, shape=(N, DIM))
    pos = data + mx.random.uniform(-0.1, 0.1,
                                   shape=(N, DIM))  # correlated paired data
    data_iter = mx.io.NDArrayIter({'data': data, 'pos': pos}, batch_size=N)

    # Init model and trainer
    sdml_loss = gluon.loss.SDMLLoss()
    model = gluon.nn.Dense(DIM, activation='tanh')  # Simple NN encoder
    model.initialize(mx.init.Xavier(), ctx=mx.current_context())
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': 0.1})

    for i in range(EPOCHS):  # Training loop
        data_iter.reset()
        for iter_batch in data_iter:
            batch = [
                datum.as_in_context(mx.current_context())
                for datum in iter_batch.data
            ]
            with autograd.record():
                data, pos = batch
                z_data, z_pos = model(data), model(pos)
                loss = sdml_loss(z_data, z_pos)
                loss.backward()
            trainer.step(1)

    # After training, the Euclidean distance between aligned pairs should be lower than between non-aligned pairs
    avg_loss = loss.sum() / len(loss)
    assert (avg_loss < 0.05)
Example #16
 def finish_update(self, optimizer):
     if self._optimizer is None:
         self._optimizer, self._trainer = self._create_optimizer(optimizer)
     if getattr(optimizer, "grad_clip", None):
         ctx = mx.current_context()
         grads = [
             i.grad(ctx) for i in self._model.collect_params().values()
             if i._grad is not None
         ]
         mxnet.gluon.utils.clip_global_norm(grads, optimizer.grad_clip)
     if self._trainer:
         self._trainer.step(1)
     for param in self._model.collect_params().values():
         param.zero_grad()
     self._update_mxnet_averages(optimizer)
def check_subgraph_exe1(sym, subgraph_backend, op_names):
    """Use the partitioned sym to simple_bind an executor and compare the outputs
    with those of the original executor"""
    out = SymbolHandle()
    check_call(
        _LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend),
                                      mx_uint(len(op_names)),
                                      c_str_array(op_names),
                                      ctypes.byref(out)))

    partitioned_sym = Symbol(out)
    assert partitioned_sym.list_inputs() == sym.list_inputs()
    assert partitioned_sym.list_arguments() == sym.list_arguments()
    assert partitioned_sym.list_auxiliary_states(
    ) == sym.list_auxiliary_states()
    exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
    partitioned_exe = partitioned_sym.simple_bind(ctx=mx.current_context(),
                                                  grad_req='null')
    input_names = sym.list_inputs()
    for name in input_names:
        if name in exe.arg_dict:
            exe.arg_dict[name][:] = mx.nd.random.uniform(
                shape=exe.arg_dict[name].shape)
            partitioned_exe.arg_dict[name][:] = exe.arg_dict[name]
        else:
            assert name in exe.aux_dict
            exe.aux_dict[name][:] = mx.nd.random.uniform(
                shape=exe.aux_dict[name].shape)
            partitioned_exe.aux_dict[name][:] = exe.aux_dict[name]
    exe.forward()
    partitioned_exe.forward()
    assert len(exe.outputs) == len(partitioned_exe.outputs)
    for i in range(len(exe.outputs)):
        assert_almost_equal((exe.outputs[i] -
                             partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                            np.zeros(shape=(1, )))
Example #18
def test_subgraph_exe3(sym, subgraph_backend, op_names):
    """Use the partitioned sym to bind an executor and compare the outputs
    with those of the original executor"""
    sym, _, _ = sym
    out = SymbolHandle()
    check_call(
        _LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend),
                                      mx_uint(len(op_names)),
                                      c_str_array(op_names),
                                      ctypes.byref(out)))

    partitioned_sym = Symbol(out)
    input_names = sym.list_inputs()
    arg_names = sym.list_arguments()
    aux_names = sym.list_auxiliary_states()
    assert partitioned_sym.list_inputs() == input_names
    assert partitioned_sym.list_arguments() == arg_names
    assert partitioned_sym.list_auxiliary_states() == aux_names
    arg_shapes, _, aux_shapes = sym.infer_shape()
    arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
    aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
    exe = sym._bind(ctx=mx.current_context(),
                    args=arg_array,
                    aux_states=aux_array,
                    grad_req='null')
    partitioned_exe = partitioned_sym._bind(ctx=mx.current_context(),
                                            args=arg_array,
                                            aux_states=aux_array,
                                            grad_req='null')
    exe.forward()
    partitioned_exe.forward()
    assert len(exe.outputs) == len(partitioned_exe.outputs)
    for i in range(len(exe.outputs)):
        assert_almost_equal((exe.outputs[i] -
                             partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                            np.zeros(shape=(1, )))
 def get_executor(sym,
                  subgraph_backend=None,
                  op_names=None,
                  original_exec=None):
     exe = sym._simple_bind(ctx=mx.current_context(), grad_req='null')
     input_names = sym.list_inputs()
     for name in input_names:
         if name in exe.arg_dict:
             exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)\
                 if original_exec is None else original_exec.arg_dict[name]
         else:
             assert name in exe.aux_dict
             exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)\
                 if original_exec is None else original_exec.aux_dict[name]
     exe.forward()
     return exe
Example #20
def test_gnmt_encoder_decoder():
    ctx = mx.current_context()
    num_hidden = 8
    encoder = GNMTEncoder(cell_type="lstm", num_layers=3, num_bi_layers=1, hidden_size=num_hidden,
                          dropout=0.0, use_residual=True, prefix='gnmt_encoder_')
    encoder.initialize(ctx=ctx)
    encoder.hybridize()
    for output_attention in [True, False]:
        for use_residual in [True, False]:
            decoder = GNMTDecoder(cell_type="lstm", num_layers=3, hidden_size=num_hidden, dropout=0.0,
                                  output_attention=output_attention, use_residual=use_residual, prefix='gnmt_decoder_')
            decoder.initialize(ctx=ctx)
            decoder.hybridize()
            for batch_size in [4]:
                for src_seq_length, tgt_seq_length in [(5, 10), (10, 5)]:
                    src_seq_nd = mx.nd.random.normal(0, 1, shape=(batch_size, src_seq_length, 4), ctx=ctx)
                    tgt_seq_nd = mx.nd.random.normal(0, 1, shape=(batch_size, tgt_seq_length, 4), ctx=ctx)
                    src_valid_length_nd = mx.nd.array(np.random.randint(1, src_seq_length, size=(batch_size,)), ctx=ctx)
                    tgt_valid_length_nd = mx.nd.array(np.random.randint(1, tgt_seq_length, size=(batch_size,)), ctx=ctx)
                    src_valid_length_npy = src_valid_length_nd.asnumpy()
                    tgt_valid_length_npy = tgt_valid_length_nd.asnumpy()
                    encoder_outputs, _ = encoder(src_seq_nd, valid_length=src_valid_length_nd)
                    decoder_states = decoder.init_state_from_encoder(encoder_outputs, src_valid_length_nd)

                    # Test multi-step forwarding
                    output, new_states, additional_outputs = decoder.decode_seq(tgt_seq_nd,
                                                                                decoder_states,
                                                                                tgt_valid_length_nd)
                    assert(output.shape == (batch_size, tgt_seq_length, num_hidden))
                    output_npy = output.asnumpy()
                    for i in range(batch_size):
                        tgt_v_len = int(tgt_valid_length_npy[i])
                        if tgt_v_len < tgt_seq_length - 1:
                            assert((output_npy[i, tgt_v_len:, :] == 0).all())
                    if output_attention:
                        assert(len(additional_outputs) == 1)
                        attention_out = additional_outputs[0].asnumpy()
                        assert(attention_out.shape == (batch_size, tgt_seq_length, src_seq_length))
                        for i in range(batch_size):
                            mem_v_len = int(src_valid_length_npy[i])
                            if mem_v_len < src_seq_length - 1:
                                assert((attention_out[i, :, mem_v_len:] == 0).all())
                            if mem_v_len > 0:
                                assert_almost_equal(attention_out[i, :, :].sum(axis=-1),
                                                    np.ones(attention_out.shape[1]))
                    else:
                        assert(len(additional_outputs) == 0)
Example #21
def test_quantize_whole_model_with_forward(qdtype):
    batch_size = 4
    data_shape = (batch_size, 4, 10, 10)
    data = mx.sym.Variable('data')
    conv0 = mx.sym.Convolution(data,
                               kernel=(1, 1),
                               num_filter=16,
                               name='conv0')
    sym = mx.sym.Convolution(conv0, kernel=(1, 1), num_filter=16, name='conv1')

    sym_block = mx.gluon.SymbolBlock(outputs=sym, inputs=data)
    initialize_block_params(sym_block, mx.init.Normal(0.5))

    in_data = mx.random.uniform(0.0 if qdtype == 'uint8' else -1.0,
                                1.0,
                                shape=data_shape)
    ref_out = sym_block(in_data)

    excluded_layers = []

    calib_data = mx.nd.random.uniform(0.0 if qdtype == 'uint8' else -1.0,
                                      1.0,
                                      shape=data_shape)
    calib_data = mx.gluon.data.DataLoader(calib_data, batch_size=batch_size)
    qsym = mx.contrib.quantization.quantize_net(sym_block,
                                                ctx=mx.current_context(),
                                                exclude_layers=excluded_layers,
                                                quantized_dtype=qdtype,
                                                calib_mode='naive',
                                                calib_data=calib_data,
                                                num_calib_batches=1,
                                                quantize_mode='full')

    outputs = qsym(in_data)
    for output in outputs:
        output.wait_to_read()

    for i in range(len(ref_out)):
        min_range = mx.nd.min(ref_out[i]).asscalar()
        max_range = mx.nd.max(ref_out[i]).asscalar()
        atol = 0.1 * max(abs(min_range), abs(max_range))
        assert_almost_equal_with_err(outputs[i].asnumpy(),
                                     ref_out[i].asnumpy(),
                                     rtol=0.1,
                                     atol=atol,
                                     etol=0.2)
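
In the original suite this test is parametrized over the quantized dtype; a direct loop works as a sketch:

    for qdtype in ['int8', 'uint8']:
        test_quantize_whole_model_with_forward(qdtype)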
Example #22
def test_array_creation():
    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None]
    objects = [[], (), [[1, 2], [3, 4]],
               _np.random.uniform(size=rand_shape_nd(3)),
               _np.random.uniform(size=(3, 0, 4))]
    for dtype in dtypes:
        for src in objects:
            mx_arr = np.array(src, dtype=dtype)
            assert mx_arr.context == mx.current_context()
            if isinstance(src, mx.nd.NDArray):
                np_arr = _np.array(
                    src.asnumpy(),
                    dtype=dtype if dtype is not None else _np.float32)
            else:
                np_arr = _np.array(
                    src, dtype=dtype if dtype is not None else _np.float32)
            assert mx_arr.dtype == np_arr.dtype
            assert same(mx_arr.asnumpy(), np_arr)
def check_fusion(net_original, data_shape, attrs_dict, check_fp32_fusion=True, check_quantization=True,
                 out_types=['uint8', 'int8', 'auto'], dedup_subgraph=True):
  net_original.initialize()
  net_original.hybridize(static_alloc=False, static_shape=False)
  data = mx.random.uniform(shape=data_shape, dtype='float32', ctx=mx.current_context())
  net_original(data)
  net_fusion = copy.copy(net_original)
  sym, params = net_original.export(None)

  if check_fp32_fusion:
    data_min = -1.0
    data_max = 1.0
    if ''.join(sym.get_internals().list_outputs()).find('sqrt') != -1:
      check_quantization = False
      data_min = 0

    sym_sg = sym.optimize_for(SG_PASS_NAME, dedup_subgraph=dedup_subgraph, skip_infer=True)
    for name, attrs in attrs_dict.items():
      if name in config:
        op_name = config[name][OP_NAME]
      else:
        op_name = name
      assert ''.join(sym_sg.get_internals().list_outputs()).find(op_name) != -1
      if len(attrs):
          found = False
          for k, v in sym_sg.attr_dict().items():
            if k.find(op_name) != -1:
              found = True
              for attr_name, attr_value in attrs.items():
                assert v[attr_name].lower() == attr_value.lower()
          assert found

    data = mx.nd.random.uniform(shape=data_shape, low=data_min, high=data_max)
    out_unfused = net_original(data)

    net_fusion.optimize_for(data, backend=SG_PASS_NAME)
    out_fused = net_fusion(data)

    assert_almost_equal(out_unfused.asnumpy(), out_fused.asnumpy(), rtol=1e-3, atol=1e-1)

  if check_quantization:
    # fp32 to int8
    for out_type in out_types:
      check_quantize(net_original, data_shape, out_type, name=name)
Example #24
def conv3d(x,
           kernel,
           strides=(1, 1, 1),
           border_mode='valid',
           dim_ordering='default',
           volume_shape=None,
           filter_shape=None):
    '''3D convolution.

    # Arguments
        kernel: kernel tensor.
        strides: strides tuple.
        border_mode: string, "same" or "valid".
        dim_ordering: "tf" or "th".
            Whether to use Theano or TensorFlow dimension ordering
            for inputs/kernels/outputs.
    '''
    if dim_ordering == 'default':
        dim_ordering = image_dim_ordering()
    if dim_ordering not in {'th', 'tf'}:
        raise ValueError('Unknown dim_ordering ' + str(dim_ordering))

    x = _preprocess_conv3d_input(x, dim_ordering)
    kernel = _preprocess_conv3d_kernel(kernel, dim_ordering)
    padding = _preprocess_border_mode(border_mode)

    data = mx.sym.Variable(name="data")
    shp = (kernel.shape[2], kernel.shape[3], kernel.shape[4])
    conv = mx.sym.Convolution(data=data,
                              kernel=shp,
                              no_bias=True,
                              num_filter=kernel.shape[0],
                              stride=strides,
                              name="conv")
    executor = conv.bind(ctx=mx.current_context(),
                         args={
                             'data': x,
                             'conv_weight': kernel
                         })
    executor.forward()
    y = executor.outputs[0]

    return _postprocess_conv3d_output(y, dim_ordering)
Example #25
def test_np_get_dtype():
    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, _np.bool, _np.bool_,
              'int8', 'int32', 'float16', 'float32', 'float64', 'bool', None]
    objects = [
        [],
        (),
        [[1, 2], [3, 4]],
        _np.random.uniform(size=rand_shape_nd(3)),
        _np.random.uniform(size=(3, 0, 4))
    ]
    for dtype in dtypes:
        for src in objects:
            mx_arr = np.array(src, dtype=dtype)
            assert mx_arr.ctx == mx.current_context()
            if isinstance(src, mx.nd.NDArray):
                np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32)
            else:
                np_arr = _np.array(src, dtype=dtype if dtype is not None else _np.float32)
            assert type(mx_arr.dtype) == type(np_arr.dtype)
Example #26
 def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
     if subgraph_backend is not None:
         os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
         check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend), mx_uint(len(op_names)),
                                                      c_str_array(op_names)))
     exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
     input_names = sym.list_inputs()
     for name in input_names:
         if name in exe.arg_dict:
             exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)\
                 if original_exec is None else original_exec.arg_dict[name]
         else:
             assert name in exe.aux_dict
             exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)\
                 if original_exec is None else original_exec.aux_dict[name]
     exe.forward()
     if subgraph_backend is not None:
         check_call(_LIB.MXRemoveSubgraphPropertyOpNames(c_str(subgraph_backend)))
         del os.environ['MXNET_SUBGRAPH_BACKEND']
     return exe
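
A sketch of the usual pattern around this helper for some symbol sym with inferable shapes: build a reference executor without a backend, then a partitioned one seeded from it, and compare outputs (the backend name and op list are assumptions):

    original_exec = get_executor(sym)
    partitioned_exec = get_executor(sym, 'default', ['Convolution', 'BatchNorm'], original_exec)
    for out, out_sg in zip(original_exec.outputs, partitioned_exec.outputs):
        assert_almost_equal(out.asnumpy(), out_sg.asnumpy())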
Example #27
    def check_amp_convert_bucketing_module():
        model = train_model(context=mx.current_context())
        result_model = amp.convert_bucketing_module(model)
        val_sent = []
        batch_size = 128
        invalid_label = -1
        num_sentence = 1000
        buckets = [5, 10, 20, 30, 40]
        len_vocab = 50

        for _ in range(num_sentence):
            len_sentence = randint(6,
                                   max(buckets) -
                                   1)  # leave the last two buckets empty
            val_sentence = []
            for _ in range(len_sentence):
                val_sentence.append(randint(1, len_vocab))
            val_sent.append(val_sentence)

        data_val = mx.rnn.BucketSentenceIter(val_sent,
                                             batch_size,
                                             buckets=buckets,
                                             invalid_label=invalid_label)
        result_model.bind(data_val.provide_data,
                          data_val.provide_label,
                          for_training=False)
        result_model.score(data_val,
                           mx.metric.Perplexity(invalid_label),
                           batch_end_callback=mx.callback.Speedometer(
                               batch_size, 1))

        # AMP conversion with cast_optional_params set to true
        result_model = amp.convert_bucketing_module(model,
                                                    cast_optional_params=True)
        result_model.bind(data_val.provide_data,
                          data_val.provide_label,
                          for_training=False)
        result_model.score(data_val,
                           mx.metric.Perplexity(invalid_label),
                           batch_end_callback=mx.callback.Speedometer(
                               batch_size, 1))
Example #28
 def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
     if subgraph_backend is not None:
         os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
         check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend), mx_uint(len(op_names)),
                                                      c_str_array(op_names)))
     arg_shapes, _, aux_shapes = sym.infer_shape()
     if subgraph_backend is None:
         arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
         aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
     else:
         arg_array = None
         aux_array = None
     exe = sym.bind(ctx=mx.current_context(),
                    args=arg_array if subgraph_backend is None else original_exec.arg_arrays,
                    aux_states=aux_array if subgraph_backend is None else original_exec.aux_arrays,
                    grad_req='null')
     exe.forward()
     if subgraph_backend is not None:
         check_call(_LIB.MXRemoveSubgraphPropertyOpNames(c_str(subgraph_backend)))
         del os.environ['MXNET_SUBGRAPH_BACKEND']
     return exe
Example #29
    def test_conv_model_quantization(self):
        """
        Use a Conv model to test KL calibration and a user-specific evaluate function.
        """
        for shape in [
            (500, 3, 224, 224),
        ]:
            arg_shapes, _, _ = self.conv_model.infer_shape(data=shape)

            mod = mx.mod.Module(symbol=self.conv_model,
                                context=mx.current_context())
            mod.bind(for_training=False, data_shapes=[('data', arg_shapes[0])])
            mod.init_params()

            arg_params, aux_params = mod.get_params()
            data = mx.nd.random.uniform(low=self.data_low,
                                        high=self.data_high,
                                        shape=shape).astype('float32')
            calib_data = mx.io.NDArrayIter(data=data, batch_size=shape[0])

            fp32_model = (self.conv_model, arg_params, aux_params)
            qmodel = self.quantizer_2(fp32_model, q_dataloader=calib_data, \
                                      eval_dataloader=calib_data, eval_func=eval_func)
            # test inspected_tensor
            inspect_tensor = self.quantizer_2.strategy.adaptor.inspect_tensor
            inspected_tensor = inspect_tensor(
                fp32_model,
                calib_data,
                op_list=[('sg_mkldnn_conv_bn_act_0_output', 'CONV'),
                         ('data', 'input')],
                iteration_list=[0, 2, 4])
            inspected_qtensor = inspect_tensor(
                qmodel,
                calib_data,
                op_list=[('quantized_sg_mkldnn_conv_bn_act_0_output', 'CONV')],
                iteration_list=[0])

            self.assertNotEqual(len(inspected_tensor), 0)
            self.assertNotEqual(len(inspected_qtensor), 0)
            self.assertIsInstance(qmodel[0], mx.symbol.Symbol)
Example #30
    def check_quantize_whole_model(out_type):
        batch_size = 4
        data_shape = (batch_size, 4, 10, 10)
        data = mx.sym.Variable('data')
        conv0 = mx.sym.Convolution(data,
                                   kernel=(1, 1),
                                   num_filter=16,
                                   name='conv0')
        sym = mx.sym.Convolution(conv0,
                                 kernel=(1, 1),
                                 num_filter=16,
                                 name='conv1')
        sym_sg = sym.get_backend_symbol('MKLDNN_QUANTIZE')
        mod = Module(symbol=sym, label_names=None)
        mod.bind(for_training=False, data_shapes=[('data', data_shape)])

        mod.init_params(mx.init.Normal(0.5))
        arg_params, aux_params = mod.get_params()

        excluded_sym_names = []

        calib_data = mx.nd.random.uniform(shape=data_shape)
        calib_data = mx.io.NDArrayIter(data=calib_data)
        calib_data = DummyIter(calib_data)
        calib_layer = lambda name: name.endswith('_output')
        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(
            sym=sym_sg,
            arg_params=arg_params,
            aux_params=aux_params,
            ctx=mx.current_context(),
            excluded_sym_names=excluded_sym_names,
            quantized_dtype=out_type,
            calib_mode='naive',
            calib_data=calib_data,
            calib_layer=calib_layer,
            label_names=None,
            num_calib_examples=1)
        qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')
        check_qsym_forward(qsym, qarg_params, qaux_params, data_shape)
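
A sketch of the driver loop for check_quantize_whole_model; the output-type list is an assumption:

    for out_type in ['uint8', 'int8', 'auto']:
        check_quantize_whole_model(out_type)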
def test_gnmt_encoder():
    ctx = mx.current_context()
    for cell_type in ["lstm", "gru", "relu_rnn", "tanh_rnn"]:
        for num_layers, num_bi_layers in [(2, 1), (3, 0)]:
            for use_residual in [False, True]:
                encoder = GNMTEncoder(cell_type=cell_type,
                                      num_layers=num_layers,
                                      num_bi_layers=num_bi_layers,
                                      hidden_size=8,
                                      dropout=0.0,
                                      use_residual=use_residual,
                                      prefix='gnmt_encoder_')
                encoder.initialize(ctx=ctx)
                encoder.hybridize()
                for batch_size in [4]:
                    for seq_length in [5, 10]:
                        inputs_nd = mx.nd.random.normal(0,
                                                        1,
                                                        shape=(batch_size,
                                                               seq_length, 4),
                                                        ctx=ctx)
                        valid_length_nd = mx.nd.array(np.random.randint(
                            1, seq_length, size=(batch_size, )),
                                                      ctx=ctx)
                        encoder_outputs, _ = encoder(
                            inputs_nd, valid_length=valid_length_nd)
                        valid_length_npy = valid_length_nd.asnumpy()
                        rnn_output = encoder_outputs[0].asnumpy()
                        for i in range(batch_size):
                            if valid_length_npy[i] < seq_length - 1:
                                padded_out = rnn_output[
                                    i, int(valid_length_npy[i]):, :]
                                assert_almost_equal(padded_out,
                                                    np.zeros_like(padded_out),
                                                    1E-6, 1E-6)
                        assert (encoder_outputs[0].shape == (batch_size,
                                                             seq_length, 8))
                        assert (len(encoder_outputs[1]) == num_layers)
 def get_executor(sym,
                  subgraph_backend=None,
                  op_names=None,
                  original_exec=None):
     arg_shapes, _, aux_shapes = sym.infer_shape()
     if subgraph_backend is None:
         arg_array = [
             mx.nd.random.uniform(shape=shape) for shape in arg_shapes
         ]
         aux_array = [
             mx.nd.random.uniform(shape=shape) for shape in aux_shapes
         ]
     else:
         arg_array = None
         aux_array = None
     exe = sym._bind(ctx=mx.current_context(),
                     args=arg_array if subgraph_backend is None else
                     original_exec.arg_arrays,
                     aux_states=aux_array if subgraph_backend is None else
                     original_exec.aux_arrays,
                     grad_req='null')
     exe.forward()
     return exe
Example #33
def test_np_array_creation():
    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, _np.bool, _np.bool_,
              'int8', 'int32', 'float16', 'float32', 'float64', 'bool', None]
    objects = [
        [],
        (),
        [[1, 2], [3, 4]],
        _np.random.randint(-10, 10, size=rand_shape_nd(3)),
        _np.random.uniform(size=rand_shape_nd(3)),
        _np.random.uniform(size=(3, 0, 4))
    ]
    for dtype in dtypes:
        for src in objects:
            mx_arr = np.array(src, dtype=dtype)
            assert mx_arr.ctx == mx.current_context()
            if dtype is None:
                dtype = src.dtype if isinstance(src, _np.ndarray) else _np.float32
            if isinstance(src, mx.nd.NDArray):
                np_arr = _np.array(src.asnumpy(), dtype=dtype)
            else:
                np_arr = _np.array(src, dtype=dtype)
            assert mx_arr.dtype == np_arr.dtype
            assert same(mx_arr.asnumpy(), np_arr)
Example #34
def test_mxnet_module_wrapper(data_frame):
    from datawig.imputer import _MXNetModule
    import mxnet as mx
    from datawig.iterators import ImputerIterDf

    feature_col, label_col = "feature", "label"
    df = data_frame(n_samples=100,
                    feature_col=feature_col,
                    label_col=label_col)
    label_encoders = [CategoricalEncoder(label_col)]
    data_encoders = [BowEncoder(feature_col)]
    data_featurizers = [BowFeaturizer(feature_col, vocab_size=100)]
    iter_train = ImputerIterDf(df, data_encoders, label_encoders)

    mod = _MXNetModule(mx.current_context(),
                       label_encoders,
                       data_featurizers,
                       final_fc_hidden_units=[])(iter_train)

    assert mod._label_names == [label_col]
    assert mod.data_names == [feature_col]
    # weights and biases
    assert len(mod._arg_params) == 2
Example #35
def import_symb_block(num_inputs: int,
                      model_dir: Path,
                      model_name: str,
                      epoch: int = 0) -> mx.gluon.SymbolBlock:
    """
    Deserializes a hybridized Gluon `HybridBlock` as a `SymbolBlock`.

    Parameters
    ----------
    num_inputs
        The number of inputs of the serialized block.
    model_dir
        The path where the model is saved.
    model_name
        The name identifying the model.
    epoch
        The epoch number, which together with the `model_name` identifies the
        model parameters.

    Returns
    -------
    mx.gluon.SymbolBlock
        The deserialized block.
    """
    if num_inputs == 1:
        input_names = ['data']
    else:
        input_names = [f'data{i}' for i in range(num_inputs)]

    # FIXME: mx.gluon.SymbolBlock cannot infer float_type and uses default np.float32
    # FIXME: https://github.com/apache/incubator-mxnet/issues/11849
    return mx.gluon.SymbolBlock.imports(
        symbol_file=str(model_dir / f'{model_name}-symbol.json'),
        input_names=input_names,
        param_file=str(model_dir / f'{model_name}-{epoch:04}.params'),
        ctx=mx.current_context(),
    )
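A usage sketch for import_symb_block (the directory, block, and file names below are illustrative assumptions, not part of the original): export a hybridized Gluon block once, then reload it as a SymbolBlock.

from pathlib import Path

import mxnet as mx
from mxnet.gluon import nn

model_dir = Path('/tmp/symb_block_demo')       # hypothetical location
model_dir.mkdir(parents=True, exist_ok=True)

net = nn.HybridSequential()
net.add(nn.Dense(16, activation='relu'), nn.Dense(1))
net.initialize()
net.hybridize()
net(mx.nd.ones((2, 8)))                        # forward once so the graph gets traced
net.export(str(model_dir / 'demo'), epoch=0)   # writes demo-symbol.json / demo-0000.params

restored = import_symb_block(num_inputs=1, model_dir=model_dir, model_name='demo')
out = restored(mx.nd.ones((2, 8)))             # same outputs as the original block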
Example #36
def test_transformer_encoder():
    ctx = mx.current_context()
    for num_layers in range(1, 3):
        for output_attention in [True, False]:
            for use_residual in [False, True]:
                encoder = TransformerEncoder(num_layers=num_layers, max_length=10,
                                             units=16, hidden_size=32, num_heads=8,
                                             dropout=0.0, use_residual=use_residual,
                                             output_attention=output_attention, prefix='transformer_encoder_')
                encoder.initialize(ctx=ctx)
                encoder.hybridize()
                for batch_size in [4]:
                    for seq_length in [5, 10]:
                        inputs_nd = mx.nd.random.normal(0, 1, shape=(batch_size, seq_length, 16), ctx=ctx)
                        valid_length_nd = mx.nd.array(np.random.randint(1, seq_length,
                                                                        size=(batch_size,)), ctx=ctx)
                        encoder_outputs, additional_outputs = encoder(inputs_nd, valid_length=valid_length_nd)
                        valid_length_npy = valid_length_nd.asnumpy()
                        encoder_outputs = encoder_outputs.asnumpy()
                        for i in range(batch_size):
                            if valid_length_npy[i] < seq_length - 1:
                                padded_out = encoder_outputs[i, int(valid_length_npy[i]):, :]
                                assert_almost_equal(padded_out, np.zeros_like(padded_out), 1E-6, 1E-6)
                        assert(encoder_outputs.shape == (batch_size, seq_length, 16))
                        if output_attention:
                            assert(len(additional_outputs) == num_layers)
                            attention_out = additional_outputs[0][0].asnumpy()
                            assert(attention_out.shape == (batch_size, 8, seq_length, seq_length))
                            for i in range(batch_size):
                                mem_v_len = int(valid_length_npy[i])
                                if mem_v_len < seq_length - 1:
                                    assert((attention_out[i, :, :, mem_v_len:] == 0).all())
                                if mem_v_len > 0:
                                    assert_almost_equal(attention_out[i, :, :, :].sum(axis=-1),
                                                      np.ones(attention_out.shape[1:3]))
                        else:
                            assert(len(additional_outputs) == 0)
Example #37
def test_bf16_offline_casting():
  class TestNet(nn.HybridBlock):
    def __init__(self):
      super().__init__()
      self.lp16_op1 = nn.Conv2D(4, 3)
      self.lp16_op2 = nn.Conv2DTranspose(4, 3)
      self.fp32_op = nn.Dense(4)

    def forward(self, x):
      x = self.lp16_op1(x)
      x = self.lp16_op2(x)
      x = x.reshape(x.shape[0], -1)
      x = self.fp32_op(x)
      return x

  net = TestNet()
  net.initialize()
  data_example = mx.np.random.uniform(-1, 1, (4, 3, 16, 16))
  lp_net = amp.convert_hybrid_block(net, data_example, target_dtype=bfloat16,
                                    target_dtype_ops=['Convolution'], fp32_ops=['FullyConnected'],
                                    cast_params_offline=True, device=mx.current_context())
  lp_net(data_example)
  for name, data in lp_net.collect_params().items():
    assert data.dtype == (np.float32 if 'fp32_op' in name else bfloat16)
Example #38
    def finish_update(self, optimizer: Optimizer):
        params = []
        grads = []
        shapes = []
        ctx = mx.current_context()
        for key, value in self._model.collect_params().items():
            grad = cast(FloatsXd, mxnet2xp(value.grad(ctx)))
            param = cast(FloatsXd, mxnet2xp(value.data(ctx)))
            params.append(param.ravel())
            grads.append(grad.ravel())
            shapes.append((param.size, param.shape))
        if not params:
            return
        xp = get_array_module(params[0])
        flat_params, flat_grads = optimizer(
            (self.id, "mxnet-shim"),
            xp.concatenate(params),
            xp.concatenate(grads),
        )
        start = 0
        for key, value in self._model.collect_params().items():
            size, shape = shapes.pop(0)
            param = flat_params[start:start + size].reshape(shape)
            value.set_data(xp2mxnet(param))
            value.zero_grad()
            start += size
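The flatten, update, scatter-back pattern in finish_update can be illustrated in isolation. The sketch below is a NumPy-only toy (plain SGD standing in for the thinc Optimizer), not the shim's actual API.

import numpy as np

# Toy parameters and gradients standing in for the MXNet parameter dict.
params = {'w': np.ones((2, 3)), 'b': np.zeros((3,))}
grads = {'w': np.full((2, 3), 0.1), 'b': np.full((3,), 0.5)}

# Flatten everything into single 1-D buffers, remembering each shape.
shapes = [(v.size, v.shape) for v in params.values()]
flat_params = np.concatenate([v.ravel() for v in params.values()])
flat_grads = np.concatenate([v.ravel() for v in grads.values()])

# One optimizer step on the flat buffers (plain SGD as a stand-in).
flat_params -= 0.01 * flat_grads

# Scatter the updated values back into their original shapes.
start = 0
for key, (size, shape) in zip(params, shapes):
    params[key] = flat_params[start:start + size].reshape(shape)
    start += size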
Example #39
def check_amp_fuse(net, data_example, expected_sym=None, quantized_nodes=[], rtol=0.05):
  net.hybridize()
  out_ref = net(*data_example)

  net.optimize_for(data_example, backend=SG_PASS_NAME)  # amp pass works only on oneDNN nodes
  lp_net = amp.convert_hybrid_block(net, data_example, target_dtype=AMP_DTYPE,
                                    excluded_sym_names=quantized_nodes, cast_params_offline=True,
                                    device=mx.current_context())
  lp_net.optimize_for(data_example, backend=AMP_SG_PASS_NAME)
  out_lp_net = lp_net(*data_example)

  # check outputs
  out_ref = [out_ref] if not isinstance(out_ref, list) else out_ref
  out_lp_net = [out_lp_net] if not isinstance(out_lp_net, list) else out_lp_net
  for ref_out, lp_out in zip(out_ref, out_lp_net):
    assert_almost_equal(ref_out, lp_out, rtol=rtol, atol=1.0)

  # check graph
  if expected_sym is not None:
    lp_symnet = lp_net.export(None, remove_amp_cast=False)[0]
    same_graph_structure(lp_symnet, expected_sym, True)

  # check amp with quantization
  check_amp_with_quantization(net, data_example, quantized_nodes)
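A minimal invocation sketch for check_amp_fuse, assuming an MXNet build with oneDNN and the module-level constants (SG_PASS_NAME, AMP_SG_PASS_NAME, AMP_DTYPE) defined elsewhere in this test file:

import mxnet as mx
from mxnet.gluon import nn

net = nn.HybridSequential()
net.add(nn.Conv2D(8, kernel_size=3), nn.Activation('relu'))
net.initialize()

data = [mx.np.random.uniform(-1, 1, (1, 3, 16, 16))]   # one positional input, as a list
check_amp_fuse(net, data)   # compares FP32 vs. low-precision outputs and graphs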
Example #40
    def check_quantize_model(qdtype):
        if is_test_for_native_cpu():
            print('skipped testing quantized_residual_unit for native cpu since it is not supported yet')
            return
        elif qdtype == 'int8' and is_test_for_mkldnn():
            print('skipped testing quantized_residual_unit for mkldnn cpu int8 since it is not supported yet')
            return
        elif qdtype == 'uint8' and is_test_for_gpu():
            print('skipped testing quantized_residual_unit for gpu uint8 since it is not supported yet')
            return

        def check_params(params, qparams, qsym=None):
            if qsym is None:
                assert len(params) == len(qparams)
                for k, v in params.items():
                    assert k in qparams
                    assert same(v.asnumpy(), qparams[k].asnumpy())
            else:
                qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
                assert len(qparams) == len(qparams_ground_truth)
                for k, v in qparams_ground_truth.items():
                    assert k in qparams
                    assert same(v.asnumpy(), qparams[k].asnumpy())

        def check_qsym_calibrated(qsym):
            attrs = qsym.attr_dict()
            for k, v in attrs.items():
                if k.find('requantize_') != -1:
                    assert 'min_calib_range' in v
                    assert 'max_calib_range' in v

        def check_qsym_qdtype(qsym, qdtype):
            attrs = qsym.attr_dict()
            for k, v in attrs.items():
                if k.find('_quantize') != -1:
                    assert 'out_type' in v
                    assert v['out_type'] == qdtype

        def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape):
            mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
            mod.bind(for_training=False,
                     data_shapes=[('data', data_shape)],
                     label_shapes=[('softmax_label', label_shape)])
            mod.set_params(qarg_params, qaux_params)
            data = [mx.random.uniform(-1.0, 1.0, shape=shape) for _, shape in mod.data_shapes]
            batch = mx.io.DataBatch(data, [])
            mod.forward(batch, is_train=False)
            for output in mod.get_outputs():
                output.wait_to_read()

        sym = get_fp32_residual()
        mod = Module(symbol=sym)
        batch_size = 4
        data_shape = (batch_size, 4, 10, 10)
        label_shape = (batch_size, 10)
        mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
        mod.init_params()
        arg_params, aux_params = mod.get_params()
        excluded_sym_names = []
        if mx.current_context() == mx.cpu():
            excluded_sym_names += ['fc']
        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                         arg_params=arg_params,
                                                                         aux_params=aux_params,
                                                                         excluded_sym_names=excluded_sym_names,
                                                                         ctx=mx.current_context(),
                                                                         quantized_dtype=qdtype,
                                                                         calib_mode='none')
        check_params(arg_params, qarg_params, qsym)
        check_params(aux_params, qaux_params)
        check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape)

        calib_data = mx.nd.random.uniform(shape=data_shape)
        calib_data = NDArrayIter(data=calib_data)
        calib_data = DummyIter(calib_data)
        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                         arg_params=arg_params,
                                                                         aux_params=aux_params,
                                                                         excluded_sym_names=excluded_sym_names,
                                                                         ctx=mx.current_context(),
                                                                         quantized_dtype=qdtype,
                                                                         calib_mode='naive',
                                                                         calib_data=calib_data,
                                                                         num_calib_examples=20)
        check_params(arg_params, qarg_params, qsym)
        check_params(aux_params, qaux_params)
        check_qsym_calibrated(qsym)
        check_qsym_qdtype(qsym, qdtype)
        check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape)
Example #41
def check_quantize(sym,
                   data_shape,
                   out_type,
                   name='conv',
                   check_calibration=True,
                   gluon_forward=False,
                   check_scale_align=False):
    if name in config:
        name = config[name][OP_NAME]
    sym_sg = sym.get_backend_symbol(QUANTIZE_SG_PASS_NAME)
    mod = Module(symbol=sym, label_names=None)
    mod.bind(for_training=False, data_shapes=[('data', data_shape)])
    mod.init_params(mx.init.Normal(0.5))
    arg_params, aux_params = mod.get_params()

    if out_type == 'uint8':
        data = [
            mx.random.uniform(0.0, 1.0, shape=shape, ctx=mx.current_context())
            for _, shape in mod.data_shapes
        ]
    else:
        data = [
            mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.current_context())
            for _, shape in mod.data_shapes
        ]
    batch = mx.io.DataBatch(data, [])

    mod.forward(batch, is_train=False)
    for output in mod.get_outputs():
        output.wait_to_read()
    ref_out = mod.get_outputs()

    excluded_sym_names = []
    if mx.current_context() == mx.cpu() and gluon_forward:
        excluded_sym_names += ['sg_mkldnn_fully_connected_0']
        excluded_sym_names += ['fc_softmax']

    calib_data = CalibIter(batch, data_shape, 1)

    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(
        sym=sym_sg,
        arg_params=arg_params,
        aux_params=aux_params,
        ctx=mx.current_context(),
        excluded_sym_names=excluded_sym_names,
        quantized_dtype=out_type,
        calib_mode='naive',
        calib_data=calib_data,
        calib_layer=None,
        label_names=None,
        num_calib_examples=1)
    qsym = qsym.get_backend_symbol(QUANTIZE_SG_PASS_NAME)
    if check_calibration:
        check_qsym_calibrated(qsym, out_type, name=name)
    if check_scale_align:
        check_qsym_scale_align(qsym)
    if gluon_forward:
        check_qsym_gluon_forward(qsym, qarg_params, qaux_params, data_shape)
    else:
        quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params,
                                           batch, data_shape)
        for i in range(len(ref_out)):
            min_range = mx.nd.min(ref_out[i]).asscalar()
            max_range = mx.nd.max(ref_out[i]).asscalar()
            atol = 0.1 * max(abs(min_range), abs(max_range))
            assert_almost_equal_with_err(quantized_out[i].asnumpy(),
                                         ref_out[i].asnumpy(),
                                         rtol=0.1,
                                         atol=atol,
                                         etol=0.2)
        check_qsym_dummy_forward(qsym, batch, data_shape)
Example #42
def _test_backward_template(version: str):
    """
    This method serves as the base for all backward tests.
    For a given version of Rational, it checks that the activation function can be
    integrated into a small MXNet model and that its coefficients (weights) are
    updated during training.

    :param version: version of the rational activation function
    """

    # define network
    net = nn.Sequential()
    net.add(nn.Dense(128, activation='relu'))
    net.add(nn.Dense(64, activation='relu'))
    # insert a rational activation function as a layer
    fut = Rational(version=version)
    net.add(fut)
    net.add(nn.Dense(10))

    gpus = mx.test_utils.list_gpus()
    # include current context, so parameters can be read from this test method
    ctx = [mx.gpu(), mx.current_context()] if gpus else [mx.cpu(0), mx.cpu(1)]
    net.initialize(ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.02})

    # copy the old coefficient values
    nums_before_training = fut.numerator.data(mx.current_context()).asnumpy()
    dens_before_training = fut.denominator.data(mx.current_context()).asnumpy()

    # Use Accuracy as the evaluation metric.
    metric = mx.metric.Accuracy()
    softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    # Reset the train data iterator.
    train_data.reset()
    # Loop over the train data iterator.
    for batch in train_data:
        # Splits train data into multiple slices along batch_axis
        # and copy each slice into a context.
        data = gluon.utils.split_and_load(batch.data[0],
                                          ctx_list=ctx,
                                          batch_axis=0,
                                          even_split=False)
        # Splits train labels into multiple slices along batch_axis
        # and copy each slice into a context.
        label = gluon.utils.split_and_load(batch.label[0],
                                           ctx_list=ctx,
                                           batch_axis=0,
                                           even_split=False)
        outputs = []
        # Inside training scope
        with ag.record():
            for x, y in zip(data, label):
                z = net(x)
                # Computes softmax cross entropy loss.
                loss = softmax_cross_entropy_loss(z, y)
                # back-propagate the error for one iteration.
                loss.backward()
                outputs.append(z)
        # Updates internal evaluation
        metric.update(label, outputs)
        # Make one step of parameter update. Trainer needs to know the
        # batch size of data to normalize the gradient by 1/batch_size.
        trainer.step(batch.data[0].shape[0])

        # exit after first loop
        break

    # Gets the evaluation result.
    name, acc = metric.get()
    # Reset evaluation result to initial state.
    metric.reset()
    print('training acc: %s=%f' % (name, acc))

    # copy the new coefficient values
    nums_after_training = fut.numerator.data(mx.current_context()).asnumpy()
    dens_after_training = fut.denominator.data(mx.current_context()).asnumpy()

    # check that at least one coefficient changed in numerators
    assert not np.all(np.equal(nums_before_training, nums_after_training))
    # check that at least one coefficient changed in denominators
    assert not np.all(np.equal(dens_before_training, dens_after_training))
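A sketch of how this template would typically be driven from individual tests; the version strings are an assumption about what Rational(version=...) accepts and may differ in the actual package.

def test_backward_a():
    _test_backward_template('A')


def test_backward_b():
    _test_backward_template('B')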
Example #43
    def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
        if mx.current_context().device_type != 'gpu':
            hasMKL = False
            for key in os.environ.keys():
                if operator.eq(key, "BUILD_TAG"):
                    if os.environ['BUILD_TAG'].find("MKL") != -1:
                        hasMKL = True
                    break
            if not hasMKL:
                print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
                return
        elif qdtype == 'uint8' and is_test_for_gpu():
            print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
            return

        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
        fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
        arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
        arg_names = fc_fp32.list_arguments()
        fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
        if qdtype == 'uint8':
            data_low = 0.0
            data_high = 63.0
        else:
            data_low = -63.0
            data_high = 63.0
        fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                     shape=data_shape).astype('int32')
        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                     shape=arg_shapes[1]).astype('int32')
        if not no_bias:
            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                         shape=arg_shapes[2]).astype('int32')
        output = fc_fp32_exe.forward()[0]

        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
        fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
                                                           no_bias=no_bias, flatten=flatten)
        qarg_names = fc_int8.list_arguments()
        type_dict = {qarg_names[1]: 'int8'}
        if not no_bias:
            type_dict.update({qarg_names[2]: 'int8'})
        fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
        fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
        fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
        quantized_range = 127.0
        if no_bias:
            fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
            fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
        else:
            fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
            fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
            fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
            fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
        qoutput, min_range, max_range = fc_int8_exe.forward()

        if no_bias:
            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
        else:
            # with adding bias, accuracy loss should not be greater than one
            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
            cond = mx.nd.lesser(2, diff).sum().asscalar()
            assert cond == 0
Example #44
    def _current_context(self):
        if has_gpu:
            return mx.gpu(hvd.local_rank())
        else:
            return mx.current_context()
Example #45
#pylint: skip-file
import mxnet as mx

a = mx.narray.create((3000, 4000))
b = mx.narray.create((3000, 4000))
a.numpy[:] = 10
b.numpy[:] = 11
print(a.numpy)

c = b * a

cc = mx.op.mul(b, a)

print(c.context)
print(cc.numpy)
d = c.copyto(mx.Context('cpu', 0))

print(d.numpy)

with mx.Context('gpu', 0) as ctx:
    # gpu operations
    print(mx.current_context())
    print(ctx)
    a_gpu = a.copyto(ctx)
    b_gpu = b.copyto(ctx)
    c_gpu = b * a

d_cpu = c_gpu.copyto(mx.current_context())
print(d_cpu.numpy)

Example #46
def is_test_for_gpu():
    return mx.current_context().device_type == 'gpu'
Example #47
def check_fusion(sym,
                 data_shape,
                 attrs_dict,
                 check_fp32_fusion=True,
                 check_quantization=True,
                 out_types=['uint8', 'int8', 'auto']):
    if check_fp32_fusion:
        sym_sg = sym.get_backend_symbol(SG_PASS_NAME)
        for name, attrs in attrs_dict.items():
            if name in config:
                op_name = config[name][OP_NAME]
            else:
                op_name = name
            assert ''.join(
                sym_sg.get_internals().list_outputs()).find(op_name) != -1
            if len(attrs):
                found = False
                for k, v in sym_sg.attr_dict().items():
                    if k.find(op_name) != -1:
                        found = True
                        for attr_name, attr_value in attrs.items():
                            assert v[attr_name].lower() == attr_value.lower()
                assert found

        arg_shapes, _, aux_shapes = sym.infer_shape()
        arg_array = [
            mx.nd.random.uniform(-1.0, 1.0, shape=shape)
            for shape in arg_shapes
        ]
        aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
        exe = sym.bind(ctx=mx.current_context(),
                       args=arg_array,
                       aux_states=aux_array,
                       grad_req='null')
        exe.forward()
        os.environ['MXNET_SUBGRAPH_BACKEND'] = SG_PASS_NAME
        exe_sg = sym.bind(ctx=mx.current_context(),
                          args=arg_array,
                          aux_states=aux_array,
                          grad_req='null')
        exe_sg.forward()
        del os.environ['MXNET_SUBGRAPH_BACKEND']
        for i in range(len(exe.outputs)):
            assert_almost_equal(exe.outputs[i].asnumpy(),
                                exe_sg.outputs[i].asnumpy(),
                                rtol=1e-3,
                                atol=1e-1)

    if check_quantization:
        # fp32 to int8
        for out_type in out_types:
            check_quantize(sym, data_shape, out_type, name=op_name)
            # TODO(ciyong): quantized FC saves its params in int8, while Gluon treats the default
            # variables from the symbol file as fp32, which results in a dtype mismatch of the params.
            # Skip quantized FC in the Gluon pass.
            if name != 'fc':
                check_quantize(sym,
                               data_shape,
                               out_type,
                               name=op_name,
                               gluon_forward=True)
def is_test_for_native_cpu():
    return (mx.current_context().device_type == 'cpu'
            and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') == None)