def _check_subgraph_exe3(sym, subgraph_backend, op_names):
    """Use the partitioned sym to bind an executor and compare the outputs
    with those of the original executor"""
    out = SymbolHandle()
    check_call(_LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend),
                                             mx_uint(len(op_names)), c_str_array(op_names),
                                             ctypes.byref(out)))
    partitioned_sym = Symbol(out)
    input_names = sym.list_inputs()
    arg_names = sym.list_arguments()
    aux_names = sym.list_auxiliary_states()
    assert partitioned_sym.list_inputs() == input_names
    assert partitioned_sym.list_arguments() == arg_names
    assert partitioned_sym.list_auxiliary_states() == aux_names

    arg_shapes, _, aux_shapes = sym.infer_shape()
    arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
    aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
    exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
    partitioned_exe = partitioned_sym.bind(ctx=mx.current_context(), args=arg_array,
                                           aux_states=aux_array, grad_req='null')
    exe.forward()
    partitioned_exe.forward()
    assert len(exe.outputs) == len(partitioned_exe.outputs)
    for i in range(len(exe.outputs)):
        assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                            np.zeros(shape=(1,)))
def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool):
    with mx.Context('gpu', 0):
        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
        pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
                                      pool_type=pool_type, global_pool=global_pool, cudnn_off=False)
        arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
        arg_names = pooling_fp32.list_arguments()
        pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
        pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                          shape=data_shape).astype('int32')
        output = pooling_fp32_exe.forward()[0]

        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
        min_data = mx.sym.Variable(name='min_data')
        max_data = mx.sym.Variable(name='max_data')
        quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
                                                             max_data=max_data, kernel=kernel,
                                                             pad=pad, stride=stride,
                                                             pool_type=pool_type,
                                                             global_pool=global_pool)
        pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
        qarg_names = quantized_pooling.list_arguments()
        pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype('int8')
        quantized_range = 127.0
        pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
        pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
        qoutput, min_range, max_range = pooling_int8_exe.forward()

        if pool_type == 'max':
            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
        elif pool_type == 'avg':
            # for avg pooling, fp32 and int8 may be different due to rounding errors
            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
            cond = mx.nd.lesser(2, diff).sum().asscalar()
            assert cond == 0
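# Hedged usage sketch for the check above. The NCHW shapes and pooling
# parameters here are illustrative assumptions, not the suite's actual
# parameterization.
def _example_quantized_pooling_checks():
    check_quantized_pooling((4, 3, 28, 28), (3, 3), 'max', (0, 0), (2, 2), False)
    check_quantized_pooling((4, 3, 28, 28), (3, 3), 'avg', (0, 0), (2, 2), True)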
def _check_subgraph_exe1(sym, subgraph_backend, op_names):
    """Use the partitioned sym to simple_bind an executor and compare the outputs
    with those of the original executor"""
    out = SymbolHandle()
    check_call(_LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend),
                                             mx_uint(len(op_names)), c_str_array(op_names),
                                             ctypes.byref(out)))
    partitioned_sym = Symbol(out)
    assert partitioned_sym.list_inputs() == sym.list_inputs()
    assert partitioned_sym.list_arguments() == sym.list_arguments()
    assert partitioned_sym.list_auxiliary_states() == sym.list_auxiliary_states()

    exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
    partitioned_exe = partitioned_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
    input_names = sym.list_inputs()
    for name in input_names:
        if name in exe.arg_dict:
            exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)
            partitioned_exe.arg_dict[name][:] = exe.arg_dict[name]
        else:
            assert name in exe.aux_dict
            exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)
            partitioned_exe.aux_dict[name][:] = exe.aux_dict[name]
    exe.forward()
    partitioned_exe.forward()
    assert len(exe.outputs) == len(partitioned_exe.outputs)
    for i in range(len(exe.outputs)):
        assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                            np.zeros(shape=(1,)))
def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias):
    with mx.Context('gpu', 0):
        # run fp32 conv
        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
        conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad,
                                    stride=stride, no_bias=no_bias, cudnn_off=False, name='conv2d')
        arg_shapes, _, _ = conv2d.infer_shape(data=data_shape)
        arg_names = conv2d.list_arguments()
        conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null')
        conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                       shape=data_shape).astype('int32')
        conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                       shape=arg_shapes[1]).astype('int32')
        if not no_bias:
            conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                           shape=arg_shapes[2]).astype('int32')
        output = conv_exe_fp32.forward()[0]

        # run quantized conv
        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
        qweight = mx.sym.Variable(name='qweight', dtype='int8')
        min_data = mx.sym.Variable(name='min_data')
        max_data = mx.sym.Variable(name='max_data')
        min_weight = mx.sym.Variable(name='min_weight')
        max_weight = mx.sym.Variable(name='max_weight')
        quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight,
                                                         min_data=min_data, max_data=max_data,
                                                         min_weight=min_weight, max_weight=max_weight,
                                                         kernel=kernel, num_filter=num_filter,
                                                         pad=pad, stride=stride, no_bias=no_bias)
        qarg_names = quantized_conv2d.list_arguments()
        type_dict = None
        if not no_bias:
            type_dict = {qarg_names[2]: 'int8'}
        conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict,
                                                     grad_req='null')
        conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype('int8')
        conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8')
        quantized_range = 127.0
        if no_bias:
            conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range
            conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range
            conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range
            conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range
        else:
            conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8')
            conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range
            conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range
            conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range
            conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range
            conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range
            conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range
        qoutput, min_range, max_range = conv_exe_int8.forward()

        if no_bias:
            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
        else:
            # with a bias added, the accuracy loss should not be greater than one
            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
            cond = mx.nd.lesser(2, diff).sum().asscalar()
            assert cond == 0
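# Hedged usage sketch for the check above; the shapes, filter count, and
# padding/stride below are illustrative assumptions.
def _example_quantized_conv_checks():
    check_quantized_conv((4, 4, 28, 28), (3, 3), 16, (1, 1), (1, 1), True)
    check_quantized_conv((4, 4, 28, 28), (3, 3), 16, (1, 1), (1, 1), False)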
def check_quantize_model(qdtype):
    def check_params(params, qparams, qsym=None):
        if qsym is None:
            assert len(params) == len(qparams)
            for k, v in params.items():
                assert k in qparams
                assert same(v.asnumpy(), qparams[k].asnumpy())
        else:
            qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
            assert len(qparams) == len(qparams_ground_truth)
            for k, v in qparams_ground_truth.items():
                assert k in qparams
                assert same(v.asnumpy(), qparams[k].asnumpy())

    def check_qsym_calibrated(qsym):
        attrs = qsym.attr_dict()
        for k, v in attrs.items():
            if k.find('requantize_') != -1:
                assert 'min_calib_range' in v
                assert 'max_calib_range' in v

    def check_qsym_qdtype(qsym, qdtype):
        attrs = qsym.attr_dict()
        for k, v in attrs.items():
            if k.find('_quantize') != -1:
                assert 'out_type' in v
                assert v['out_type'] == qdtype

    sym = get_fp32_sym()
    mod = Module(symbol=sym)
    batch_size = 4
    data_shape = (batch_size, 4, 10, 10)
    label_shape = (batch_size, 10)
    mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
    mod.init_params()
    arg_params, aux_params = mod.get_params()
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     ctx=mx.current_context(),
                                                                     quantized_dtype=qdtype,
                                                                     calib_mode='none')
    check_params(arg_params, qarg_params, qsym)
    check_params(aux_params, qaux_params)

    calib_data = mx.nd.random.uniform(shape=data_shape)
    calib_data = NDArrayIter(data=calib_data)
    calib_data = DummyIter(calib_data)
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     ctx=mx.current_context(),
                                                                     quantized_dtype=qdtype,
                                                                     calib_mode='naive',
                                                                     calib_data=calib_data,
                                                                     num_calib_examples=20)
    check_params(arg_params, qarg_params, qsym)
    check_params(aux_params, qaux_params)
    check_qsym_calibrated(qsym)
    check_qsym_qdtype(qsym, qdtype)
def check_quantized_fc(data_shape, num_hidden, no_bias, flatten=True):
    with mx.Context('gpu', 0):
        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
        fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias,
                                        flatten=flatten)
        arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
        arg_names = fc_fp32.list_arguments()
        fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
        fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                     shape=data_shape).astype('int32')
        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                     shape=arg_shapes[1]).astype('int32')
        if not no_bias:
            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
                                                                         shape=arg_shapes[2]).astype('int32')
        output = fc_fp32_exe.forward()[0]

        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
        fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
                                                           no_bias=no_bias, flatten=flatten)
        qarg_names = fc_int8.list_arguments()
        type_dict = {qarg_names[1]: 'int8'}
        if not no_bias:
            type_dict.update({qarg_names[2]: 'int8'})
        fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict,
                                          grad_req='null')
        fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype('int8')
        fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
        quantized_range = 127.0
        if no_bias:
            fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
            fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
        else:
            fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
            fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
            fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
            fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
            fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
        qoutput, min_range, max_range = fc_int8_exe.forward()

        if no_bias:
            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
        else:
            # with a bias added, the accuracy loss should not be greater than one
            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
            cond = mx.nd.lesser(2, diff).sum().asscalar()
            assert cond == 0
def testSmall(data, repeat1, repeat2):
    # Check that the shuffling is along the first axis.
    # The order of the elements in each subarray must not change.
    # This takes a long time, so `repeat1` needs to be small.
    for i in range(repeat1):
        ret = mx.nd.random.shuffle(data)
        check_first_axis_shuffle(ret)
    # Count the number of each different outcome.
    # The sequence composed of the first elements of the subarrays is enough to discriminate
    # the outcomes as long as the order of the elements in each subarray does not change.
    count = {}
    stride = int(data.size / data.shape[0])
    for i in range(repeat2):
        ret = mx.nd.random.shuffle(data)
        h = str(ret.reshape((ret.size,))[::stride])
        c = count.get(h, 0)
        count[h] = c + 1
    # Check the total number of possible outcomes.
    # If `repeat2` is not large enough, this could fail with high probability.
    assert len(count) == math.factorial(data.shape[0])
    # The outcomes must be uniformly distributed.
    # If `repeat2` is not large enough, this could fail with high probability.
    for p in itertools.permutations(range(0, data.size - stride + 1, stride)):
        err = abs(1. * count[str(mx.nd.array(p))] / repeat2 - 1. / math.factorial(data.shape[0]))
        assert err < 0.01, "The absolute error {} is larger than the tolerance.".format(err)
    # Check the symbol interface.
    a = mx.sym.Variable('a')
    b = mx.sym.random.shuffle(a)
    c = mx.sym.random.shuffle(data=b, name='c')
    d = mx.sym.sort(c, axis=0)
    assert (d.eval(a=data, ctx=mx.current_context())[0] == data).prod() == 1
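# Hedged driver sketch for `testSmall`; the 3x3 input and the repeat counts
# are assumptions, chosen so that all 3! = 6 first-axis orders can appear with
# enough samples for the uniformity check.
def _example_test_small_shuffle():
    testSmall(mx.nd.arange(9).reshape((3, 3)), repeat1=10, repeat2=10000)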
def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype,
                            convention='valid'):
    if is_test_for_native_cpu():
        print('skipped testing quantized_pooling for native cpu since it is not supported yet')
        return
    elif qdtype == 'uint8' and is_test_for_gpu():
        print('skipped testing quantized_pooling for gpu uint8 since it is not supported yet')
        return

    data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
    pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
                                  pool_type=pool_type, global_pool=global_pool, cudnn_off=False,
                                  pooling_convention=convention)
    arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
    arg_names = pooling_fp32.list_arguments()
    pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
    if qdtype == 'uint8':
        data_low = 0.0
        data_high = 127.0
    else:
        data_low = -127.0
        data_high = 127.0
    pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=data_shape).astype('int32')
    output = pooling_fp32_exe.forward()[0]

    qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
    min_data = mx.sym.Variable(name='min_data')
    max_data = mx.sym.Variable(name='max_data')
    quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
                                                         max_data=max_data, kernel=kernel,
                                                         pad=pad, stride=stride,
                                                         pool_type=pool_type,
                                                         global_pool=global_pool,
                                                         pooling_convention=convention)
    pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
    qarg_names = quantized_pooling.list_arguments()
    pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
    quantized_range = 127.0
    pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
    pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
    qoutput, min_range, max_range = pooling_int8_exe.forward()

    if pool_type == 'max':
        assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
    elif pool_type == 'avg':
        # for avg pooling, fp32 and int8 may be different due to rounding errors
        diff = mx.nd.abs(output - qoutput.astype(output.dtype))
        cond = mx.nd.lesser(2, diff).sum().asscalar()
        assert cond == 0
def check_quantize(sym, data_shape, check_conv=True):
    fc = mx.sym.FullyConnected(data=sym, num_hidden=10, flatten=True, name='fc')
    sym = mx.sym.SoftmaxOutput(data=fc, name='softmax')
    sym_sg = sym.get_backend_symbol("MKLDNN")
    label_shape = (data_shape[0], 10)
    mod = Module(symbol=sym)
    mod.bind(for_training=False,
             data_shapes=[('data', data_shape)],
             label_shapes=[('softmax_label', label_shape)])
    mod.init_params(mx.init.Normal(0.5))
    arg_params, aux_params = mod.get_params()

    data = [mx.random.uniform(-1, 1, shape=shape, ctx=mx.current_context())
            for _, shape in mod.data_shapes]
    batch = mx.io.DataBatch(data, [])
    mod.forward(batch, is_train=False)
    for output in mod.get_outputs():
        output.wait_to_read()
    ref_out = mod.get_outputs()

    excluded_sym_names = []
    if mx.current_context() == mx.cpu():
        excluded_sym_names += ['fc']

    calib_data = mx.nd.random.uniform(shape=data_shape)
    calib_data = NDArrayIter(data=calib_data)
    calib_data = DummyIter(calib_data)
    calib_layer = lambda name: name.endswith('_output')
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     ctx=mx.current_context(),
                                                                     excluded_sym_names=excluded_sym_names,
                                                                     quantized_dtype='uint8',
                                                                     calib_mode='naive',
                                                                     calib_data=calib_data,
                                                                     calib_layer=calib_layer,
                                                                     calib_quantize_op=True,
                                                                     num_calib_examples=5)
    qsym = qsym.get_backend_symbol("MKLDNN_POST_QUANTIZE")
    if check_conv:
        check_qsym_calibrated(qsym)
    quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, label_shape)
    for i in range(len(ref_out)):
        assert_almost_equal(ref_out[i].asnumpy(), quantized_out[i].asnumpy(), atol=1)
    check_qsym_dummy_forward(qsym, batch, data_shape, label_shape)
def check_qsym_dummy_forward(qsym, batch, data_shape, label_shape):
    mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
    mod.bind(for_training=False,
             data_shapes=[('data', data_shape)],
             label_shapes=[('softmax_label', label_shape)])
    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
    mod.forward(batch, is_train=False)
    for output in mod.get_outputs():
        output.wait_to_read()
    return mod.get_outputs()
def check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, label_shape):
    mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
    mod.bind(for_training=False,
             data_shapes=[('data', data_shape)],
             label_shapes=[('softmax_label', label_shape)])
    mod.set_params(qarg_params, qaux_params)
    mod.forward(batch, is_train=False)
    for output in mod.get_outputs():
        output.wait_to_read()
    return mod.get_outputs()
def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape):
    mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
    mod.bind(for_training=False,
             data_shapes=[('data', data_shape)],
             label_shapes=[('softmax_label', label_shape)])
    mod.set_params(qarg_params, qaux_params)
    data = [mx.random.uniform(-1.0, 1.0, shape=shape) for _, shape in mod.data_shapes]
    batch = mx.io.DataBatch(data, [])
    mod.forward(batch, is_train=False)
    for output in mod.get_outputs():
        output.wait_to_read()
def check_fusion(sym, data_shape, attrs_op):
    sym_sg = sym.get_backend_symbol("MKLDNN")
    assert ''.join(sym_sg.get_internals().list_outputs()).find('sg_mkldnn_conv') != -1
    for k, v in sym_sg.attr_dict().items():
        if k.find('sg_mkldnn_conv') != -1:
            for attr_op in attrs_op:
                assert v[attr_op] == 'true'

    arg_shapes, _, aux_shapes = sym.infer_shape()
    arg_array = [mx.nd.random.uniform(-1, 1, shape=shape) for shape in arg_shapes]
    aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
    exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
    exe.forward()

    os.environ['MXNET_SUBGRAPH_BACKEND'] = 'MKLDNN'
    exe_sg = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
    exe_sg.forward()
    del os.environ['MXNET_SUBGRAPH_BACKEND']

    for i in range(len(exe.outputs)):
        assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(),
                            rtol=1e-3, atol=1e-3)

    # fp32 to uint8
    check_quantize(sym, data_shape)
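# Hedged usage sketch for `check_fusion` above. The conv+relu symbol is a
# hypothetical fixture, and 'with_act' is an assumed attribute name of the
# fused sg_mkldnn_conv node, not one confirmed by this section.
def _example_check_conv_relu_fusion():
    data = mx.sym.Variable('data', shape=(1, 3, 224, 224))
    conv = mx.sym.Convolution(data=data, kernel=(3, 3), num_filter=8, name='conv')
    relu = mx.sym.Activation(data=conv, act_type='relu', name='relu')
    check_fusion(relu, (1, 3, 224, 224), ['with_act'])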
def test_sdml_loss():
    N = 5       # number of samples
    DIM = 10    # dimensionality
    EPOCHS = 20

    # Generate randomized data and 'positive' samples
    data = mx.random.uniform(-1, 1, shape=(N, DIM))
    pos = data + mx.random.uniform(-0.1, 0.1, shape=(N, DIM))  # correlated paired data
    data_iter = mx.io.NDArrayIter({'data': data, 'pos': pos}, batch_size=N)

    # Init model and trainer
    sdml_loss = gluon.loss.SDMLLoss()
    model = gluon.nn.Dense(DIM, activation='tanh')  # simple NN encoder
    model.initialize(mx.init.Xavier(), ctx=mx.current_context())
    trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 0.1})

    # Training loop
    for i in range(EPOCHS):
        data_iter.reset()
        for iter_batch in data_iter:
            batch = [datum.as_in_context(mx.current_context()) for datum in iter_batch.data]
            with autograd.record():
                data, pos = batch
                z_data, z_pos = model(data), model(pos)
                loss = sdml_loss(z_data, z_pos)
                loss.backward()
            trainer.step(1)

    # After training, the euclidean distance between aligned pairs should be
    # lower than between all non-aligned pairs
    avg_loss = loss.sum() / len(loss)
    assert avg_loss < 0.05
def finish_update(self, optimizer):
    if self._optimizer is None:
        self._optimizer, self._trainer = self._create_optimizer(optimizer)
    if getattr(optimizer, "grad_clip", None):
        ctx = mx.current_context()
        grads = [
            i.grad(ctx)
            for i in self._model.collect_params().values()
            if i._grad is not None
        ]
        mxnet.gluon.utils.clip_global_norm(grads, optimizer.grad_clip)
    if self._trainer:
        self._trainer.step(1)
    for param in self._model.collect_params().values():
        param.zero_grad()
    self._update_mxnet_averages(optimizer)
def check_subgraph_exe1(sym, subgraph_backend, op_names):
    """Use the partitioned sym to simple_bind an executor and compare the outputs
    with those of the original executor"""
    out = SymbolHandle()
    check_call(_LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend),
                                             mx_uint(len(op_names)), c_str_array(op_names),
                                             ctypes.byref(out)))
    partitioned_sym = Symbol(out)
    assert partitioned_sym.list_inputs() == sym.list_inputs()
    assert partitioned_sym.list_arguments() == sym.list_arguments()
    assert partitioned_sym.list_auxiliary_states() == sym.list_auxiliary_states()

    exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
    partitioned_exe = partitioned_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
    input_names = sym.list_inputs()
    for name in input_names:
        if name in exe.arg_dict:
            exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)
            partitioned_exe.arg_dict[name][:] = exe.arg_dict[name]
        else:
            assert name in exe.aux_dict
            exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)
            partitioned_exe.aux_dict[name][:] = exe.aux_dict[name]
    exe.forward()
    partitioned_exe.forward()
    assert len(exe.outputs) == len(partitioned_exe.outputs)
    for i in range(len(exe.outputs)):
        assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                            np.zeros(shape=(1,)))
def test_subgraph_exe3(sym, subgraph_backend, op_names):
    """Use the partitioned sym to bind an executor and compare the outputs
    with those of the original executor"""
    sym, _, _ = sym
    out = SymbolHandle()
    check_call(_LIB.MXBuildSubgraphByOpNames(sym.handle, c_str(subgraph_backend),
                                             mx_uint(len(op_names)), c_str_array(op_names),
                                             ctypes.byref(out)))
    partitioned_sym = Symbol(out)
    input_names = sym.list_inputs()
    arg_names = sym.list_arguments()
    aux_names = sym.list_auxiliary_states()
    assert partitioned_sym.list_inputs() == input_names
    assert partitioned_sym.list_arguments() == arg_names
    assert partitioned_sym.list_auxiliary_states() == aux_names

    arg_shapes, _, aux_shapes = sym.infer_shape()
    arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
    aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
    exe = sym._bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
    partitioned_exe = partitioned_sym._bind(ctx=mx.current_context(), args=arg_array,
                                            aux_states=aux_array, grad_req='null')
    exe.forward()
    partitioned_exe.forward()
    assert len(exe.outputs) == len(partitioned_exe.outputs)
    for i in range(len(exe.outputs)):
        assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
                            np.zeros(shape=(1,)))
def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
    exe = sym._simple_bind(ctx=mx.current_context(), grad_req='null')
    input_names = sym.list_inputs()
    for name in input_names:
        if name in exe.arg_dict:
            exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape) \
                if original_exec is None else original_exec.arg_dict[name]
        else:
            assert name in exe.aux_dict
            exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape) \
                if original_exec is None else original_exec.aux_dict[name]
    exe.forward()
    return exe
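# Sketch of the compare pattern this helper supports, assumed from its
# signature (the helper itself ignores `subgraph_backend`/`op_names`; the
# caller is presumed to have activated the backend beforehand): run the
# original symbol once, then reuse its inputs for the partitioned run.
def _example_compare_partitioned(sym, subgraph_backend, op_names):
    original_exec = get_executor(sym)
    partitioned_exec = get_executor(sym, subgraph_backend, op_names, original_exec)
    for out, pout in zip(original_exec.outputs, partitioned_exec.outputs):
        assert_almost_equal(out.asnumpy(), pout.asnumpy())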
def test_gnmt_encoder_decoder():
    ctx = mx.current_context()
    num_hidden = 8
    encoder = GNMTEncoder(cell_type="lstm", num_layers=3, num_bi_layers=1, hidden_size=num_hidden,
                          dropout=0.0, use_residual=True, prefix='gnmt_encoder_')
    encoder.initialize(ctx=ctx)
    encoder.hybridize()
    for output_attention in [True, False]:
        for use_residual in [True, False]:
            decoder = GNMTDecoder(cell_type="lstm", num_layers=3, hidden_size=num_hidden,
                                  dropout=0.0, output_attention=output_attention,
                                  use_residual=use_residual, prefix='gnmt_decoder_')
            decoder.initialize(ctx=ctx)
            decoder.hybridize()
            for batch_size in [4]:
                for src_seq_length, tgt_seq_length in [(5, 10), (10, 5)]:
                    src_seq_nd = mx.nd.random.normal(0, 1, shape=(batch_size, src_seq_length, 4),
                                                     ctx=ctx)
                    tgt_seq_nd = mx.nd.random.normal(0, 1, shape=(batch_size, tgt_seq_length, 4),
                                                     ctx=ctx)
                    src_valid_length_nd = mx.nd.array(np.random.randint(1, src_seq_length,
                                                                        size=(batch_size,)), ctx=ctx)
                    tgt_valid_length_nd = mx.nd.array(np.random.randint(1, tgt_seq_length,
                                                                        size=(batch_size,)), ctx=ctx)
                    src_valid_length_npy = src_valid_length_nd.asnumpy()
                    tgt_valid_length_npy = tgt_valid_length_nd.asnumpy()
                    encoder_outputs, _ = encoder(src_seq_nd, valid_length=src_valid_length_nd)
                    decoder_states = decoder.init_state_from_encoder(encoder_outputs,
                                                                     src_valid_length_nd)

                    # Test multi-step forwarding
                    output, new_states, additional_outputs = decoder.decode_seq(tgt_seq_nd,
                                                                                decoder_states,
                                                                                tgt_valid_length_nd)
                    assert output.shape == (batch_size, tgt_seq_length, num_hidden)
                    output_npy = output.asnumpy()
                    for i in range(batch_size):
                        tgt_v_len = int(tgt_valid_length_npy[i])
                        if tgt_v_len < tgt_seq_length - 1:
                            assert (output_npy[i, tgt_v_len:, :] == 0).all()
                    if output_attention:
                        assert len(additional_outputs) == 1
                        attention_out = additional_outputs[0].asnumpy()
                        assert attention_out.shape == (batch_size, tgt_seq_length, src_seq_length)
                        for i in range(batch_size):
                            mem_v_len = int(src_valid_length_npy[i])
                            if mem_v_len < src_seq_length - 1:
                                assert (attention_out[i, :, mem_v_len:] == 0).all()
                            if mem_v_len > 0:
                                assert_almost_equal(attention_out[i, :, :].sum(axis=-1),
                                                    np.ones(attention_out.shape[1]))
                    else:
                        assert len(additional_outputs) == 0
def test_quantize_whole_model_with_forward(qdtype):
    batch_size = 4
    data_shape = (batch_size, 4, 10, 10)
    data = mx.sym.Variable('data')
    conv0 = mx.sym.Convolution(data, kernel=(1, 1), num_filter=16, name='conv0')
    sym = mx.sym.Convolution(conv0, kernel=(1, 1), num_filter=16, name='conv1')
    sym_block = mx.gluon.SymbolBlock(outputs=sym, inputs=data)
    initialize_block_params(sym_block, mx.init.Normal(0.5))

    in_data = mx.random.uniform(0.0 if qdtype == 'uint8' else -1.0, 1.0, shape=data_shape)
    ref_out = sym_block(in_data)

    excluded_layers = []
    calib_data = mx.nd.random.uniform(0.0 if qdtype == 'uint8' else -1.0, 1.0, shape=data_shape)
    calib_data = mx.gluon.data.DataLoader(calib_data, batch_size=batch_size)
    qsym = mx.contrib.quantization.quantize_net(sym_block,
                                                ctx=mx.current_context(),
                                                exclude_layers=excluded_layers,
                                                quantized_dtype=qdtype,
                                                calib_mode='naive',
                                                calib_data=calib_data,
                                                num_calib_batches=1,
                                                quantize_mode='full')
    outputs = qsym(in_data)
    for output in outputs:
        output.wait_to_read()

    for i in range(len(ref_out)):
        min_range = mx.nd.min(ref_out[i]).asscalar()
        max_range = mx.nd.max(ref_out[i]).asscalar()
        atol = 0.1 * max(abs(min_range), abs(max_range))
        assert_almost_equal_with_err(outputs[i].asnumpy(), ref_out[i].asnumpy(),
                                     rtol=0.1, atol=atol, etol=0.2)
def test_array_creation():
    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None]
    objects = [
        [],
        (),
        [[1, 2], [3, 4]],
        _np.random.uniform(size=rand_shape_nd(3)),
        _np.random.uniform(size=(3, 0, 4))
    ]
    for dtype in dtypes:
        for src in objects:
            mx_arr = np.array(src, dtype=dtype)
            assert mx_arr.context == mx.current_context()
            if isinstance(src, mx.nd.NDArray):
                np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32)
            else:
                np_arr = _np.array(src, dtype=dtype if dtype is not None else _np.float32)
            assert mx_arr.dtype == np_arr.dtype
            assert same(mx_arr.asnumpy(), np_arr)
def check_fusion(net_original, data_shape, attrs_dict, check_fp32_fusion=True,
                 check_quantization=True, out_types=['uint8', 'int8', 'auto'],
                 dedup_subgraph=True):
    net_original.initialize()
    net_original.hybridize(static_alloc=False, static_shape=False)
    data = mx.random.uniform(shape=data_shape, dtype='float32', ctx=mx.current_context())
    net_original(data)
    net_fusion = copy.copy(net_original)
    sym, params = net_original.export(None)

    if check_fp32_fusion:
        data_min = -1.0
        data_max = 1.0
        if ''.join(sym.get_internals().list_outputs()).find('sqrt') != -1:
            check_quantization = False
            data_min = 0
        sym_sg = sym.optimize_for(SG_PASS_NAME, dedup_subgraph=dedup_subgraph, skip_infer=True)
        for name, attrs in attrs_dict.items():
            if name in config:
                op_name = config[name][OP_NAME]
            else:
                op_name = name
            assert ''.join(sym_sg.get_internals().list_outputs()).find(op_name) != -1
            if len(attrs):
                found = False
                for k, v in sym_sg.attr_dict().items():
                    if k.find(op_name) != -1:
                        found = True
                        for attr_name, attr_value in attrs.items():
                            assert v[attr_name].lower() == attr_value.lower()
                assert found
        data = mx.nd.random.uniform(shape=data_shape, low=data_min, high=data_max)
        out_unfused = net_original(data)
        net_fusion.optimize_for(data, backend=SG_PASS_NAME)
        out_fused = net_fusion(data)
        assert_almost_equal(out_unfused.asnumpy(), out_fused.asnumpy(), rtol=1e-3, atol=1e-1)

    if check_quantization:
        # fp32 to int8
        for out_type in out_types:
            check_quantize(net_original, data_shape, out_type, name=name)
def conv3d(x, kernel, strides=(1, 1, 1), border_mode='valid',
           dim_ordering='default', volume_shape=None, filter_shape=None):
    '''3D convolution.

    # Arguments
        kernel: kernel tensor.
        strides: strides tuple.
        border_mode: string, "same" or "valid".
        dim_ordering: "tf" or "th". Whether to use Theano or
            TensorFlow dimension ordering for inputs/kernels/outputs.
    '''
    if dim_ordering == 'default':
        dim_ordering = image_dim_ordering()
    if dim_ordering not in {'th', 'tf'}:
        raise ValueError('Unknown dim_ordering ' + str(dim_ordering))

    x = _preprocess_conv3d_input(x, dim_ordering)
    kernel = _preprocess_conv3d_kernel(kernel, dim_ordering)
    padding = _preprocess_border_mode(border_mode)

    data = mx.sym.Variable(name="data")
    shp = (kernel.shape[2], kernel.shape[3], kernel.shape[4])
    conv = mx.sym.Convolution(data=data, kernel=shp, no_bias=True,
                              num_filter=kernel.shape[0], stride=strides, name="conv")
    executor = conv.bind(ctx=mx.current_context(),
                         args={'data': x, 'conv_weight': kernel})
    executor.forward()
    y = executor.outputs[0]
    return _postprocess_conv3d_output(y, dim_ordering)
def test_np_get_dtype():
    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, _np.bool, _np.bool_,
              'int8', 'int32', 'float16', 'float32', 'float64', 'bool', None]
    objects = [
        [],
        (),
        [[1, 2], [3, 4]],
        _np.random.uniform(size=rand_shape_nd(3)),
        _np.random.uniform(size=(3, 0, 4))
    ]
    for dtype in dtypes:
        for src in objects:
            mx_arr = np.array(src, dtype=dtype)
            assert mx_arr.ctx == mx.current_context()
            if isinstance(src, mx.nd.NDArray):
                np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32)
            else:
                np_arr = _np.array(src, dtype=dtype if dtype is not None else _np.float32)
            assert type(mx_arr.dtype) == type(np_arr.dtype)
def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
    if subgraph_backend is not None:
        os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
        check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend),
                                                     mx_uint(len(op_names)),
                                                     c_str_array(op_names)))
    exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
    input_names = sym.list_inputs()
    for name in input_names:
        if name in exe.arg_dict:
            exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape) \
                if original_exec is None else original_exec.arg_dict[name]
        else:
            assert name in exe.aux_dict
            exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape) \
                if original_exec is None else original_exec.aux_dict[name]
    exe.forward()
    if subgraph_backend is not None:
        check_call(_LIB.MXRemoveSubgraphPropertyOpNames(c_str(subgraph_backend)))
        del os.environ['MXNET_SUBGRAPH_BACKEND']
    return exe
def check_amp_convert_bucketing_module():
    model = train_model(context=mx.current_context())
    result_model = amp.convert_bucketing_module(model)
    val_sent = []
    batch_size = 128
    invalid_label = -1
    num_sentence = 1000
    buckets = [5, 10, 20, 30, 40]
    len_vocab = 50

    for _ in range(num_sentence):
        len_sentence = randint(6, max(buckets) - 1)  # leave the last two buckets empty
        val_sentence = []
        for _ in range(len_sentence):
            val_sentence.append(randint(1, len_vocab))
        val_sent.append(val_sentence)

    data_val = mx.rnn.BucketSentenceIter(val_sent, batch_size, buckets=buckets,
                                         invalid_label=invalid_label)
    result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False)
    result_model.score(data_val, mx.metric.Perplexity(invalid_label),
                       batch_end_callback=mx.callback.Speedometer(batch_size, 1))

    # AMP conversion with cast_optional_params set to true
    result_model = amp.convert_bucketing_module(model, cast_optional_params=True)
    result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False)
    result_model.score(data_val, mx.metric.Perplexity(invalid_label),
                       batch_end_callback=mx.callback.Speedometer(batch_size, 1))
def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
    if subgraph_backend is not None:
        os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
        check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend),
                                                     mx_uint(len(op_names)),
                                                     c_str_array(op_names)))
    arg_shapes, _, aux_shapes = sym.infer_shape()
    if subgraph_backend is None:
        arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
        aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
    else:
        arg_array = None
        aux_array = None
    exe = sym.bind(ctx=mx.current_context(),
                   args=arg_array if subgraph_backend is None else original_exec.arg_arrays,
                   aux_states=aux_array if subgraph_backend is None else original_exec.aux_arrays,
                   grad_req='null')
    exe.forward()
    if subgraph_backend is not None:
        check_call(_LIB.MXRemoveSubgraphPropertyOpNames(c_str(subgraph_backend)))
        del os.environ['MXNET_SUBGRAPH_BACKEND']
    return exe
def test_conv_model_quantization(self):
    """Use the Conv model to test KL calibration and a user-specified evaluation function."""
    for shape in [(500, 3, 224, 224)]:
        arg_shapes, _, _ = self.conv_model.infer_shape(data=shape)
        mod = mx.mod.Module(symbol=self.conv_model, context=mx.current_context())
        mod.bind(for_training=False, data_shapes=[('data', arg_shapes[0])])
        mod.init_params()
        arg_params, aux_params = mod.get_params()
        data = mx.nd.random.uniform(low=self.data_low, high=self.data_high,
                                    shape=shape).astype('float32')
        calib_data = mx.io.NDArrayIter(data=data, batch_size=shape[0])
        fp32_model = (self.conv_model, arg_params, aux_params)
        qmodel = self.quantizer_2(fp32_model, q_dataloader=calib_data,
                                  eval_dataloader=calib_data, eval_func=eval_func)

        # test inspected_tensor
        inspect_tensor = self.quantizer_2.strategy.adaptor.inspect_tensor
        inspected_tensor = inspect_tensor(fp32_model, calib_data,
                                          op_list=[('sg_mkldnn_conv_bn_act_0_output', 'CONV'),
                                                   ('data', 'input')],
                                          iteration_list=[0, 2, 4])
        inspected_qtensor = inspect_tensor(qmodel, calib_data,
                                           op_list=[('quantized_sg_mkldnn_conv_bn_act_0_output',
                                                     'CONV')],
                                           iteration_list=[0])

        self.assertNotEqual(len(inspected_tensor), 0)
        self.assertNotEqual(len(inspected_qtensor), 0)
        self.assertIsInstance(qmodel[0], mx.symbol.Symbol)
def check_quantize_whole_model(out_type):
    batch_size = 4
    data_shape = (batch_size, 4, 10, 10)
    data = mx.sym.Variable('data')
    conv0 = mx.sym.Convolution(data, kernel=(1, 1), num_filter=16, name='conv0')
    sym = mx.sym.Convolution(conv0, kernel=(1, 1), num_filter=16, name='conv1')
    sym_sg = sym.get_backend_symbol('MKLDNN_QUANTIZE')
    mod = Module(symbol=sym, label_names=None)
    mod.bind(for_training=False, data_shapes=[('data', data_shape)])
    mod.init_params(mx.init.Normal(0.5))
    arg_params, aux_params = mod.get_params()

    excluded_sym_names = []
    calib_data = mx.nd.random.uniform(shape=data_shape)
    calib_data = mx.io.NDArrayIter(data=calib_data)
    calib_data = DummyIter(calib_data)
    calib_layer = lambda name: name.endswith('_output')
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     ctx=mx.current_context(),
                                                                     excluded_sym_names=excluded_sym_names,
                                                                     quantized_dtype=out_type,
                                                                     calib_mode='naive',
                                                                     calib_data=calib_data,
                                                                     calib_layer=calib_layer,
                                                                     label_names=None,
                                                                     num_calib_examples=1)
    qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')
    check_qsym_forward(qsym, qarg_params, qaux_params, data_shape)
def test_gnmt_encoder():
    ctx = mx.current_context()
    for cell_type in ["lstm", "gru", "relu_rnn", "tanh_rnn"]:
        for num_layers, num_bi_layers in [(2, 1), (3, 0)]:
            for use_residual in [False, True]:
                encoder = GNMTEncoder(cell_type=cell_type, num_layers=num_layers,
                                      num_bi_layers=num_bi_layers, hidden_size=8, dropout=0.0,
                                      use_residual=use_residual, prefix='gnmt_encoder_')
                encoder.initialize(ctx=ctx)
                encoder.hybridize()
                for batch_size in [4]:
                    for seq_length in [5, 10]:
                        inputs_nd = mx.nd.random.normal(0, 1, shape=(batch_size, seq_length, 4),
                                                        ctx=ctx)
                        valid_length_nd = mx.nd.array(np.random.randint(1, seq_length,
                                                                        size=(batch_size,)),
                                                      ctx=ctx)
                        encoder_outputs, _ = encoder(inputs_nd, valid_length=valid_length_nd)
                        valid_length_npy = valid_length_nd.asnumpy()
                        rnn_output = encoder_outputs[0].asnumpy()
                        for i in range(batch_size):
                            if valid_length_npy[i] < seq_length - 1:
                                padded_out = rnn_output[i, int(valid_length_npy[i]):, :]
                                assert_almost_equal(padded_out, np.zeros_like(padded_out),
                                                    1E-6, 1E-6)
                        assert encoder_outputs[0].shape == (batch_size, seq_length, 8)
                        assert len(encoder_outputs[1]) == num_layers
def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
    arg_shapes, _, aux_shapes = sym.infer_shape()
    if subgraph_backend is None:
        arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
        aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
    else:
        arg_array = None
        aux_array = None
    exe = sym._bind(ctx=mx.current_context(),
                    args=arg_array if subgraph_backend is None else original_exec.arg_arrays,
                    aux_states=aux_array if subgraph_backend is None else original_exec.aux_arrays,
                    grad_req='null')
    exe.forward()
    return exe
def test_np_array_creation():
    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, _np.bool, _np.bool_,
              'int8', 'int32', 'float16', 'float32', 'float64', 'bool', None]
    objects = [
        [],
        (),
        [[1, 2], [3, 4]],
        _np.random.randint(-10, 10, size=rand_shape_nd(3)),
        _np.random.uniform(size=rand_shape_nd(3)),
        _np.random.uniform(size=(3, 0, 4))
    ]
    for dtype in dtypes:
        for src in objects:
            mx_arr = np.array(src, dtype=dtype)
            assert mx_arr.ctx == mx.current_context()
            if dtype is None:
                dtype = src.dtype if isinstance(src, _np.ndarray) else _np.float32
            if isinstance(src, mx.nd.NDArray):
                np_arr = _np.array(src.asnumpy(), dtype=dtype)
            else:
                np_arr = _np.array(src, dtype=dtype)
            assert mx_arr.dtype == np_arr.dtype
            assert same(mx_arr.asnumpy(), np_arr)
def test_mxnet_module_wrapper(data_frame):
    from datawig.imputer import _MXNetModule
    import mxnet as mx
    from datawig.iterators import ImputerIterDf

    feature_col, label_col = "feature", "label"
    df = data_frame(n_samples=100, feature_col=feature_col, label_col=label_col)
    label_encoders = [CategoricalEncoder(label_col)]
    data_encoders = [BowEncoder(feature_col)]
    data_featurizers = [BowFeaturizer(feature_col, vocab_size=100)]
    iter_train = ImputerIterDf(df, data_encoders, label_encoders)

    mod = _MXNetModule(mx.current_context(), label_encoders, data_featurizers,
                       final_fc_hidden_units=[])(iter_train)

    assert mod._label_names == [label_col]
    assert mod.data_names == [feature_col]
    # weights and biases
    assert len(mod._arg_params) == 2
def import_symb_block(
    num_inputs: int, model_dir: Path, model_name: str, epoch: int = 0
) -> mx.gluon.SymbolBlock:
    """
    Deserializes a hybridized Gluon `HybridBlock` as a `SymbolBlock`.

    Parameters
    ----------
    num_inputs
        The number of inputs of the serialized block.
    model_dir
        The path where the model is saved.
    model_name
        The name identifying the model.
    epoch
        The epoch number, which together with the `model_name` identifies the
        model parameters.

    Returns
    -------
    mx.gluon.SymbolBlock
        The deserialized block.
    """
    if num_inputs == 1:
        input_names = ['data']
    else:
        input_names = [f'data{i}' for i in range(num_inputs)]

    # FIXME: mx.gluon.SymbolBlock cannot infer float_type and uses default np.float32
    # FIXME: https://github.com/apache/incubator-mxnet/issues/11849
    return mx.gluon.SymbolBlock.imports(
        symbol_file=str(model_dir / f'{model_name}-symbol.json'),
        input_names=input_names,
        param_file=str(model_dir / f'{model_name}-{epoch:04}.params'),
        ctx=mx.current_context(),
    )
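# Hedged usage sketch, assuming the block was serialized beforehand with
# `block.export(str(model_dir / model_name), epoch=0)`; the path, name, and
# input shape below are hypothetical.
def _example_import_symb_block():
    block = import_symb_block(num_inputs=1, model_dir=Path("/tmp/model"), model_name="net")
    return block(mx.nd.ones(shape=(1, 10)))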
def test_transformer_encoder():
    ctx = mx.current_context()
    for num_layers in range(1, 3):
        for output_attention in [True, False]:
            for use_residual in [False, True]:
                encoder = TransformerEncoder(num_layers=num_layers, max_length=10, units=16,
                                             hidden_size=32, num_heads=8, dropout=0.0,
                                             use_residual=use_residual,
                                             output_attention=output_attention,
                                             prefix='transformer_encoder_')
                encoder.initialize(ctx=ctx)
                encoder.hybridize()
                for batch_size in [4]:
                    for seq_length in [5, 10]:
                        inputs_nd = mx.nd.random.normal(0, 1, shape=(batch_size, seq_length, 16),
                                                        ctx=ctx)
                        valid_length_nd = mx.nd.array(np.random.randint(1, seq_length,
                                                                        size=(batch_size,)),
                                                      ctx=ctx)
                        encoder_outputs, additional_outputs = encoder(inputs_nd,
                                                                      valid_length=valid_length_nd)
                        valid_length_npy = valid_length_nd.asnumpy()
                        encoder_outputs = encoder_outputs.asnumpy()
                        for i in range(batch_size):
                            if valid_length_npy[i] < seq_length - 1:
                                padded_out = encoder_outputs[i, int(valid_length_npy[i]):, :]
                                assert_almost_equal(padded_out, np.zeros_like(padded_out),
                                                    1E-6, 1E-6)
                        assert encoder_outputs.shape == (batch_size, seq_length, 16)
                        if output_attention:
                            assert len(additional_outputs) == num_layers
                            attention_out = additional_outputs[0][0].asnumpy()
                            assert attention_out.shape == (batch_size, 8, seq_length, seq_length)
                            for i in range(batch_size):
                                mem_v_len = int(valid_length_npy[i])
                                if mem_v_len < seq_length - 1:
                                    assert (attention_out[i, :, :, mem_v_len:] == 0).all()
                                if mem_v_len > 0:
                                    assert_almost_equal(attention_out[i, :, :, :].sum(axis=-1),
                                                        np.ones(attention_out.shape[1:3]))
                        else:
                            assert len(additional_outputs) == 0
def test_bf16_offline_casting():
    class TestNet(nn.HybridBlock):
        def __init__(self):
            super().__init__()
            self.lp16_op1 = nn.Conv2D(4, 3)
            self.lp16_op2 = nn.Conv2DTranspose(4, 3)
            self.fp32_op = nn.Dense(4)

        def forward(self, x):
            x = self.lp16_op1(x)
            x = self.lp16_op2(x)
            x = x.reshape(x.shape[0], -1)
            x = self.fp32_op(x)
            return x

    net = TestNet()
    net.initialize()
    data_example = mx.np.random.uniform(-1, 1, (4, 3, 16, 16))
    lp_net = amp.convert_hybrid_block(net, data_example, target_dtype=bfloat16,
                                      target_dtype_ops=['Convolution'],
                                      fp32_ops=['FullyConnected'],
                                      cast_params_offline=True,
                                      device=mx.current_context())
    lp_net(data_example)
    for name, data in lp_net.collect_params().items():
        assert data.dtype == (np.float32 if 'fp32_op' in name else bfloat16)
def finish_update(self, optimizer: Optimizer):
    params = []
    grads = []
    shapes = []
    ctx = mx.current_context()
    for key, value in self._model.collect_params().items():
        grad = cast(FloatsXd, mxnet2xp(value.grad(ctx)))
        param = cast(FloatsXd, mxnet2xp(value.data(ctx)))
        params.append(param.ravel())
        grads.append(grad.ravel())
        shapes.append((param.size, param.shape))
    if not params:
        return
    xp = get_array_module(params[0])
    flat_params, flat_grads = optimizer(
        (self.id, "mxnet-shim"), xp.concatenate(params), xp.concatenate(grads)
    )
    start = 0
    for key, value in self._model.collect_params().items():
        size, shape = shapes.pop(0)
        param = flat_params[start:start + size].reshape(shape)
        value.set_data(xp2mxnet(param))
        value.zero_grad()
        start += size
def check_amp_fuse(net, data_example, expected_sym=None, quantized_nodes=[], rtol=0.05):
    net.hybridize()
    out_ref = net(*data_example)

    net.optimize_for(data_example, backend=SG_PASS_NAME)  # amp pass works only on oneDNN nodes
    lp_net = amp.convert_hybrid_block(net, data_example, target_dtype=AMP_DTYPE,
                                      excluded_sym_names=quantized_nodes,
                                      cast_params_offline=True,
                                      device=mx.current_context())
    lp_net.optimize_for(data_example, backend=AMP_SG_PASS_NAME)
    out_lp_net = lp_net(*data_example)

    # check outputs
    out_ref = [out_ref] if not isinstance(out_ref, list) else out_ref
    out_lp_net = [out_lp_net] if not isinstance(out_lp_net, list) else out_lp_net
    for ref_out, lp_out in zip(out_ref, out_lp_net):
        assert_almost_equal(ref_out, lp_out, rtol=rtol, atol=1.0)

    # check graph
    if expected_sym is not None:
        lp_symnet = lp_net.export(None, remove_amp_cast=False)[0]
        same_graph_structure(lp_symnet, expected_sym, True)

    # check amp with quantization
    check_amp_with_quantization(net, data_example, quantized_nodes)
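# Hedged usage sketch for `check_amp_fuse`; the Dense block and input below
# are hypothetical, and the expected fused graph is left unchecked
# (expected_sym=None).
def _example_check_amp_fuse_dense():
    net = mx.gluon.nn.Dense(16)
    net.initialize()
    check_amp_fuse(net, [mx.np.ones((1, 8))])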
def check_quantize_model(qdtype):
    if is_test_for_native_cpu():
        print('skipped testing quantized_residual_unit for native cpu since it is not supported yet')
        return
    elif qdtype == 'int8' and is_test_for_mkldnn():
        print('skipped testing quantized_residual_unit for mkldnn cpu int8 since it is not supported yet')
        return
    elif qdtype == 'uint8' and is_test_for_gpu():
        print('skipped testing quantized_residual_unit for gpu uint8 since it is not supported yet')
        return

    def check_params(params, qparams, qsym=None):
        if qsym is None:
            assert len(params) == len(qparams)
            for k, v in params.items():
                assert k in qparams
                assert same(v.asnumpy(), qparams[k].asnumpy())
        else:
            qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
            assert len(qparams) == len(qparams_ground_truth)
            for k, v in qparams_ground_truth.items():
                assert k in qparams
                assert same(v.asnumpy(), qparams[k].asnumpy())

    def check_qsym_calibrated(qsym):
        attrs = qsym.attr_dict()
        for k, v in attrs.items():
            if k.find('requantize_') != -1:
                assert 'min_calib_range' in v
                assert 'max_calib_range' in v

    def check_qsym_qdtype(qsym, qdtype):
        attrs = qsym.attr_dict()
        for k, v in attrs.items():
            if k.find('_quantize') != -1:
                assert 'out_type' in v
                assert v['out_type'] == qdtype

    def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape):
        mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
        mod.bind(for_training=False,
                 data_shapes=[('data', data_shape)],
                 label_shapes=[('softmax_label', label_shape)])
        mod.set_params(qarg_params, qaux_params)
        data = [mx.random.uniform(-1.0, 1.0, shape=shape) for _, shape in mod.data_shapes]
        batch = mx.io.DataBatch(data, [])
        mod.forward(batch, is_train=False)
        for output in mod.get_outputs():
            output.wait_to_read()

    sym = get_fp32_residual()
    mod = Module(symbol=sym)
    batch_size = 4
    data_shape = (batch_size, 4, 10, 10)
    label_shape = (batch_size, 10)
    mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
    mod.init_params()
    arg_params, aux_params = mod.get_params()
    excluded_sym_names = []
    if mx.current_context() == mx.cpu():
        excluded_sym_names += ['fc']
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     excluded_sym_names=excluded_sym_names,
                                                                     ctx=mx.current_context(),
                                                                     quantized_dtype=qdtype,
                                                                     calib_mode='none')
    check_params(arg_params, qarg_params, qsym)
    check_params(aux_params, qaux_params)
    check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape)

    calib_data = mx.nd.random.uniform(shape=data_shape)
    calib_data = NDArrayIter(data=calib_data)
    calib_data = DummyIter(calib_data)
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     excluded_sym_names=excluded_sym_names,
                                                                     ctx=mx.current_context(),
                                                                     quantized_dtype=qdtype,
                                                                     calib_mode='naive',
                                                                     calib_data=calib_data,
                                                                     num_calib_examples=20)
    check_params(arg_params, qarg_params, qsym)
    check_params(aux_params, qaux_params)
    check_qsym_calibrated(qsym)
    check_qsym_qdtype(qsym, qdtype)
    check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape)
def check_quantize(sym, data_shape, out_type, name='conv', check_calibration=True,
                   gluon_forward=False, check_scale_align=False):
    if name in config:
        name = config[name][OP_NAME]
    sym_sg = sym.get_backend_symbol(QUANTIZE_SG_PASS_NAME)
    mod = Module(symbol=sym, label_names=None)
    mod.bind(for_training=False, data_shapes=[('data', data_shape)])
    mod.init_params(mx.init.Normal(0.5))
    arg_params, aux_params = mod.get_params()
    if out_type == 'uint8':
        data = [mx.random.uniform(0.0, 1.0, shape=shape, ctx=mx.current_context())
                for _, shape in mod.data_shapes]
    else:
        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.current_context())
                for _, shape in mod.data_shapes]
    batch = mx.io.DataBatch(data, [])
    mod.forward(batch, is_train=False)
    for output in mod.get_outputs():
        output.wait_to_read()
    ref_out = mod.get_outputs()

    excluded_sym_names = []
    if mx.current_context() == mx.cpu() and gluon_forward == True:
        excluded_sym_names += ['sg_mkldnn_fully_connected_0']
        excluded_sym_names += ['fc_softmax']

    calib_data = CalibIter(batch, data_shape, 1)
    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg,
                                                                     arg_params=arg_params,
                                                                     aux_params=aux_params,
                                                                     ctx=mx.current_context(),
                                                                     excluded_sym_names=excluded_sym_names,
                                                                     quantized_dtype=out_type,
                                                                     calib_mode='naive',
                                                                     calib_data=calib_data,
                                                                     calib_layer=None,
                                                                     label_names=None,
                                                                     num_calib_examples=1)
    qsym = qsym.get_backend_symbol(QUANTIZE_SG_PASS_NAME)
    if check_calibration:
        check_qsym_calibrated(qsym, out_type, name=name)
    if check_scale_align:
        check_qsym_scale_align(qsym)
    if gluon_forward == True:
        check_qsym_gluon_forward(qsym, qarg_params, qaux_params, data_shape)
    else:
        quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape)
        for i in range(len(ref_out)):
            min_range = mx.nd.min(ref_out[i]).asscalar()
            max_range = mx.nd.max(ref_out[i]).asscalar()
            atol = 0.1 * max(abs(min_range), abs(max_range))
            assert_almost_equal_with_err(quantized_out[i].asnumpy(), ref_out[i].asnumpy(),
                                         rtol=0.1, atol=atol, etol=0.2)
        check_qsym_dummy_forward(qsym, batch, data_shape)
def _test_backward_template(version: str):
    """
    This method serves as a base method for all backward tests.

    It tests, for a given version of Rational, whether Rational can be integrated into a
    small MxNet model. It also tests whether the rational activation function's
    coefficients (weights) are updated.

    :param version: version of the rational activation function
    """
    # define network
    net = nn.Sequential()
    net.add(nn.Dense(128, activation='relu'))
    net.add(nn.Dense(64, activation='relu'))
    # insert a rational activation function as a layer
    fut = Rational(version=version)
    net.add(fut)
    net.add(nn.Dense(10))

    gpus = mx.test_utils.list_gpus()
    # include the current context, so parameters can be read from this test method
    ctx = [mx.gpu(), mx.current_context()] if gpus else [mx.cpu(0), mx.cpu(1)]
    net.initialize(ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02})

    # copy the old coefficient values
    nums_before_training = fut.numerator.data(mx.current_context()).asnumpy()
    dens_before_training = fut.denominator.data(mx.current_context()).asnumpy()

    # Use Accuracy as the evaluation metric.
    metric = mx.metric.Accuracy()
    softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    # Reset the train data iterator.
    train_data.reset()
    # Loop over the train data iterator.
    for batch in train_data:
        # Splits train data into multiple slices along batch_axis
        # and copies each slice into a context.
        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0,
                                          even_split=False)
        # Splits train labels into multiple slices along batch_axis
        # and copies each slice into a context.
        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0,
                                           even_split=False)
        outputs = []
        # Inside training scope
        with ag.record():
            for x, y in zip(data, label):
                z = net(x)
                # Computes softmax cross entropy loss.
                loss = softmax_cross_entropy_loss(z, y)
                # Back-propagate the error for one iteration.
                loss.backward()
                outputs.append(z)
        # Updates internal evaluation
        metric.update(label, outputs)
        # Make one step of parameter update. Trainer needs to know the
        # batch size of data to normalize the gradient by 1/batch_size.
        trainer.step(batch.data[0].shape[0])
        # exit after the first loop
        break

    # Gets the evaluation result.
    name, acc = metric.get()
    # Reset evaluation result to initial state.
    metric.reset()
    print('training acc: %s=%f' % (name, acc))

    # copy the new coefficient values
    nums_after_training = fut.numerator.data(mx.current_context()).asnumpy()
    dens_after_training = fut.denominator.data(mx.current_context()).asnumpy()

    # check that at least one coefficient changed in the numerators
    assert not np.all(np.equal(nums_before_training, nums_after_training))
    # check that at least one coefficient changed in the denominators
    assert not np.all(np.equal(dens_before_training, dens_after_training))
def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
    if mx.current_context().device_type != 'gpu':
        hasMKL = False
        for key in os.environ.keys():
            if operator.eq(key, "BUILD_TAG"):
                if os.environ['BUILD_TAG'].find("MKL") != -1:
                    hasMKL = True
                break
        if not hasMKL:
            print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by the MKL BLAS library')
            return
    elif qdtype == 'uint8' and is_test_for_gpu():
        print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
        return

    data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
    fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias,
                                    flatten=flatten)
    arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
    arg_names = fc_fp32.list_arguments()
    fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
    if qdtype == 'uint8':
        data_low = 0.0
        data_high = 63.0
    else:
        data_low = -63.0
        data_high = 63.0
    fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                 shape=data_shape).astype('int32')
    fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                 shape=arg_shapes[1]).astype('int32')
    if not no_bias:
        fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                     shape=arg_shapes[2]).astype('int32')
    output = fc_fp32_exe.forward()[0]

    qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
    fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
                                                       no_bias=no_bias, flatten=flatten)
    qarg_names = fc_int8.list_arguments()
    type_dict = {qarg_names[1]: 'int8'}
    if not no_bias:
        type_dict.update({qarg_names[2]: 'int8'})
    fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict,
                                      grad_req='null')
    fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
    fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
    quantized_range = 127.0
    if no_bias:
        fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
        fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
        fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
        fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
    else:
        fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
        fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
        fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
        fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
        fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
        fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
        fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
    qoutput, min_range, max_range = fc_int8_exe.forward()

    if no_bias:
        assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
    else:
        # with a bias added, the accuracy loss should not be greater than one
        diff = mx.nd.abs(output - qoutput.astype(output.dtype))
        cond = mx.nd.lesser(2, diff).sum().asscalar()
        assert cond == 0
def _current_context(self):
    if has_gpu:
        return mx.gpu(hvd.local_rank())
    else:
        return mx.current_context()
# pylint: skip-file
import mxnet as mx

a = mx.narray.create((3000, 4000))
b = mx.narray.create((3000, 4000))
a.numpy[:] = 10
b.numpy[:] = 11
print(a.numpy)
c = b * a
cc = mx.op.mul(b, a)
print(c.context)
print(cc.numpy)
d = c.copyto(mx.Context('cpu', 0))
print(d.numpy)

with mx.Context('gpu', 0) as ctx:
    # gpu operations
    print(mx.current_context())
    print(ctx)
    a_gpu = a.copyto(ctx)
    b_gpu = b.copyto(ctx)
    c_gpu = b * a
    d_cpu = c_gpu.copyto(mx.current_context())
    print(d_cpu.numpy)
def is_test_for_gpu():
    return mx.current_context().device_type == 'gpu'
def check_fusion(sym, data_shape, attrs_dict, check_fp32_fusion=True, check_quantization=True,
                 out_types=['uint8', 'int8', 'auto']):
    if check_fp32_fusion:
        sym_sg = sym.get_backend_symbol(SG_PASS_NAME)
        for name, attrs in attrs_dict.items():
            if name in config:
                op_name = config[name][OP_NAME]
            else:
                op_name = name
            assert ''.join(sym_sg.get_internals().list_outputs()).find(op_name) != -1
            if len(attrs):
                found = False
                for k, v in sym_sg.attr_dict().items():
                    if k.find(op_name) != -1:
                        found = True
                        for attr_name, attr_value in attrs.items():
                            assert v[attr_name].lower() == attr_value.lower()
                assert found

        arg_shapes, _, aux_shapes = sym.infer_shape()
        arg_array = [mx.nd.random.uniform(-1.0, 1.0, shape=shape) for shape in arg_shapes]
        aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
        exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array,
                       grad_req='null')
        exe.forward()

        os.environ['MXNET_SUBGRAPH_BACKEND'] = SG_PASS_NAME
        exe_sg = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array,
                          grad_req='null')
        exe_sg.forward()
        del os.environ['MXNET_SUBGRAPH_BACKEND']

        for i in range(len(exe.outputs)):
            assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(),
                                rtol=1e-3, atol=1e-1)

    if check_quantization:
        # fp32 to int8
        for out_type in out_types:
            check_quantize(sym, data_shape, out_type, name=op_name)
            # TODO(ciyong): quantized fc saves its params in int8, while gluon treats the
            # default variable from the symbol file as fp32, which results in mismatched
            # param dtypes. Skip quantized fc in the gluon pass.
            if name != 'fc':
                check_quantize(sym, data_shape, out_type, name=op_name, gluon_forward=True)
def is_test_for_native_cpu():
    return (mx.current_context().device_type == 'cpu'
            and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') is None)
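# Typical guard usage at the top of a quantization test, mirroring the skip
# pattern used elsewhere in this suite (the test body is a placeholder):
def _example_guarded_test(qdtype):
    if is_test_for_native_cpu():
        print('skipped: not supported on native cpu yet')
        return
    if qdtype == 'uint8' and is_test_for_gpu():
        print('skipped: gpu uint8 not supported yet')
        return
    # ... the actual quantized checks would run here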