def test_collect_tensor_ops(self): init_net = core.Net('init_net') blobs = ['blob_1', 'blob_2', 'blob_3'] bvec_map = {} ONE = init_net.ConstantFill([], 'ONE', shape=[1, 2], value=1) for b in blobs: init_net.ConstantFill([], [b], shape=[1, 2], value=0) bvec_map[b] = b + '_vec' init_net.CreateTensorVector([], [bvec_map[b]]) reader_net = core.Net('reader_net') for b in blobs: reader_net.Add([b, ONE], [b]) collect_net = core.Net('collect_net') num_to_collect = 1000 max_example_to_cover = 100000 bvec = [bvec_map[b] for b in blobs] collect_net.CollectTensor( bvec + blobs, bvec, num_to_collect=num_to_collect, ) print('Collect Net Proto: {}'.format(collect_net.Proto())) plan = core.Plan('collect_data') plan.AddStep(core.execution_step('collect_init', init_net)) plan.AddStep( core.execution_step('collect_data', [reader_net, collect_net], num_iter=max_example_to_cover)) workspace.RunPlan(plan) # concat the collected tensors concat_net = core.Net('concat_net') bconcated_map = {} bsize_map = {} for b in blobs: bconcated_map[b] = b + '_concated' bsize_map[b] = b + '_size' concat_net.ConcatTensorVector([bvec_map[b]], [bconcated_map[b]]) concat_net.TensorVectorSize([bvec_map[b]], [bsize_map[b]]) workspace.RunNetOnce(concat_net) # check data reference_result = workspace.FetchBlob(bconcated_map[blobs[0]]) self.assertEqual(reference_result.shape, (min(num_to_collect, max_example_to_cover), 2)) size = workspace.FetchBlob(bsize_map[blobs[0]]) self.assertEqual(tuple(), size.shape) self.assertEqual(min(num_to_collect, max_example_to_cover), size.item()) hist, _ = np.histogram(reference_result[:, 0], bins=10, range=(1, max_example_to_cover)) print('Sample histogram: {}'.format(hist)) self.assertTrue(all(hist > 0.6 * (num_to_collect / 10))) for i in range(1, len(blobs)): result = workspace.FetchBlob(bconcated_map[blobs[i]]) self.assertEqual(reference_result.tolist(), result.tolist())
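# The histogram check above only passes if the collected rows are spread roughly
# uniformly over everything the reader produced. A minimal sketch of classic
# reservoir sampling, which CollectTensor is assumed to approximate here; the
# helper below is illustrative and not part of the Caffe2 API.
import random

def reservoir_sample(stream, k):
    reservoir = []
    for i, item in enumerate(stream):
        if i < k:
            reservoir.append(item)
        else:
            j = random.randint(0, i)  # keep this item with probability k / (i + 1)
            if j < k:
                reservoir[j] = item
    return reservoir

sample = reservoir_sample(range(100000), 1000)  # roughly uniform over the stream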
def test_groupwise_dnnlowp_conv_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, prepack_weight, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) assume((not prepack_weight) or order == "NHWC") X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, groupwise_quantization=True, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP"), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine do_prepack_weight = engine == "DNNLOWP" and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) if do_prepack_weight: X_min = 0 if X.size == 0 else X.min() X_max = 0 if X.size == 0 else X.max() x_q_param = hardcode_scale_zp.choose_quantization_params( X_min, X_max) inputs = ["W"] if do_dequantize: inputs += ["b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, quantize_groupwise=1, preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else "W", "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, quantize_groupwise=1, device_option=gc, ) if do_dequantize or do_prepack_weight: # groupwise quantization only works with static quantization # so we need to set quantization parameters dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([dequantize]) run_conv_or_fc(self, init_net, net, X, W, b, op_type, engine, order, gc, outputs) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
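# A hedged numpy sketch of how an affine uint8 scale/zero-point pair can be
# derived from an observed (min, max) activation range, in the spirit of the
# choose_quantization_params call above. The function name and rounding details
# are illustrative assumptions, not the hardcode_scale_zp implementation.
import numpy as np

def choose_scale_zero_point_sketch(x_min, x_max, qmin=0, qmax=255):
    x_min, x_max = min(0.0, x_min), max(0.0, x_max)  # range must contain zero
    scale = (x_max - x_min) / (qmax - qmin) or 1.0
    zero_point = int(np.clip(np.round(qmin - x_min / scale), qmin, qmax))
    return scale, zero_point

scale, zp = choose_scale_zero_point_sketch(-77.0, 178.0)
x = np.array([-77.0, 0.0, 178.0], dtype=np.float32)
x_q = np.clip(np.round(x / scale) + zp, 0, 255).astype(np.uint8)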
def test_dnnlowp_elementwise_mul_int( self, N, is_empty, in_quantized, out_quantized, in_place, gc, dc ): if is_empty: N = 0 # FIXME: DNNLOWP Mul doesn't support inplace operation and # dequantize_output=1 at the same time if in_place[0] or in_place[1]: in_quantized = True out_quantized = True # All inputs have scale 1, so exactly represented after quantization min_ = -100 max_ = min_ + 255 A = np.round(np.random.rand(N) * (max_ - min_) + min_) A = A.astype(np.float32) if N != 0: A[0] = min_ A[1] = max_ B = np.round(np.random.rand(N) * 255 - 128).astype(np.float32) if N != 0: B[0] = -128 B[1] = 127 Output = collections.namedtuple("Output", ["Y", "engine"]) outputs = [] engine_list = ["", "DNNLOWP"] for engine in engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize: quantize_A = core.CreateOperator( "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc ) net.Proto().op.extend([quantize_A]) quantize_B = core.CreateOperator( "Quantize", ["B"], ["B_q"], engine=engine, device_option=gc ) net.Proto().op.extend([quantize_B]) out = "Y" if in_place[0]: out = "A" elif in_place[1]: out = "B" mul = core.CreateOperator( "Mul", ["A_q", "B_q"] if do_quantize else ["A", "B"], [(out + "_q") if do_dequantize else out], dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) net.Proto().op.extend([mul]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", [out + "_q"], [out], engine=engine, device_option=gc ) net.Proto().op.extend([dequantize]) self.ws.create_blob("A").feed(A, device_option=gc) self.ws.create_blob("B").feed(B, device_option=gc) self.ws.run(net) outputs.append(Output(Y=self.ws.blobs[out].fetch(), engine=engine)) check_quantized_results_close(outputs)
def CreateModel(self): log.debug("Start training") model = cnn.CNNModelHelper(name="char_rnn") input_blob, seq_lengths, hidden_init, cell_init, target = \ model.net.AddExternalInputs( 'input_blob', 'seq_lengths', 'hidden_init', 'cell_init', 'target', ) hidden_output_all, self.hidden_output, _, self.cell_state = LSTM( model, input_blob, seq_lengths, (hidden_init, cell_init), self.D, self.hidden_size, scope="LSTM") output = model.FC(hidden_output_all, None, dim_in=self.hidden_size, dim_out=self.D, axis=2) # axis is 2 as first two are T (time) and N (batch size). # We treat them as one big batch of size T * N softmax = model.Softmax(output, 'softmax', axis=2) softmax_reshaped, _ = model.Reshape(softmax, ['softmax_reshaped', '_'], shape=[-1, self.D]) # Create a copy of the current net. We will use it on the forward # pass where we don't need loss and backward operators self.forward_net = core.Net(model.net.Proto()) xent = model.LabelCrossEntropy([softmax_reshaped, target], 'xent') # Loss is average both across batch and through time # Thats why the learning rate below is multiplied by self.seq_length loss = model.AveragedLoss(xent, 'loss') model.AddGradientOperators([loss]) # Hand made SGD update. Normally one can use helper functions # to build an optimizer ITER = model.Iter("iter") LR = model.LearningRate(ITER, "LR", base_lr=-0.1 * self.seq_length, policy="step", stepsize=1, gamma=0.9999) ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0) # Update weights for each of the model parameters for param in model.params: param_grad = model.param_to_grad[param] model.WeightedSum([param, ONE, param_grad, LR], param) self.model = model self.predictions = softmax self.loss = loss self.prepare_state = core.Net("prepare_state") self.prepare_state.Copy(self.hidden_output, hidden_init) self.prepare_state.Copy(self.cell_state, cell_init)
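# A small numpy illustration of the hand-made SGD step built above: WeightedSum
# overwrites each parameter with param * ONE + grad * LR, and because LR comes
# from a negative base_lr this is ordinary gradient descent. The concrete
# numbers below are made up for illustration.
import numpy as np

param = np.array([0.5, -0.2], dtype=np.float32)
grad = np.array([0.1, 0.4], dtype=np.float32)
ONE, LR = 1.0, -0.1  # negative learning rate, mirroring base_lr=-0.1 * seq_length
param = param * ONE + grad * LR  # what WeightedSum([param, ONE, grad, LR]) computes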
def test_dnnlowp_average_pool( self, ndim, stride, pad, kernel, size, input_channels, batch_size, order, in_quantized, gc, dc, ): kernel = 2 # Only kernel size 2 is supported assume(kernel <= size) assume(pad < kernel) C = input_channels N = batch_size strides = (stride, ) * ndim pads = (pad, ) * (ndim * 2) kernels = (kernel, ) * ndim sizes = (size, ) * ndim # X has scale 1, so no input quantization error min_ = -100 max_ = min_ + 255 if order == "NCHW": X = np.round( np.random.rand(*((N, C) + sizes)) * (max_ - min_) + min_) X = X.astype(np.float32) X[(0, ) * (ndim + 2)] = min_ X[(0, ) * (ndim + 1) + (1, )] = max_ elif order == "NHWC": X = np.round( np.random.rand(*((N, ) + sizes + (C, ))) * (max_ - min_) + min_) X = X.astype(np.float32) X[(0, ) * (ndim + 2)] = min_ X[(0, 1) + (0, ) * ndim] = max_ Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("AveragePool", ""), ("AveragePool", "DNNLOWP"), ("Int8AveragePool", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) max_pool = core.CreateOperator( op_type, ["X_q" if do_quantize else "X"], ["Y_q" if engine == "DNNLOWP" else "Y"], strides=strides, kernels=kernels, pads=pads, order=order, engine=engine, device_option=gc, ) net.Proto().op.extend([max_pool]) if engine == "DNNLOWP": dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
def write_ex(self, fields, local_init_net, local_finish_net, status):
    self._wrapper._new_writer(self.schema(), local_init_net)
    enqueue_net = core.Net('enqueue')
    enqueue(enqueue_net, self._wrapper.queue(), fields, status)
    return [enqueue_net]
def test_dnnlowp_conv_acc16_outlier( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, in_quantized, out_quantized, weight_quantized, prepack_weight, nbits_in_non_outlier, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) assume(size >= dilation * (kernel - 1) + 1) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group if nbits_in_non_outlier == 0: X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) else: X_min = 0 if preserve_activation_sparsity else -77 X_max = X_min + 255 X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min X = np.round(X).astype(np.float32) X[..., 0] = X_min X[0, 0, 0, 1] = X_max if preserve_weight_sparsity: W_min = -128 W_max = 100 else: W_min = -100 W_max = W_min + 255 W = (np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4 - 2 + W_min + 128) W = np.round(W).astype(np.float32) W[0, 0, 0, 0] = W_min W[1, 0, 0, 0] = W_max W[..., 1] = W_min + 128 # No input quantization error in bias b = np.round(np.random.randn(output_channels)).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP_ACC16"), ("Int8Conv", "DNNLOWP_ACC16"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = "DNNLOWP" in engine and weight_quantized do_prepack_weight = "DNNLOWP" in engine and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine="DNNLOWP", device_option=gc, ) net.Proto().op.extend([quantize]) x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity) init_net.Proto().op.extend([int8_given_tensor_fill]) # Bias int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) init_net.Proto().op.extend([int8_bias_tensor_fill]) if do_prepack_weight: inputs = ["W_q" if do_quantize_weight else "W"] if do_dequantize: inputs += ["b_q" if do_quantize_weight else "b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, nbits_in_non_outlier=nbits_in_non_outlier, preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else ("W_q" if do_quantize_weight else "W"), "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, dequantize_output=not do_dequantize, nbits_in_non_outlier=nbits_in_non_outlier, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, 
device_option=gc, ) if do_dequantize or do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(init_net) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_imageinput(self, size_tuple, means, stds, gc, dc): # TODO: Does not test on GPU and does not test use_gpu_transform # WARNING: Using ModelHelper automatically does NHWC to NCHW # transformation if needed. width, height, minsize, crop = size_tuple means = [float(m) for m in means] stds = [float(s) for s in stds] out_dir = tempfile.mkdtemp() count_images = 2 # One with bounding box and one without expected_images = create_test(out_dir, width=width, height=height, default_bound=(3, 5, height - 3, width - 5), minsize=minsize, crop=crop, means=means, stds=stds, count=count_images) for device_option in dc: with hu.temp_workspace(): reader_net = core.Net('reader') reader_net.CreateDB([], 'DB', db=out_dir, db_type="lmdb") workspace.RunNetOnce(reader_net) imageop = core.CreateOperator( 'ImageInput', ['DB'], ["data", "label"], batch_size=count_images, color=3, minsize=minsize, crop=crop, is_test=True, bounding_ymin=3, bounding_xmin=5, bounding_height=height - 3, bounding_width=width - 5, mean_per_channel=means, std_per_channel=stds, use_gpu_transform=(device_option.device_type == 1)) imageop.device_option.CopyFrom(device_option) main_net = core.Net('main') main_net.Proto().op.extend([imageop]) workspace.RunNetOnce(main_net) l = workspace.FetchBlob('label') result = workspace.FetchBlob('data').astype(np.int32) # If we don't use_gpu_transform, the output is in NHWC # Our reference output is CHW so we swap if device_option.device_type != 1: expected = [ img.swapaxes(0, 1).swapaxes(1, 2) for img in expected_images ] else: expected = expected_images for i in range(count_images): self.assertEqual(l[i], i) self.assertEqual((expected[i] - result[i] > 1).sum(), 0) # End for # End with # End for shutil.rmtree(out_dir)
def test_dnnlowp_conv_relu_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, share_col_buffer, gc, dc, ): assume(group == 1 or dilation == 1) X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("ConvRelu", "DNNLOWP"), ("ConvRelu", "DNNLOWP_16"), ("Int8ConvRelu", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") if "DNNLOWP" in engine: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) conv = core.CreateOperator( op_type, ["X_q", "W", "b"], ["Y_q"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, engine=engine, shared_buffer=(1 if share_col_buffer else 0), group=group, device_option=gc, ) net.Proto().op.extend([conv]) dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) else: conv = core.CreateOperator( op_type, ["X", "W", "b"], ["Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, shared_buffer=(1 if share_col_buffer else 0), engine=engine, group=group, device_option=gc, ) net.Proto().op.extend([conv]) relu = core.CreateOperator("Relu", ["Y"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([relu]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs)
def create_init_net(self, name):
    init_net = core.Net(name)
    self._add_global_constants(init_net)
    return init_net
def test_meta_net_def_net_runs(self): for param, value in viewitems(self.params): workspace.FeedBlob(param, value) extra_init_net = core.Net('extra_init') extra_init_net.ConstantFill('data', 'data', value=1.0) pem = pe.PredictorExportMeta( predict_net=self.predictor_export_meta.predict_net, parameters=self.predictor_export_meta.parameters, inputs=self.predictor_export_meta.inputs, outputs=self.predictor_export_meta.outputs, shapes=self.predictor_export_meta.shapes, extra_init_net=extra_init_net, net_type='dag', ) db_type = 'minidb' db_file = tempfile.NamedTemporaryFile(delete=False, suffix=".{}".format(db_type)) pe.save_to_db(db_type=db_type, db_destination=db_file.name, predictor_export_meta=pem) workspace.ResetWorkspace() meta_net_def = pe.load_from_db( db_type=db_type, filename=db_file.name, ) self.assertTrue("data" not in workspace.Blobs()) self.assertTrue("y" not in workspace.Blobs()) init_net = pred_utils.GetNet(meta_net_def, pc.PREDICT_INIT_NET_TYPE) # 0-fills externalblobs blobs and runs extra_init_net workspace.RunNetOnce(init_net) self.assertTrue("data" in workspace.Blobs()) self.assertTrue("y" in workspace.Blobs()) print(workspace.FetchBlob("data")) np.testing.assert_array_equal(workspace.FetchBlob("data"), np.ones(shape=(1, 5))) np.testing.assert_array_equal(workspace.FetchBlob("y"), np.zeros(shape=(1, 10))) # Load parameters from DB global_init_net = pred_utils.GetNet(meta_net_def, pc.GLOBAL_INIT_NET_TYPE) workspace.RunNetOnce(global_init_net) # Run the net with a reshaped input and verify we are # producing good numbers (with our custom implementation) workspace.FeedBlob("data", np.random.randn(2, 5).astype(np.float32)) predict_net = pred_utils.GetNet(meta_net_def, pc.PREDICT_NET_TYPE) self.assertEqual(predict_net.type, 'dag') workspace.RunNetOnce(predict_net) np.testing.assert_array_almost_equal( workspace.FetchBlob("y"), workspace.FetchBlob("data").dot(self.params["y_w"].T) + self.params["y_b"])
def test_inject_copy_multi_use(self):
    net = core.Net("test")
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = 1

    with core.DeviceScope(device_option):
        net.Relu("data", "relu1")
    net.Relu("data", "relu2")
    with core.DeviceScope(device_option):
        net.Relu("data", "relu3")
    net.Relu("data", "relu4")
    device_option.cuda_gpu_id = 0
    with core.DeviceScope(device_option):
        net.Relu("data", "relu5")
    device_option.cuda_gpu_id = 1
    with core.DeviceScope(device_option):
        net.Relu("data", "relu6")

    new_net, _ = core.InjectCrossDeviceCopies(net)

    op = new_net._net.op[0]
    self.assertEqual(op.type, "CopyCPUToGPU")
    self.assertEqual(op.device_option.device_type, 1)
    self.assertEqual(op.device_option.cuda_gpu_id, 1)
    self.assertEqual(op.output[0], "data_cuda_1")
    op = new_net._net.op[1]
    self.assertEqual(op.type, "Relu")
    self.assertEqual(op.device_option.device_type, 1)
    self.assertEqual(op.device_option.cuda_gpu_id, 1)
    self.assertEqual(op.output[0], "relu1")
    op = new_net._net.op[2]
    self.assertEqual(op.type, "Relu")
    self.assertEqual(op.device_option.device_type, 0)
    self.assertEqual(op.output[0], "relu2")
    op = new_net._net.op[3]
    self.assertEqual(op.type, "Relu")
    self.assertEqual(op.device_option.device_type, 1)
    self.assertEqual(op.device_option.cuda_gpu_id, 1)
    self.assertEqual(op.input[0], "data_cuda_1")
    self.assertEqual(op.output[0], "relu3")
    op = new_net._net.op[4]
    self.assertEqual(op.type, "Relu")
    self.assertEqual(op.device_option.device_type, 0)
    self.assertEqual(op.output[0], "relu4")
    op = new_net._net.op[5]
    self.assertEqual(op.type, "CopyCPUToGPU")
    self.assertEqual(op.device_option.device_type, 1)
    self.assertEqual(op.device_option.cuda_gpu_id, 0)
    self.assertEqual(op.output[0], "data_cuda_0")
    op = new_net._net.op[6]
    self.assertEqual(op.type, "Relu")
    self.assertEqual(op.device_option.device_type, 1)
    self.assertEqual(op.device_option.cuda_gpu_id, 0)
    self.assertEqual(op.input[0], "data_cuda_0")
    self.assertEqual(op.output[0], "relu5")
    op = new_net._net.op[7]
    self.assertEqual(op.type, "Relu")
    self.assertEqual(op.device_option.device_type, 1)
    self.assertEqual(op.device_option.cuda_gpu_id, 1)
    self.assertEqual(op.input[0], "data_cuda_1")
    self.assertEqual(op.output[0], "relu6")
def testPartialClone(self): params = core.Net('params') p1 = params.ConstantFill([], ['p1']) workspace.CreateNet(params) workspace.RunNetOnce(params) n = core.Net('original') a1 = n.AddExternalInput('a1') a2 = n.AddExternalInput('a2') b1, b2 = n.Concat([a1, a2], ['b1', 'b2'], axis=0) c1 = n.Sum([b1, p1], ['c1']) c2 = n.Sum([b2], ['c2']) d = n.Sum([c1, c2], ['d']) # test that gradient ops are ignored when partial-cloning n.AddGradientOperators([d]) # test some in-place ops k = n.Sum([p1], ['k']) e = n.Sum([d], ['e']) e = n.Sum([e, k], [e]) e = n.Sum([e], [e]) f = n.Sum(e, ['f']) def net_assert(net, num_ops, inputs, outputs, internals): self.assertEqual(len(net.Proto().op), num_ops) self.assertEqual(set(net.Proto().external_input), inputs) self.assertEqual(set(net.Proto().external_output), outputs) all_blobs = set(net.Proto().external_input) all_blobs |= set(net.Proto().external_output) for op in net.Proto().op: all_blobs |= set(op.input) | set(op.output) self.assertEqual(all_blobs, inputs | outputs | internals) # create net to make sure its valid for input in inputs: workspace.FeedBlob(input, np.array([])) workspace.CreateNet(net) n2, (d22, ) = n.ClonePartial('f1', {a1: 'a11', a2: 'a22'}, [d]) net_assert(n2, 4, {'p1', 'a11', 'a22'}, {'f1/d'}, {'f1/b1', 'f1/b2', 'f1/c1', 'f1/c2', 'p1'}) self.assertTrue(isinstance(d22, core.BlobReference)) self.assertEqual(d22.Net(), n2) self.assertEqual(str(d22), 'f1/d') n3, (d22, ) = n.ClonePartial('f2', [b1, b2], [d]) net_assert(n3, 3, {'p1', 'b1', 'b2'}, {'f2/d'}, {'f2/c1', 'f2/c2', 'p1'}) self.assertEqual(str(d22), 'f2/d') n4, (c22, ) = n.ClonePartial('f3', [b1], [c1]) net_assert(n4, 1, {'p1', 'b1'}, {'f3/c1'}, {'p1'}) self.assertEqual(str(c22), 'f3/c1') n5, (c11, c22) = n.ClonePartial('f4', [b1, b2], [c1, c2]) net_assert(n5, 2, {'p1', 'b1', 'b2'}, {'f4/c1', 'f4/c2'}, {'p1'}) self.assertEqual(str(c11), 'f4/c1') self.assertEqual(str(c22), 'f4/c2') with self.assertRaises(AssertionError): n.ClonePartial('f4', [a1, a2, c2], [d]) n6, (e22, ) = n.ClonePartial('f5', [d], [e]) net_assert(n6, 4, {'p1', 'd'}, {'f5/e'}, {'f5/k', 'p1'}) self.assertEqual(str(e22), 'f5/e') n8, (e22, f22) = n.ClonePartial('f7', [d], [e, f]) net_assert(n8, 5, {'p1', 'd'}, {'f7/e', 'f7/f'}, {'p1', 'f7/k'}) self.assertEqual(str(e22), 'f7/e') self.assertEqual(str(f22), 'f7/f') params._CheckLookupTables() n._CheckLookupTables()
def _create_rnn_variant(cls, init_model, pred_model, n, opset_version): assert init_model is not None, "cannot convert RNNs without access to the full model" assert pred_model is not None, "cannot convert RNNs without access to the full model" attrs = dict(n.attrs) # make a copy, which is safe to mutate hidden_size = attrs.pop('hidden_size') direction = force_unicode(attrs.pop('direction', 'forward')) if n.op_type == 'RNN': activation = force_unicode(attrs.pop('activations', ('tanh',))[0]) elif n.op_type == 'GRU': linear_before_reset = attrs.pop('linear_before_reset', 0) assert not attrs, "unsupported RNN attributes: " + str(attrs.keys()) assert direction in ['forward', 'bidirectional'], "unsupported backwards RNN/GRU/LSTM" if n.op_type in ['RNN', 'GRU']: input_blob, W, R, B, sequence_lens, initial_h = n.inputs elif n.op_type == 'LSTM': input_blob, W, R, B, sequence_lens, initial_h, initial_c = n.inputs if sequence_lens == "": sequence_lens = None for x in itertools.chain(init_model.graph.input, init_model.graph.value_info, pred_model.graph.input, pred_model.graph.value_info): if x.name == W: input_size = x.type.tensor_type.shape.dim[2].dim_value break else: raise RuntimeError("best-effort shape inference for RNN/GRU/LSTM failed") init_net = core.Net("init-net") pred_mh = ModelHelper() init_net.Reshape(W, [W, cls.dummy_name()], shape=[1,-1,0]) init_net.Squeeze(W, W, dims=[0]) init_net.Reshape(R, [R, cls.dummy_name()], shape=[1,-1,0]) init_net.Squeeze(R, R, dims=[0]) init_net.Reshape(B, [B, cls.dummy_name()], shape=[1,-1]) init_net.Squeeze(B, B, dims=[0]) if n.op_type == 'RNN': def reform(*args): pass def make_cell(*args, **kwargs): return rnn_cell.BasicRNN(*args, activation=activation, **kwargs) def make_rnn(direction_offset): return cls._make_rnn_direction( input_blob, B, W, R, [(initial_h, '/initial_h')], sequence_lens, pred_mh, init_net, input_size, hidden_size, 1, direction_offset, "/i2h_b", "/gates_t_b", "/i2h_w", "/gates_t_w", reform, make_cell, lambda x: x) elif n.op_type == 'GRU': def reform(Bi, Br, W_, R_, name, hidden_size, init_net): # caffe2 has a different order from onnx. We need to rearrange # z r h -> r z h reforms = ((W_, 'i2h_w', True, [(0,-1)]), (R_, 'gate_t_w', False, [(0,-1)]), (Bi, 'i2h_b', True, []), (Br, 'gate_t_b', False, [])) cls._rnn_reform_weights(reforms, name, hidden_size, init_net, ['update', 'reset', 'output'], [1, 0, 2]) def make_cell(*args, **kwargs): return gru_cell.GRU(*args, linear_before_reset=linear_before_reset, **kwargs) def make_rnn(direction_offset): return cls._make_rnn_direction( input_blob, B, W, R, [(initial_h, '/initial_h')], sequence_lens, pred_mh, init_net, input_size, hidden_size, 3, direction_offset, "_bias_i2h", "_bias_gates", "/i2h_w_pre", "/gates_t_w_pre", reform, make_cell, lambda x: x) elif n.op_type == 'LSTM': def reform(Bi, Br, W_, R_, name, hidden_size, init_net): # caffe2 has a different order from onnx. 
We need to rearrange # i o f c -> i f o c reforms = ((W_, 'i2h_w', True, [(0, -1)]), (R_, 'gates_t_w', True, [(0, -1)]), (Bi, 'i2h_b' , True, []), (Br, 'gates_t_b', True, [])) cls._rnn_reform_weights(reforms, name, hidden_size, init_net, ['input', 'output', 'forget', 'cell'], [0, 2, 1, 3]) def make_cell(*args, **kwargs): return rnn_cell.LSTM(*args, **kwargs) def make_rnn(direction_offset): return cls._make_rnn_direction( input_blob, B, W, R, [(initial_h, '/initial_h'), (initial_c, '/initial_c')], sequence_lens, pred_mh, init_net, input_size, hidden_size, 4, direction_offset, "/i2h_b", "/gates_t_b", "/i2h_w", "/gates_t_w", reform, make_cell, lambda x: [x[0], x[1], x[3]]) if direction == 'forward': outputs = make_rnn(0) # in the forward case, storage is shared between the # last outputs. We need to decouple them so that the # VariableLengthSequencePadding only mutates # n.outputs[0] for i in range(1, len(outputs)): pred_mh.net.Copy(outputs[i], n.outputs[i]) if sequence_lens is not None: pred_mh.net.VariableLengthSequencePadding( [outputs[0], sequence_lens], [outputs[0]]) pred_mh.net.ExpandDims([outputs[0]], [n.outputs[0]], dims=[1]) elif direction == 'bidirectional': outputs_f = make_rnn(0) outputs_b = make_rnn(1) concatted_output, _ = pred_mh.net.Concat( [outputs_f[0], outputs_b[0]], [cls.dummy_name(), cls.dummy_name()], axis=2) if sequence_lens is not None: pred_mh.net.VariableLengthSequencePadding( [concatted_output, sequence_lens], [concatted_output]) unsqueezed_output = pred_mh.net.ExpandDims([concatted_output], [cls.dummy_name()], dims=[1]) pred_mh.net.Reshape(unsqueezed_output, [n.outputs[0], cls.dummy_name()], shape=[0,2,0,-1]) for i in range(1, len(n.outputs)): pred_mh.net.Concat([outputs_f[i], outputs_b[i]], [n.outputs[i], cls.dummy_name()], axis=0) return Caffe2Ops(list(pred_mh.Proto().op), list(init_net.Proto().op), list(pred_mh.Proto().external_input))
def create_net(self): net = core.Net("feature_extractor") init_net = core.Net("feature_extractor_init") missing_scalar = self.create_const(init_net, "MISSING_SCALAR", MISSING_VALUE) action_schema = map_schema( ) if self.sorted_action_features else schema.Scalar() if self.max_q_learning: next_action_field = InputColumn.POSSIBLE_NEXT_ACTIONS next_action_schema = schema.List(action_schema) else: next_action_field = InputColumn.NEXT_ACTION next_action_schema = action_schema input_schema = schema.Struct( (InputColumn.STATE_FEATURES, map_schema()), (InputColumn.NEXT_STATE_FEATURES, map_schema()), (InputColumn.ACTION, action_schema), (next_action_field, next_action_schema), ) input_record = net.set_input_record(input_schema) state = self.extract_float_features( net, "state", input_record[InputColumn.STATE_FEATURES], self.sorted_state_features, missing_scalar, ) next_state = self.extract_float_features( net, "next_state", input_record[InputColumn.NEXT_STATE_FEATURES], self.sorted_state_features, missing_scalar, ) if self.max_q_learning and self.sorted_action_features is not None: next_state_field = "tiled_next_state" # TODO: this will need to be more complicated to support sparse features next_state = net.LengthsTile( [next_state, input_record.possible_next_actions.lengths()], ["tiled_next_state"], ) else: next_state_field = "next_state" action = input_record.action next_action = input_record[next_action_field] if self.max_q_learning: next_action = next_action["values"] if self.sorted_action_features: action = self.extract_float_features(net, "action", action, self.sorted_action_features, missing_scalar) next_action = self.extract_float_features( net, next_action_field, next_action, self.sorted_action_features, missing_scalar, ) next_action_output = (schema.List( next_action, lengths_blob=input_record.possible_next_actions.lengths) if self.max_q_learning else next_action) net.set_output_record( schema.Struct( ("state", state), ("action", action), (next_state_field, next_state), (next_action_field, next_action_output), )) return FeatureExtractorNet(net, init_net)
def _test_dnnlowp_nd_int( self, stride, pad, kernels, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, prepack_weight, gc, dc, ): assume(group == 1 or dilation == 1) assume((not prepack_weight) or order == "NHWC") ndim = len(kernels) X, W, b = generate_convnd_inputs( (stride, ) * ndim, (pad, ) * ndim, kernels, (dilation, ) * ndim, (size, ) * ndim, group, input_channels_per_group, output_channels_per_group, batch_size, order, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP")] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC" if fall_back_to_NCHW: X_nchw = utils.NHWC2NCHW(X) W_nchw = utils.NHWC2NCHW(W) do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine # If output scale/zp aren't set, it gets computed from ref fp32 op # in DNNLOWP, which isn't possible when we quantize input weights. # Make sure atleast one output is collected to compute output # scale/zp. do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0 do_prepack_weight = engine == "DNNLOWP" and prepack_weight if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max()) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q") init_net.Proto().op.extend([int8_given_tensor_fill]) # Bias int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) init_net.Proto().op.extend([int8_bias_tensor_fill]) if do_prepack_weight: inputs = ["W_q" if do_quantize_weight else "W"] if do_dequantize: inputs += ["b_q" if do_quantize_weight else "b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else ("W_q" if do_quantize_weight else "W"), "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], strides=[stride] * ndim, kernels=kernels, dilations=[dilation] * ndim, pads=[pad] * (ndim * 2), order="NCHW" if fall_back_to_NCHW else order, dequantize_output=not do_dequantize, engine=engine, group=group, device_option=gc, ) if do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0]) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X, device_option=gc) self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(init_net) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() if fall_back_to_NCHW: Y = utils.NCHW2NHWC(Y) outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs)
def read_ex(self, local_init_net, local_finish_net):
    self._wrapper._new_reader(local_init_net)
    dequeue_net = core.Net('dequeue')
    fields, status_blob = dequeue(
        dequeue_net, self._wrapper.queue(), len(self.schema().field_names()))
    return [dequeue_net], status_blob, fields
def test_dnnlowp_conv_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, weight_quantized, prepack_weight, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) assume((not prepack_weight) or order == "NHWC") X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP"), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine # If output scale/zp aren't set, it gets computed from ref fp32 op # in DNNLOWP, which isn't possible when we quantize input weights. # Make sure atleast one output is collected to compute output # scale/zp. do_quantize_weight = (engine == "DNNLOWP" and weight_quantized and len(outputs) > 0) do_prepack_weight = engine == "DNNLOWP" and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity) # noqa if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity) init_net.Proto().op.extend([int8_given_tensor_fill]) # Bias int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) init_net.Proto().op.extend([int8_bias_tensor_fill]) if do_prepack_weight: inputs = ["W_q" if do_quantize_weight else "W"] if do_dequantize: inputs += ["b_q" if do_quantize_weight else "b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else ("W_q" if do_quantize_weight else "W"), "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) 
self.ws.run(init_net) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_rowwise_dnnlowp_fully_connected_int( self, input_channels, output_channels, batch_size, in_quantized, out_quantized, prepack_weight, gc, dc, ): print("@given M ", batch_size, " K ", input_channels, " N ", output_channels) print("@given in_quantized ", in_quantized, " out_quantized ", out_quantized) # X has scale 1, so exactly represented after quantization X_min = -77 X_max = X_min + 255 X = np.round( np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min) X = X.astype(np.float32) # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw # when multiplied with W_min and W_max X[:, 0:2] = X_min X[0, 2] = X_max # Each row of W has scale 1 but with different offset, so row-wise # quantization shouldn't have any input quantization error. W = np.zeros((output_channels, input_channels)) W = W.astype(np.float32) for i in range(output_channels): W_min = -100 + i W_max = W_min + 255 W[i, :] = np.round( np.random.rand(input_channels) * (W_max - W_min) + W_min) W[i, 0] = W_min W[i, 1] = W_max # Make sure we won't have overflows from vpmaddubsw instruction used in # fbgemm avoid_vpmaddubsw_overflow_fc( batch_size, input_channels, 1, X, X_min, X_max, W[i:i + 1, ], W_min, W_max, ) b = np.random.randn(output_channels).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("FC", ""), ("FC", "DNNLOWP_ROWWISE"), ("FC", "DNNLOWP_ROWWISE_16"), ("Int8FC", "DNNLOWP_ROWWISE"), ("Int8FCRowWise", "DNNLOWP"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_prepack_weight = engine == "DNNLOWP_ROWWISE" and prepack_weight if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max()) if do_prepack_weight: inputs = ["W"] if do_dequantize: inputs += ["b"] pack = core.CreateOperator( "Int8FCPackWeight", inputs, ["W_packed"], in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) fc = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else "W", "b", ], ["Y_q" if do_dequantize else "Y"], dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) if do_prepack_weight: # When pre-packed quantized weight is provided, we can't rescale # the output dynamically by looking at the range of output of # each batch, so here we provide the range of output observed # from fp32 reference implementation dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0]) net.Proto().op.extend([fc]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(init_net) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
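# A hedged numpy sketch of the row-wise weight quantization exercised by this
# test: every output-channel row gets its own scale and zero point, so rows with
# very different offsets (as constructed above) lose no precision to a shared
# scale. The helper is illustrative only, not the DNNLOWP_ROWWISE implementation.
import numpy as np

def rowwise_quantize_sketch(W, qmin=-128, qmax=127):
    row_min, row_max = W.min(axis=1), W.max(axis=1)
    scales = (row_max - row_min) / (qmax - qmin)
    scales[scales == 0] = 1.0
    zero_points = np.round(qmin - row_min / scales)
    W_q = np.clip(np.round(W / scales[:, None]) + zero_points[:, None], qmin, qmax)
    return W_q.astype(np.int8), scales, zero_points

W = np.array([[-100.0, 155.0, 3.0], [-99.0, 156.0, 0.0]], dtype=np.float32)
W_q, scales, zero_points = rowwise_quantize_sketch(W)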
def _create_rnn(cls, init_model, pred_model, n, opset_version): assert init_model is not None, "cannot convert RNNs without access to the full model" assert pred_model is not None, "cannot convert RNNs without access to the full model" attrs = dict(n.attrs) # make a copy, which is safe to mutate hidden_size = attrs.pop('hidden_size') activation = force_unicode(attrs.pop('activations', ('tanh', ))[0]) direction = force_unicode(attrs.pop('direction', 'forward')) assert not attrs, "unsupported RNN attributes: " + str(attrs.keys()) assert direction in ['forward', 'bidirectional'], "unsupported backwards RNN" input_blob, W, R, B, sequence_lens, initial_h = n.inputs if sequence_lens == "": sequence_lens = None input_size = cls._rnn_shape_inference(init_model, pred_model, n, input_blob, W) if input_size is None: raise RuntimeError( "best-effort shape inference for RNN input failed") init_net = core.Net("init-net") pred_mh = ModelHelper() def make_rnn(direction_offset): name = dummy_name() # input and recurrence biases are squashed together in # onnx but not in caffe2 bias_offset = 2 * direction_offset * hidden_size init_net.Slice(B, name + "/i2h_b", starts=[bias_offset + 0 * hidden_size], ends=[bias_offset + 1 * hidden_size]) init_net.Slice(B, name + "/gates_t_b", starts=[bias_offset + 1 * hidden_size], ends=[bias_offset + 2 * hidden_size]) weight_offset = direction_offset * hidden_size init_net.Slice(W, name + '/i2h_w', starts=[weight_offset + 0 * hidden_size, 0], ends=[weight_offset + 1 * hidden_size, -1]) init_net.Slice(R, name + '/gates_t_w', starts=[weight_offset + 0 * hidden_size, 0], ends=[weight_offset + 1 * hidden_size, -1]) initial_h_sliced = name + '/initial_h' init_net.Slice(initial_h, initial_h_sliced, starts=[direction_offset + 0, 0, 0], ends=[direction_offset + 1, -1, -1]) if direction_offset == 1: input = pred_mh.net.ReversePackedSegs( [input_blob, sequence_lens], name + "/input-reversed") else: input = input_blob hidden_t_all, hidden_t_last = rnn_cell.BasicRNN( pred_mh, input, sequence_lens, [initial_h_sliced], input_size, hidden_size, name, drop_states=False, forward_only=True, activation=activation) if direction_offset == 1: hidden_t_all = pred_mh.net.ReversePackedSegs( [hidden_t_all, sequence_lens], name + "/output-reversed") return hidden_t_all, hidden_t_last if direction == 'forward': hidden_t_all, hidden_t_last = make_rnn(0) # in the forward case, storage is shared between the two # outputs. We need to decouple them so that the # VariableLengthSequencePadding only mutates n.outputs[0] pred_mh.net.Copy(hidden_t_last, n.outputs[1]) pred_mh.net = pred_mh.net.Clone( "dummy-clone-net", blob_remap={hidden_t_all: n.outputs[0]}) elif direction == 'bidirectional': hidden_t_all_f, hidden_t_last_f = make_rnn(0) hidden_t_all_b, hidden_t_last_b = make_rnn(1) pred_mh.net.Concat([hidden_t_all_f, hidden_t_all_b], [n.outputs[0], dummy_name()], axis=2) pred_mh.net.Concat([hidden_t_last_f, hidden_t_last_b], [n.outputs[1], dummy_name()], axis=0) if sequence_lens is not None: pred_mh.net.VariableLengthSequencePadding( [n.outputs[0], sequence_lens], [n.outputs[0]]) return Caffe2Ops(list(pred_mh.Proto().op), list(init_net.Proto().op), list(pred_mh.Proto().external_input))
def test_dnnlowp_conv_acc16_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, in_quantized, out_quantized, weight_quantized, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) assume(size >= dilation * (kernel - 1) + 1) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group # X and W have scale 1, so exactly represented after quantization # This was made sure by having at least one 0 and one 255 for unsigned # 8-bit tensors, and at least one -128 and one 127 for signed 8-bit # tensors. # Since fbgemm_acc16 accumulates to 16-bit, To avoid overflow, we use # small numbers except for those 0, 255, -128, and 127, for this test # We also make sure 255, -128, or 127 are not multiplied together by # putting them in different input channels and the corresponding input # channel in other matrix is 0. # For example, we put 255 in input channel 1 in X, so we make the # corresponding input channel in W all zeros. X_min = 0 if preserve_activation_sparsity else -77 X_max = X_min + 255 X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min X = np.round(X).astype(np.float32) X[..., 0] = X_min X[0, 0, 0, 1] = X_max if preserve_weight_sparsity: W_min = -128 W_max = 100 else: W_min = -100 W_max = W_min + 255 W = (np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4 - 2 + W_min + 128) W = np.round(W).astype(np.float32) W[0, 0, 0, 0] = W_min W[1, 0, 0, 0] = W_max W[..., 1] = W_min + 128 # "zeros" if order == "NCHW": X = nhwc2nchw(X) W = nhwc2nchw(W) # No input quantization error in bias b = np.round(np.random.randn(output_channels)).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP_ACC16"), ("Int8Conv", "DNNLOWP_ACC16"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = ("DNNLOWP" in engine and weight_quantized and len(outputs) > 0) if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine="DNNLOWP", device_option=gc, ) net.Proto().op.extend([quantize]) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity) net.Proto().op.extend([int8_given_tensor_fill]) # Bias x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity) int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) net.Proto().op.extend([int8_bias_tensor_fill]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_q" if do_quantize_weight else "W", "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, dequantize_output=not do_dequantize, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_dequantize or do_quantize_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so 
here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
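# A short arithmetic check of the overflow the comments above guard against:
# DNNLOWP_ACC16 accumulates uint8*int8 products in 16 bits, and just two
# products of the extreme quantized values already exceed the int16 range,
# which is why the extremes are placed in channels whose counterpart is zero.
import numpy as np

prod = np.int32(255) * np.int32(127)      # 32385, close to the int16 max of 32767
assert 2 * prod > np.iinfo(np.int16).max  # 64770 would overflow a 16-bit accumulator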
def _create_gru(cls, init_model, pred_model, n, opset_version): assert init_model is not None, "cannot convert GRUs without access to the full model" assert pred_model is not None, "cannot convert GRUs without access to the full model" attrs = dict(n.attrs) # make a copy, which is safe to mutate hidden_size = attrs.pop('hidden_size') linear_before_reset = attrs.pop('linear_before_reset', 0) direction = force_unicode(attrs.pop('direction', 'forward')) assert not attrs, "unsupported GRU attributes: " + str(attrs.keys()) assert direction in ['forward', 'bidirectional'], "unsupported backwards GRU" input_blob, W, R, B, sequence_lens, initial_h = n.inputs if sequence_lens == "": sequence_lens = None input_size = cls._rnn_shape_inference(init_model, pred_model, n, input_blob, W) if input_size is None: raise RuntimeError( "best-effort shape inference for GRU input failed") init_net = core.Net("init-net") pred_mh = ModelHelper() def make_gru(direction_offset): name = dummy_name() # input and recurrence biases are squashed together in # onnx but not in caffe2 bias_offset = 6 * direction_offset * hidden_size Bi = init_net.Slice(B, name + "_bias_i2h", starts=[bias_offset + 0 * hidden_size], ends=[bias_offset + 3 * hidden_size]) Br = init_net.Slice(B, name + "_bias_gates", starts=[bias_offset + 3 * hidden_size], ends=[bias_offset + 6 * hidden_size]) weight_offset = 3 * direction_offset * hidden_size W_ = init_net.Slice(W, name + '/i2h_w_pre', starts=[weight_offset + 0 * hidden_size, 0], ends=[weight_offset + 3 * hidden_size, -1]) R_ = init_net.Slice(R, name + '/gates_t_w_pre', starts=[weight_offset + 0 * hidden_size, 0], ends=[weight_offset + 3 * hidden_size, -1]) # caffe2 has a different order from onnx. We need to rearrange # z r h -> r z h reforms = ((W_, 'i2h_w', True, [(0, -1)]), (R_, 'gate_t_w', False, [(0, -1)]), (Bi, 'i2h_b', True, []), (Br, 'gate_t_b', False, [])) for name_from, name_to, do_concat, extra_dims in reforms: xz, xr, xh = [ '%s/%s_%s' % (name, prefix, name_to) for prefix in ('update', 'reset', 'output') ] for i, x in enumerate([xz, xr, xh]): dim0 = i * hidden_size, (i + 1) * hidden_size starts, ends = zip(dim0, *extra_dims) init_net.Slice(name_from, x, starts=starts, ends=ends) if do_concat: init_net.Concat([xr, xz, xh], ['%s/%s' % (name, name_to), dummy_name()], axis=0) initial_h_sliced = name + '/initial_h' init_net.Slice(initial_h, initial_h_sliced, starts=[direction_offset + 0, 0, 0], ends=[direction_offset + 1, -1, -1]) if direction_offset == 1: input = pred_mh.net.ReversePackedSegs( [input_blob, sequence_lens], name + "/input-reversed") else: input = input_blob hidden_t_all, hidden_t_last = gru_cell.GRU( pred_mh, input, sequence_lens, [initial_h_sliced], input_size, hidden_size, name, drop_states=False, forward_only=True, linear_before_reset=linear_before_reset) if direction_offset == 1: hidden_t_all = pred_mh.net.ReversePackedSegs( [hidden_t_all, sequence_lens], name + "/output-reversed") return hidden_t_all, hidden_t_last if direction == 'forward': hidden_t_all, hidden_t_last = make_gru(0) # in the forward case, storage is shared between the two # outputs. 
We need to decouple them so that the # VariableLengthSequencePadding only mutates n.outputs[0] pred_mh.net.Copy(hidden_t_last, n.outputs[1]) pred_mh.net = pred_mh.net.Clone( "dummy-clone-net", blob_remap={hidden_t_all: n.outputs[0]}) elif direction == 'bidirectional': hidden_t_all_f, hidden_t_last_f = make_gru(0) hidden_t_all_b, hidden_t_last_b = make_gru(1) pred_mh.net.Concat([hidden_t_all_f, hidden_t_all_b], [n.outputs[0], dummy_name()], axis=2) pred_mh.net.Concat([hidden_t_last_f, hidden_t_last_b], [n.outputs[1], dummy_name()], axis=0) if sequence_lens is not None: pred_mh.net.VariableLengthSequencePadding( [n.outputs[0], sequence_lens], [n.outputs[0]]) return Caffe2Ops(list(pred_mh.Proto().op), list(init_net.Proto().op), list(pred_mh.Proto().external_input))
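# A numpy sketch of the gate reordering the Slice/Concat calls above perform:
# ONNX stacks GRU gate weights as (update, reset, output), i.e. z r h, while
# Caffe2's GRU cell expects r z h, so whole hidden_size blocks of rows are
# permuted. The toy matrix below is a stand-in for W or R.
import numpy as np

hidden_size, input_size = 2, 3
W_onnx = np.arange(3 * hidden_size * input_size, dtype=np.float32)
W_onnx = W_onnx.reshape(3 * hidden_size, input_size)  # rows: [z | r | h]
z, r, h = (W_onnx[i * hidden_size:(i + 1) * hidden_size] for i in range(3))
W_caffe2 = np.concatenate([r, z, h], axis=0)  # z r h -> r z h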
from caffe2.python import core, workspace
from matplotlib import pyplot

op = core.CreateOperator(
    "GaussianFill",
    [],  # GaussianFill does not need any parameters.
    ["Z"],
    shape=[100, 100],  # shape argument as a list of ints.
    mean=1.0,  # mean as a single float
    std=1.0,  # std as a single float
)
print("Content of op:\n")
print(str(op))
workspace.RunOperatorOnce(op)
temp = workspace.FetchBlob("Z")
pyplot.hist(temp.flatten(), bins=50)
pyplot.title("Distribution of Z")
pyplot.show()

net = core.Net("my_first_net")
X = net.GaussianFill([], ["X"], mean=0.0, std=1.0, shape=[2, 3], run_once=0)
print("New network proto:\n\n{}".format(net.Proto()))
print("Type of X is: {}".format(type(X)))
print("The blob name is: {}".format(str(X)))
W = net.GaussianFill([], ["W"], mean=0.0, std=1.0, shape=[5, 3], run_once=0)
b = net.ConstantFill([], ["b"], shape=[5, ], value=1.0, run_once=0)
Y = X.FC([W, b], ["Y"])

from caffe2.python import net_drawer
from IPython import display
graph = net_drawer.GetPydotGraph(net, rankdir="LR")
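# A minimal sketch of actually executing the net built above and reading back
# its FC output, assuming the same `workspace` import used for the GaussianFill
# operator earlier in this walkthrough.
workspace.RunNetOnce(net)
print("Blobs in the workspace: {}".format(workspace.Blobs()))
print("Y:\n{}".format(workspace.FetchBlob("Y")))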
def preprocess_samples( self, states: List[Dict[int, float]], actions: List[Dict[int, float]], rewards: List[float], next_states: List[Dict[int, float]], next_actions: List[Dict[int, float]], is_terminals: List[bool], possible_next_actions: List[List[Dict[int, float]]], reward_timelines: List[Dict[int, float]], minibatch_size: int, ) -> List[TrainingDataPage]: # Shuffle merged = list( zip(states, actions, rewards, next_states, next_actions, is_terminals, possible_next_actions, reward_timelines)) random.shuffle(merged) states, actions, rewards, next_states, next_actions, is_terminals, \ possible_next_actions, reward_timelines = zip(*merged) net = core.Net('gridworld_preprocessing') C2.set_net(net) preprocessor = PreprocessorNet(net, True) saa = StackedAssociativeArray.from_dict_list(states, 'states') state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, 'state_norm', ) saa = StackedAssociativeArray.from_dict_list(next_states, 'next_states') next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, 'next_state_norm', ) saa = StackedAssociativeArray.from_dict_list(actions, 'action') action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, 'action_norm', ) saa = StackedAssociativeArray.from_dict_list(next_actions, 'next_action') next_action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, 'next_action_norm', ) rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1) pnas_lengths_list = [] pnas_flat = [] for pnas in possible_next_actions: pnas_lengths_list.append(len(pnas)) pnas_flat.extend(pnas) saa = StackedAssociativeArray.from_dict_list(pnas_flat, 'possible_next_actions') pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32) possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, 'possible_next_action_norm', ) workspace.RunNetOnce(net) states_ndarray = workspace.FetchBlob(state_matrix) actions_ndarray = workspace.FetchBlob(action_matrix) next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_actions_ndarray = workspace.FetchBlob(next_action_matrix) possible_next_actions_ndarray = workspace.FetchBlob( possible_next_actions_matrix) tdps = [] pnas_start = 0 for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + np.sum(pnas_lengths[start:end]) pnas = possible_next_actions_ndarray[pnas_start:pnas_end] pnas_start = pnas_end tdps.append( TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], possible_next_actions=StackedArray(pnas_lengths[start:end], pnas), not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1), reward_timelines=reward_timelines[start:end] if reward_timelines else None, )) return tdps
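# A hedged numpy illustration of the lengths/keys/values layout that
# StackedAssociativeArray.from_dict_list is expected to hand to the normalizer
# above: one entry count per example plus flat, parallel key and value arrays.
# This shows only the data layout, not the actual implementation.
import numpy as np

states = [{1: 0.5, 7: 1.0}, {7: 2.0}]
lengths = np.array([len(s) for s in states], dtype=np.int32)            # [2, 1]
keys = np.array([k for s in states for k in s], dtype=np.int64)         # [1, 7, 7]
values = np.array([s[k] for s in states for k in s], dtype=np.float32)  # [0.5, 1.0, 2.0]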
def test_dnnlowp_max_pool( self, stride, pad, kernel, size, input_channels, batch_size, order, in_quantized, gc, dc, ): assume(kernel <= size) assume(pad < kernel) C = input_channels N = batch_size H = W = size min_ = -10 max_ = 20 if order == "NCHW": X = np.round(np.random.rand(N, C, H, W) * (max_ - min_) + min_) elif order == "NHWC": X = np.round(np.random.rand(N, H, W, C) * (max_ - min_) + min_) X = X.astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("MaxPool", ""), ("MaxPool", "DNNLOWP"), ("Int8MaxPool", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) max_pool = core.CreateOperator( op_type, ["X_q" if do_quantize else "X"], ["Y_q" if engine == "DNNLOWP" else "Y"], stride=stride, kernel=kernel, pad=pad, order=order, engine=engine, device_option=gc, ) net.Proto().op.extend([max_pool]) if engine == "DNNLOWP": dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) # Y_i = max(X_j) so the only error is in quantization of inputs check_quantized_results_close(outputs, ref=X)
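# Quick numpy check of the comment above (an added sketch, not part of the test):
# uniform quantization is monotone non-decreasing, so taking the max over a pooling
# window commutes with it -- max(quant(x)) == quant(max(x)) -- and any mismatch in
# the quantized results comes from quantizing the inputs, not from the pooling.
import numpy as np

def quant(x, scale=0.1, zero_point=3):
    # toy affine quantizer with hypothetical parameters
    return np.clip(np.round(x / scale) + zero_point, 0, 255)

window = np.array([-1.7, 0.2, 4.5, 3.3], dtype=np.float32)
assert quant(window).max() == quant(window.max())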
def recurrent_net(net, cell_net, inputs, initial_cell_inputs, links, timestep=None, scope=None, outputs_with_grads=(0, )): ''' net: the main net operator should be added to cell_net: cell_net which is executed in a recurrent fashion inputs: sequences to be fed into the recurrent net. Currently only one input is supported. It has to be in a format T x N x (D1...Dk) where T is the length of the sequence, N is the batch size and (D1...Dk) are the rest of the dimensions initial_cell_inputs: inputs of the cell_net for timestep 0. Format for each input is: (cell_net_input_name, external_blob_with_data) links: a dictionary from cell_net input names at moment t+1 to output names at moment t. Currently we assume that each output becomes an input for the next timestep. timestep: name of the timestep blob to be used. If not provided "timestep" is used. scope: Internal blobs are going to be scoped in a format <scope_name>/<blob_name> If not provided we generate a scope name automatically outputs_with_grads: position indices of output blobs which will receive the error gradient (from outside the recurrent network) during backpropagation ''' assert len(inputs) == 1, "Only one input blob is supported so far" input_blobs = [str(i[0]) for i in inputs] initial_input_blobs = [str(x[1]) for x in initial_cell_inputs] op_name = net.NextName('recurrent') def s(name): # We have to manually scope due to our internal/external blob # relationships. scope_name = op_name if scope is None else scope return "{}/{}".format(str(scope_name), str(name)) # determine inputs that are considered to be references # these are the blobs that are not referred to in inputs or initial_cell_inputs known_inputs = map(str, input_blobs + initial_input_blobs) known_inputs += [str(x[0]) for x in initial_cell_inputs] if timestep is not None: known_inputs.append(str(timestep)) references = [ b for b in cell_net.Proto().external_input if b not in known_inputs ] inner_outputs = list(cell_net.Proto().external_output) # These gradients are expected to be available during the backward pass inner_outputs_map = {o: o + '_grad' for o in inner_outputs} # compute the backward pass of the cell net backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass( cell_net.Proto().op, inner_outputs_map) backward_mapping = {str(k): str(v) for k, v in backward_mapping.items()} backward_cell_net = core.Net("RecurrentBackwardStep") del backward_cell_net.Proto().op[:] backward_cell_net.Proto().op.extend(backward_ops) # compute blobs used but not defined in the backward pass ssa, _ = core.get_ssa(backward_cell_net.Proto()) undefined = core.get_undefined_blobs(ssa) # also add to the output list the intermediate outputs of fwd_step that # are used by backward. ssa, blob_versions = core.get_ssa(cell_net.Proto()) scratches = [ blob for (blob, ver) in blob_versions.items() if ver > 0 and blob in undefined and blob not in cell_net.Proto().external_output ] backward_cell_net.Proto().external_input.extend(scratches) all_inputs = [i[1] for i in inputs] + [x[1] for x in initial_cell_inputs ] + references all_outputs = [] cell_net.Proto().type = 'simple' backward_cell_net.Proto().type = 'simple' # Internal arguments used by RecurrentNetwork operator # Links are in the format blob_name, recurrent_states, offset.
# At moment t we know that the corresponding data block is at # position t + offset in the recurrent_states tensor forward_links = [] backward_links = [] # Aliases are used to expose outputs to the external world # Format (internal_blob, external_blob, offset) # Negative offset stands for going from the end, # positive - from the beginning aliases = [] # States holding the inputs to the cell net recurrent_states = [] for cell_input, _ in initial_cell_inputs: cell_input = str(cell_input) # Recurrent_states is going to be (T + 1) x ... # It stores all inputs and outputs of the cell net over time. # Or their gradients in the case of the backward pass. state = s(cell_input + "_states") states_grad = state + "_grad" cell_output = links[str(cell_input)] forward_links.append((cell_input, state, 0)) forward_links.append((cell_output, state, 1)) backward_links.append((cell_input + "_grad", states_grad, 0)) backward_links.append((cell_output + "_grad", states_grad, 1)) backward_cell_net.Proto().external_input.append( str(cell_output) + "_grad") aliases.append((state, cell_output + "_all", 1)) aliases.append((state, cell_output + "_last", -1)) all_outputs.extend([cell_output + "_all", cell_output + "_last"]) recurrent_states.append(state) for input_t, input_blob in inputs: forward_links.append((str(input_t), str(input_blob), 0)) backward_links.append( (backward_mapping[str(input_t)], str(input_blob) + "_grad", 0)) backward_cell_net.Proto().external_input.extend( cell_net.Proto().external_input) backward_cell_net.Proto().external_input.extend( cell_net.Proto().external_output) def unpack_triple(x): if x: a, b, c = zip(*x) return a, b, c return [], [], [] # Splitting into separate lists so we can pass them to C++, # where we assemble them back link_internal, link_external, link_offset = unpack_triple(forward_links) backward_link_internal, backward_link_external, backward_link_offset = \ unpack_triple(backward_links) alias_src, alias_dst, alias_offset = unpack_triple(aliases) params = [x for x in references if x in backward_mapping.keys()] recurrent_inputs = [str(x[1]) for x in initial_cell_inputs] results = net.RecurrentNetwork( all_inputs, all_outputs + [s("step_workspaces")], param=map(all_inputs.index, params), alias_src=alias_src, alias_dst=map(str, alias_dst), alias_offset=alias_offset, recurrent_states=recurrent_states, initial_recurrent_state_ids=map(all_inputs.index, recurrent_inputs), link_internal=map(str, link_internal), link_external=map(str, link_external), link_offset=link_offset, backward_link_internal=map(str, backward_link_internal), backward_link_external=map(str, backward_link_external), backward_link_offset=backward_link_offset, step_net=str(cell_net.Proto()), backward_step_net=str(backward_cell_net.Proto()), timestep="timestep" if timestep is None else str(timestep), outputs_with_grads=outputs_with_grads, ) # The last output is a list of step workspaces, # which is only needed internally for gradient propagation return results[:-1]
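# A hypothetical usage sketch of the recurrent_net helper above (blob names and
# shapes are made up for illustration, and the exact "_grad" naming conventions are
# assumptions): the cell accumulates each timestep's input into a hidden state, so
# "hidden_all" holds the running sums over time and "hidden_last" the final state.
# Construction only; the nets are not run here.
from caffe2.python import core, workspace
import numpy as np

T, N, D = 4, 2, 3
workspace.FeedBlob("seq", np.random.randn(T, N, D).astype(np.float32))
workspace.FeedBlob("hidden_init", np.zeros((1, N, D), dtype=np.float32))

cell = core.Net("cell")
hidden = cell.Add(["hidden_prev", "seq_t"], ["hidden"])  # hidden_t = hidden_{t-1} + x_t
cell.AddExternalOutput(hidden)

main = core.Net("main")
hidden_all, hidden_last = recurrent_net(
    net=main,
    cell_net=cell,
    inputs=[("seq_t", "seq")],                        # per-timestep slice of the sequence
    initial_cell_inputs=[("hidden_prev", "hidden_init")],
    links={"hidden_prev": "hidden"},                  # output at t becomes input at t+1
)
print(main.Proto())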
def __init__(self, model, input_record, output_names_or_num, function, name='functional', output_dtypes=None, tags=None, **kwargs): # allow coercion input_record = schema.as_record(input_record) super(Functional, self).__init__(model, name, input_record, tags=tags, **kwargs) self._function = function self._kwargs = kwargs return_struct = ( isinstance(output_names_or_num, list) or (isinstance(output_names_or_num, six.integer_types) and output_names_or_num != 1) ) with scope.NameScope(self.name, reset=True): if isinstance(output_names_or_num, int): struct_output_schema = schema.NewRecord( model.net, schema.RawTuple(output_names_or_num)) elif isinstance(output_names_or_num, schema.Field): self.output_schema = output_names_or_num.clone(keep_blobs=True) return else: if not isinstance(output_names_or_num, list): output_names_or_num = [output_names_or_num] out_tuple = [(out, np.void) for out in output_names_or_num] struct_output_schema = schema.NewRecord( model.net, schema.Struct(*out_tuple)) num_outputs = len(struct_output_schema.field_blobs()) # functional layer returns Struct if more than one outputs or output is # a list, otherwise Scalar if return_struct: self.output_schema = struct_output_schema else: self.output_schema = struct_output_schema[0] # If output_dtypes is provided, use it for output schema. Otherwise # the shape and type will be inferred. if output_dtypes is not None: if not isinstance(output_dtypes, list): output_dtypes = [output_dtypes] * num_outputs assert len(output_dtypes) == num_outputs for dtype, scalar in zip(output_dtypes, self.output_schema.all_scalars()): scalar.set_type(dtype) return # Fake execution of the function to infer shapes and types automatically had_issues = False try: type_net = core.Net('_temp_type_and_shape_inference_net') schema.InitEmptyRecord(type_net, input_record, enforce_types=True) function(type_net, self.input_record, self.output_schema, **kwargs) (shapes, types) = workspace.InferShapesAndTypes([type_net], {}) for i in range(num_outputs): scalar_schema = (self.output_schema[i] if return_struct else self.output_schema) blob = scalar_schema() if blob not in types or blob not in shapes: had_issues = True continue if shapes[blob] == []: # Scalar type shape = tuple() elif shapes[blob][0] == 0: shape = tuple(shapes[blob][1:]) else: logger.warning("unexpected shape: {}".format(shapes[blob])) # If batch dimension is not first - give up on shape # inference for that blob had_issues = True continue # TODO(amalevich): Move it to some shared library dtype = None if types[blob] == caffe2_pb2.TensorProto.DOUBLE: dtype = (np.float64, shape) elif types[blob] == caffe2_pb2.TensorProto.FLOAT: dtype = (np.float32, shape) elif types[blob] == caffe2_pb2.TensorProto.INT32: dtype = (np.int32, shape) elif types[blob] == caffe2_pb2.TensorProto.INT64: dtype = (np.int64, shape) elif types[blob] == caffe2_pb2.TensorProto.FLOAT16: dtype = (np.float16, shape) if dtype is not None: scalar_schema.set_type(dtype) except TypeError as ex: had_issues = True logger.warning(str(ex)) if had_issues: logger.warning( "Type inference had problems for layer: {}".format(self.name))
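# The if/elif chain above maps caffe2 TensorProto dtypes to numpy dtypes. As an
# added sketch (same pairs as in the code, just table-driven), the lookup can be
# expressed as a dict, which is easier to extend when more dtypes are supported:
from caffe2.proto import caffe2_pb2
import numpy as np

TENSOR_PROTO_TO_NP = {
    caffe2_pb2.TensorProto.DOUBLE: np.float64,
    caffe2_pb2.TensorProto.FLOAT: np.float32,
    caffe2_pb2.TensorProto.INT32: np.int32,
    caffe2_pb2.TensorProto.INT64: np.int64,
    caffe2_pb2.TensorProto.FLOAT16: np.float16,
}

print(TENSOR_PROTO_TO_NP.get(caffe2_pb2.TensorProto.FLOAT))  # numpy dtype for FLOAT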
def test_prepare_normalization_and_normalize(self): feature_value_map = read_data() normalization_parameters = {} for name, values in feature_value_map.items(): normalization_parameters[name] = normalization.identify_parameter( values, 10, feature_type=self._feature_type_override(name)) for k, v in normalization_parameters.items(): if id_to_type(k) == CONTINUOUS: self.assertEqual(v.feature_type, CONTINUOUS) self.assertIs(v.boxcox_lambda, None) self.assertIs(v.boxcox_shift, None) elif id_to_type(k) == BOXCOX: self.assertEqual(v.feature_type, BOXCOX) self.assertIsNot(v.boxcox_lambda, None) self.assertIsNot(v.boxcox_shift, None) else: assert v.feature_type == id_to_type(k) sorted_features, _ = sort_features_by_normalization( normalization_parameters) norm_net = core.Net("net") C2.set_net(norm_net) preprocessor = PreprocessorNet(False) input_matrix = np.zeros([10000, len(sorted_features)], dtype=np.float32) for i, feature in enumerate(sorted_features): input_matrix[:, i] = feature_value_map[feature] input_matrix_blob = "input_matrix_blob" workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32)) output_blob, _ = preprocessor.normalize_dense_matrix( input_matrix_blob, sorted_features, normalization_parameters, "", False) workspace.FeedBlob(input_matrix_blob, input_matrix) workspace.RunNetOnce(norm_net) normalized_feature_matrix = workspace.FetchBlob(output_blob) normalized_features = {} on_column = 0 for feature in sorted_features: norm = normalization_parameters[feature] if norm.feature_type == ENUM: column_size = len(norm.possible_values) else: column_size = 1 normalized_features[ feature] = normalized_feature_matrix[:, on_column:(on_column + column_size)] on_column += column_size self.assertTrue( all([ np.isfinite(parameter.stddev) and np.isfinite(parameter.mean) for parameter in normalization_parameters.values() ])) for k, v in six.iteritems(normalized_features): self.assertTrue(np.all(np.isfinite(v))) feature_type = normalization_parameters[k].feature_type if feature_type == identify_types.PROBABILITY: sigmoidv = special.expit(v) self.assertTrue( np.all( np.logical_and(np.greater(sigmoidv, 0), np.less(sigmoidv, 1)))) elif feature_type == identify_types.ENUM: possible_values = normalization_parameters[k].possible_values self.assertEqual(v.shape[0], len(feature_value_map[k])) self.assertEqual(v.shape[1], len(possible_values)) possible_value_map = {} for i, possible_value in enumerate(possible_values): possible_value_map[possible_value] = i for i, row in enumerate(v): original_feature = feature_value_map[k][i] self.assertEqual(possible_value_map[original_feature], np.where(row == 1)[0][0]) elif feature_type == identify_types.QUANTILE: for i, feature in enumerate(v[0]): original_feature = feature_value_map[k][i] expected = self._value_to_quantile( original_feature, normalization_parameters[k].quantiles) self.assertAlmostEqual(feature, expected, 2) elif feature_type == identify_types.BINARY: pass elif (feature_type == identify_types.CONTINUOUS or feature_type == identify_types.BOXCOX): one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01) zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01) zero_mean = np.isclose(np.mean(v), 0, atol=0.01) self.assertTrue( np.all(zero_mean), "mean of feature {} is {}, not 0".format(k, np.mean(v)), ) self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev))) elif feature_type == identify_types.CONTINUOUS_ACTION: less_than_max = v < 1 more_than_min = v > -1 self.assertTrue( np.all(less_than_max), "values are not less than 1: {}".format( 
v[~less_than_max]), ) self.assertTrue( np.all(more_than_min), "values are not more than -1: {}".format( v[~more_than_min]), ) else: raise NotImplementedError()
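# Standalone sketch (hypothetical values) of the ENUM one-hot check in the test
# above: each normalized ENUM row should contain a single 1, located at the column
# assigned to the original categorical value by possible_value_map.
import numpy as np

possible_values = [7, 11, 42]
possible_value_map = {v: i for i, v in enumerate(possible_values)}
original = [11, 42, 7]
one_hot = np.eye(len(possible_values))[[possible_value_map[v] for v in original]]
for row, value in zip(one_hot, original):
    assert possible_value_map[value] == np.where(row == 1)[0][0]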
def test_groupwise_dnnlowp_conv_relu_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, gc, dc, ): if group > 1: dilation = 1 X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, True, # group-wise ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("ConvRelu", "DNNLOWP"), ("ConvRelu", "DNNLOWP_16"), ("Int8ConvRelu", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") if "DNNLOWP" in engine: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) conv = core.CreateOperator( op_type, ["X_q", "W", "b"], ["Y_q"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, engine=engine, group=group, quantize_groupwise=1, device_option=gc, ) # groupwise quantization only works with static quantization # so we need to set quantization parameters dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0]) net.Proto().op.extend([conv]) dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) else: conv = core.CreateOperator( op_type, ["X", "W", "b"], ["Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, engine=engine, group=group, device_option=gc, ) net.Proto().op.extend([conv]) relu = core.CreateOperator("Relu", ["Y"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([relu]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs)
def test_dataset_ops(self): """ 1. Defining the schema of our dataset. This example schema could represent, for example, a search query log. """ schema = Struct( # fixed size vector, which will be stored as a matrix when batched ('dense', Scalar((np.float32, 3))), # could represent a feature map from feature ID to float value ('floats', Map(Scalar(np.int32), Scalar(np.float32))), # could represent a multi-valued categorical feature map ('int_lists', Map( Scalar(np.int32), List(Scalar(np.int64)), )), # could represent a multi-valued, weighted categorical feature map ('id_score_pairs', Map( Scalar(np.int32), Map(Scalar(np.int64), Scalar(np.float32), keys_name='ids', values_name='scores'), )), # additional scalar information ('metadata', Struct( ('user_id', Scalar(np.int64)), ('user_embed', Scalar((np.float32, 2))), ('query', Scalar(str)), )), ) """ This is what the flattened fields for this schema look like, along with its type. Each one of these fields will be stored, read and written as a tensor. """ expected_fields = [ ('dense', (np.float32, 3)), ('floats:lengths', np.int32), ('floats:values:keys', np.int32), ('floats:values:values', np.float32), ('int_lists:lengths', np.int32), ('int_lists:values:keys', np.int32), ('int_lists:values:values:lengths', np.int32), ('int_lists:values:values:values', np.int64), ('id_score_pairs:lengths', np.int32), ('id_score_pairs:values:keys', np.int32), ('id_score_pairs:values:values:lengths', np.int32), ('id_score_pairs:values:values:values:ids', np.int64), ('id_score_pairs:values:values:values:scores', np.float32), ('metadata:user_id', np.int64), ('metadata:user_embed', (np.float32, 2)), ('metadata:query', str), ] zipped = zip(expected_fields, schema.field_names(), schema.field_types()) for (ref_name, ref_type), name, dtype in zipped: self.assertEquals(ref_name, name) self.assertEquals(np.dtype(ref_type), dtype) """ 2. The contents of our dataset. Contents as defined below could represent, for example, a log of search queries along with dense, sparse features and metadata. The dataset below has 3 top-level entries. """ contents_raw = [ # dense [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]], # floats [1, 2, 3], # len [11, 21, 22, 31, 32, 33], # key [1.1, 2.1, 2.2, 3.1, 3.2, 3.3], # value # int lists [2, 0, 1], # len [11, 12, 31], # key [2, 4, 3], # value:len [111, 112, 121, 122, 123, 124, 311, 312, 313], # value:value # id score pairs [1, 2, 2], # len [11, 21, 22, 31, 32], # key [1, 1, 2, 2, 3], # value:len [111, 211, 221, 222, 311, 312, 321, 322, 323], # value:ids [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3], # val:score # metadata [123, 234, 456], # user_id [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]], # user_embed ['dog posts', 'friends who like to', 'posts about ca'], # query ] # convert the above content to ndarrays, checking against the schema contents = from_blob_list(schema, contents_raw) """ 3. Creating and appending to the dataset. We first create an empty dataset with the given schema. Then, a Writer is used to append these entries to the dataset. """ ds = dataset.Dataset(schema) net = core.Net('init') with core.NameScope('init'): ds.init_empty(net) content_blobs = NewRecord(net, contents) FeedRecord(content_blobs, contents) writer = ds.writer(init_net=net) writer.write_record(net, content_blobs) workspace.RunNetOnce(net) """ 4. Iterating through the dataset contents. 
If we were to iterate through the top level entries of our dataset, this is what we should expect to see: """ entries_raw = [ ( [[1.1, 1.2, 1.3]], # dense [1], [11], [1.1], # floats [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124], # intlst [1], [11], [1], [111], [11.1], # id score pairs [123], [[0.2, 0.8]], ['dog posts'], # metadata ), ( [[2.1, 2.2, 2.3]], # dense [2], [21, 22], [2.1, 2.2], # floats [0], [], [], [], # int list [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2], [234], [[0.5, 0.5]], ['friends who like to'], # metadata ), ( [[3.1, 3.2, 3.3]], # dense [3], [31, 32, 33], [3.1, 3.2, 3.3], # floats [1], [31], [3], [311, 312, 313], # int lst [2], [31, 32], [2, 3], [311, 312, 321, 322, 323], [31.1, 31.2, 32.1, 32.2, 32.3], # id score list [456], [[0.7, 0.3]], ['posts about ca'], # metadata ), # after the end of the dataset, we will keep getting empty vectors ( [], ) * 16, ([], ) * 16, ] entries = [from_blob_list(schema, e) for e in entries_raw] """ Let's go ahead and create the reading nets. We will run `read` net multiple times and assert that we are reading the entries the way we stated above. """ read_init_net = core.Net('read_init') read_next_net = core.Net('read_next') reader = ds.reader(read_init_net) should_continue, batch = reader.read_record(read_next_net) workspace.RunNetOnce(read_init_net) workspace.CreateNet(read_next_net, True) for entry in entries: workspace.RunNet(str(read_next_net)) actual = FetchRecord(batch) _assert_records_equal(actual, entry) """ 5. Reading/writing in a single plan If all of operations on the data are expressible as Caffe2 operators, we don't need to load the data to python, iterating through the dataset in a single Plan. Where we will process the dataset a little and store it in a second dataset. We can reuse the same Reader since it supports reset. """ reset_net = core.Net('reset_net') reader.reset(reset_net) read_step, batch = reader.execution_step() """ We will add the line number * 1000 to the feature ids. """ process_net = core.Net('process') line_no = Const(process_net, 0, dtype=np.int32) const_one = Const(process_net, 1000, dtype=np.int32) process_net.Add([line_no, const_one], [line_no]) field = batch.floats.keys.get() process_net.Print(field, []) process_net.Add([field, line_no], field, broadcast=1, axis=0) """ Lets create a second dataset and append to it. """ ds2 = dataset.Dataset(schema, name='dataset2') ds2.init_empty(reset_net) writer = ds2.writer(reset_net) writer.write_record(process_net, batch) # commit is not necessary for DatasetWriter but will add it for # generality of the example commit_net = core.Net('commit') writer.commit(commit_net) """ Time to create and run a plan which will do the processing """ plan = core.Plan('process') plan.AddStep(core.execution_step('reset', reset_net)) plan.AddStep(read_step.AddNet(process_net)) plan.AddStep(core.execution_step('commit', commit_net)) workspace.RunPlan(plan) """ Now we should have dataset2 populated. """ ds2_data = FetchRecord(ds2.content()) field = ds2_data.floats.keys field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000]) _assert_records_equal(contents, ds2_data) """ 6. Slicing a dataset You can create a new schema from pieces of another schema and reuse the same data. """ subschema = Struct(('top_level', schema.int_lists.values)) int_list_contents = contents.int_lists.values.field_names() self.assertEquals(len(subschema.field_names()), len(int_list_contents)) """ 7. 
Random Access a dataset """ read_init_net = core.Net('read_init') read_next_net = core.Net('read_next') idx = np.array([2, 1, 0]) indices_blob = Const(read_init_net, idx, name='indices') reader = ds.random_reader(read_init_net, indices_blob) reader.computeoffset(read_init_net) should_stop, batch = reader.read_record(read_next_net) workspace.CreateNet(read_init_net, True) workspace.RunNetOnce(read_init_net) workspace.CreateNet(read_next_net, True) for i in range(len(entries)): k = idx[i] if i in idx else i entry = entries[k] workspace.RunNet(str(read_next_net)) actual = FetchRecord(batch) _assert_records_equal(actual, entry) workspace.RunNet(str(read_next_net)) self.assertEquals(True, workspace.FetchBlob(should_stop)) """ 8. Random Access a dataset with loop_over = true """ read_init_net = core.Net('read_init') read_next_net = core.Net('read_next') idx = np.array([2, 1, 0]) indices_blob = Const(read_init_net, idx, name='indices') reader = ds.random_reader(read_init_net, indices_blob, loop_over=True) reader.computeoffset(read_init_net) should_stop, batch = reader.read_record(read_next_net) workspace.CreateNet(read_init_net, True) workspace.RunNetOnce(read_init_net) workspace.CreateNet(read_next_net, True) for _ in range(len(entries) * 3): workspace.RunNet(str(read_next_net)) self.assertEquals(False, workspace.FetchBlob(should_stop)) """ 9. Sort and shuffle a dataset This sort the dataset using the score of a certain column, and then shuffle within each chunk of size batch_size * shuffle_size before shuffling the chunks. """ read_init_net = core.Net('read_init') read_next_net = core.Net('read_next') reader = ds.random_reader(read_init_net) reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2) reader.computeoffset(read_init_net) should_continue, batch = reader.read_record(read_next_net) workspace.CreateNet(read_init_net, True) workspace.RunNetOnce(read_init_net) workspace.CreateNet(read_next_net, True) expected_idx = np.array([2, 1, 0]) for i in range(len(entries)): k = expected_idx[i] if i in expected_idx else i entry = entries[k] workspace.RunNet(str(read_next_net)) actual = FetchRecord(batch) _assert_records_equal(actual, entry) """ Trim a dataset """ trim_net = core.Net('trim_ds') ds.trim(trim_net, multiple_of=2) workspace.RunNetOnce(trim_net) trimmed = FetchRecord(ds.content()) EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2] actual_sizes = [d.shape[0] for d in trimmed.field_blobs()] self.assertEquals(EXPECTED_SIZES, actual_sizes)
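# A small standalone sketch of the field-flattening convention exercised above
# (assuming caffe2.python.schema, as in the test): nested Struct/Map fields flatten
# to colon-separated names, with ":lengths" marking the repeated parts.
from caffe2.python import schema
import numpy as np

s = schema.Struct(
    ('dense', schema.Scalar((np.float32, 3))),
    ('floats', schema.Map(schema.Scalar(np.int32), schema.Scalar(np.float32))),
)
print(s.field_names())
# expected, matching the test's expected_fields:
# ['dense', 'floats:lengths', 'floats:values:keys', 'floats:values:values']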