def run_model(self, gpu_devices):
    '''
    Helper function for test_equiv.

    Builds a tiny FC+Sigmoid+SquaredL2 model, parallelizes it over
    `gpu_devices` with data_parallel_model, runs 10 training iterations
    with a fixed random seed (so the input stream is identical regardless
    of how many GPUs are used), and returns the trained fc weights from
    gpu 0 so the caller can compare runs with different device counts.
    '''
    def input_builder_fun(model):
        # Data is fed manually below; nothing to build here.
        return None

    def model_build_fun(model, loss_scale):
        # Forward pass: 16 -> 1 FC, sigmoid, squared distance to label.
        fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        # Scale loss by 1/num_devices (loss_scale supplied by the parallelizer).
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ITER = model.Iter("ITER")
        # Negative base_lr: WeightedSum below ADDS grad*LR, so the sign
        # makes it a descent step.
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        # Plain SGD: param <- param * 1 + grad * LR
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test{}".format(gpu_devices),
    )
    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )
    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        full_data = np.random.rand(batch_size, 16)
        # Label derived deterministically from the data.
        full_labels = np.round(full_data[:, 0])
        batch_per_device = batch_size // len(gpu_devices)

        # Shard the batch across devices.
        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            data = full_data[st:en, :].astype(np.float32)
            labels = full_labels[st:en].astype(np.float32)
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/data".format(g), data)
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            # One-time init after the first feed so blob shapes exist.
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        print(i, workspace.FetchBlob("gpu_0/fc_w").flatten()[:5])
        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("gpu_0/fc_w")
def setUp(self):
    """Start every test from a clean slate in the default workspace."""
    # Ensure we are in the default workspace before wiping it, in case a
    # previous test left a different workspace active.
    workspace.SwitchWorkspace("default")
    workspace.ResetWorkspace()
def setUp(self):
    """Build a one-op test net producing a constant blob, with a clean workspace."""
    self.net = core.Net("test-net")
    # BlobReference to "testblob": a 1x2x3x4 tensor of ones once the net runs.
    self.testblob_ref = self.net.ConstantFill(
        [],
        "testblob",
        shape=[1, 2, 3, 4],
        value=1.0,
    )
    workspace.ResetWorkspace()
def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size,
                                     input_channels, output_channels,
                                     batch_size, use_bias, group, gc, dc):
    """Verify Conv + Sum + Relu fusion on IDEEP.

    Three-way comparison against a plain (unfused) reference:
      1. reference: Conv(S) and Conv(Y), then Sum(S, Y) -> S, then Relu(S)
         on device dc[0];
      2. manual fusion: a single ConvFusion op (fusion_type=3, i.e.
         conv+sum+relu — assumed from usage, TODO confirm against the
         ConvFusion op schema) on device dc[1];
      3. auto fusion: the same 4-op net rewritten by optimizeForIDEEP,
         which must collapse it to 2 ops with op[1] == ConvFusion.
    """
    # Reference graph on dc[0]. conv_S0 produces the accumulator blob S0
    # which Sum then adds the second conv output into (in place).
    conv_S0 = core.CreateOperator(
        "Conv",
        ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"],
        ["S0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    # NOTE(review): local name `sum` shadows the builtin; harmless here
    # but worth renaming if this block is ever reworked.
    sum = core.CreateOperator("Sum", ["S0", "Y0"], ["S0"], device_option=dc[0])
    relu = core.CreateOperator("Relu", ["S0"], ["S0"], device_option=dc[0])

    # Manual fusion for Conv + Sum + ReLU
    conv_S1 = core.CreateOperator(
        "Conv",
        ["SX1", "Sw1", "Sb1"] if use_bias else ["SX1", "Sw1"],
        ["S1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[1])
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
        ["S1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        fusion_type=3,
        device_option=dc[1])

    # Random inputs in [-0.5, 0.5); separate tensors for the accumulator
    # branch (S*) and the fused branch.
    SX = np.random.rand(batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
    Sw = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    X = np.random.rand(batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    # Run everything in a scratch workspace; restored at the end.
    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)

    # Reference run on dc[0].
    workspace.FeedBlob('SX0', SX, dc[0])
    workspace.FeedBlob('Sw0', Sw, dc[0])
    workspace.FeedBlob('Sb0', Sb, dc[0])
    workspace.FeedBlob('X0', X, dc[0])
    workspace.FeedBlob('w0', w, dc[0])
    workspace.FeedBlob('b0', b, dc[0])
    workspace.RunOperatorOnce(conv_S0)
    workspace.RunOperatorOnce(conv)
    workspace.RunOperatorOnce(sum)
    workspace.RunOperatorOnce(relu)
    S0 = workspace.FetchBlob('S0')

    # Manually fused run on dc[1].
    workspace.ResetWorkspace()
    workspace.FeedBlob('SX1', SX, dc[1])
    workspace.FeedBlob('Sw1', Sw, dc[1])
    workspace.FeedBlob('Sb1', Sb, dc[1])
    workspace.FeedBlob('X1', X, dc[1])
    workspace.FeedBlob('w1', w, dc[1])
    workspace.FeedBlob('b1', b, dc[1])
    workspace.RunOperatorOnce(conv_S1)
    workspace.RunOperatorOnce(conv_fusion)
    S1 = workspace.FetchBlob('S1')

    if not np.allclose(S0, S1, atol=0.01, rtol=0.01):
        print(S1.flatten())
        print(S0.flatten())
        print(np.max(np.abs(S1 - S0)))
        self.assertTrue(False)

    # Auto fusion for Conv + Sum + ReLU
    workspace.ResetWorkspace()
    # Rebuild the reference 4-op net, but pinned to dc[1] so the IDEEP
    # graph rewriter applies.
    old_net = caffe2_pb2.NetDef()
    conv_S0_old = caffe2_pb2.OperatorDef()
    conv_S0_old.CopyFrom(conv_S0)
    conv_S0_old.device_option.CopyFrom(dc[1])
    conv_old = caffe2_pb2.OperatorDef()
    conv_old.CopyFrom(conv)
    conv_old.device_option.CopyFrom(dc[1])
    sum_old = caffe2_pb2.OperatorDef()
    sum_old.CopyFrom(sum)
    sum_old.device_option.CopyFrom(dc[1])
    relu_old = caffe2_pb2.OperatorDef()
    relu_old.CopyFrom(relu)
    relu_old.device_option.CopyFrom(dc[1])
    old_net.op.extend([conv_S0_old, conv_old, sum_old, relu_old])

    workspace.FeedBlob('SX0', SX, dc[1])
    workspace.FeedBlob('Sw0', Sw, dc[1])
    workspace.FeedBlob('Sb0', Sb, dc[1])
    workspace.FeedBlob('X0', X, dc[1])
    workspace.FeedBlob('w0', w, dc[1])
    workspace.FeedBlob('b0', b, dc[1])
    net = core.Net("net")
    net.Proto().CopyFrom(old_net)
    optimizeForIDEEP(net)
    # 4 ops must have been collapsed to 2, with the fusion last.
    self.assertTrue(len(net.Proto().op) == 2)
    self.assertTrue(net.Proto().op[1].type == "ConvFusion")
    workspace.RunNetOnce(net.Proto())
    S2 = workspace.FetchBlob('S0')

    if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
        print(S2.flatten())
        print(S0.flatten())
        print(np.max(np.abs(S2 - S0)))
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
def test_convolution_affch_folding(self, stride, pad, kernel, size,
                                   input_channels, output_channels,
                                   batch_size, use_bias, group, inplace,
                                   gc, dc):
    """Verify that optimizeForIDEEP folds AffineChannel into the preceding Conv.

    Runs Conv followed by AffineChannel as-is, then lets the optimizer
    rewrite the two-op net; the result must be a single Conv op whose
    output matches the unfused reference within atol/rtol 0.01.
    """
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["X1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[1])
    # AffineChannel may write in place (X1) or to a fresh blob (Y).
    affch = core.CreateOperator(
        "AffineChannel",
        ["X1", "scale", "bias"],
        ["X1" if inplace else "Y"],
        device_option=dc[1])

    X = np.random.rand(batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    # scale shifted to [0.5, 1.5) so it is bounded away from zero.
    scale = np.random.rand(output_channels).astype(np.float32) + 0.5
    bias = np.random.rand(output_channels).astype(np.float32) - 0.5

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)

    # Unfused reference run.
    workspace.FeedBlob('X0', X, dc[1])
    workspace.FeedBlob('w0', w, dc[1])
    workspace.FeedBlob('b0', b, dc[1])
    workspace.FeedBlob('scale', scale, dc[1])
    workspace.FeedBlob('bias', bias, dc[1])
    workspace.RunOperatorOnce(conv)
    workspace.RunOperatorOnce(affch)
    Y = workspace.FetchBlob('X1' if inplace else "Y")

    # Optimized run: the same two ops fed through the IDEEP rewriter.
    workspace.ResetWorkspace()
    old_net = caffe2_pb2.NetDef()
    conv_old = caffe2_pb2.OperatorDef()
    conv_old.CopyFrom(conv)
    conv_old.device_option.CopyFrom(dc[1])
    affch_old = caffe2_pb2.OperatorDef()
    affch_old.CopyFrom(affch)
    affch_old.device_option.CopyFrom(dc[1])
    old_net.op.extend([conv_old, affch_old])

    workspace.FeedBlob('X0', X, dc[1])
    workspace.FeedBlob('w0', w, dc[1])
    workspace.FeedBlob('b0', b, dc[1])
    workspace.FeedBlob('scale', scale, dc[1])
    workspace.FeedBlob('bias', bias, dc[1])
    net = core.Net("net")
    net.Proto().CopyFrom(old_net)
    optimizeForIDEEP(net)
    # AffineChannel must have been folded away, leaving one Conv.
    self.assertTrue(len(net.Proto().op) == 1)
    self.assertTrue(net.Proto().op[0].type == "Conv")
    workspace.RunOperatorOnce(net.Proto().op[0])
    Y1 = workspace.FetchBlob('X1' if inplace else "Y")

    if not np.allclose(Y, Y1, atol=0.01, rtol=0.01):
        print(Y.flatten())
        print(Y1.flatten())
        print(np.max(np.abs(Y - Y1)))
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
def test_int8_fc_4_dims(self, n, m, k, gc, dc):
    """Compare int8-quantized FC (Int8FC, DNNLOWP engine) against fp32 FC.

    Computes fp32 FC as the reference, derives symmetric/asymmetric
    quantization scales from the observed absmax of X/Y (per-tensor) and
    w (per-output-channel), builds an int8 net (fill weights/bias ->
    NCHW2NHWC -> Int8Quantize -> Int8FC) and asserts the MSE between the
    two outputs is <= 0.005.
    """
    # 4D input/weight; FC flattens trailing dims internally.
    X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5
    w = np.random.rand(n, k, m, m).astype(np.float32) - 0.5
    b = np.random.rand(n).astype(np.float32) - 0.5

    # fp32 reference.
    fc_fp32 = core.CreateOperator('FC', ['X', 'w', 'b'], ["Y"])
    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    workspace.FeedBlob('X', X, dc[0])
    workspace.FeedBlob('w', w, dc[0])
    workspace.FeedBlob('b', b, dc[0])
    workspace.RunOperatorOnce(fc_fp32)
    Y = workspace.FetchBlob('Y')
    workspace.ResetWorkspace()

    # Output quantization: full uint8 range when non-negative,
    # otherwise symmetric int8 with zero point 128.
    Y_absmax = np.array([np.absolute(Y).max()]).astype(np.float32)
    if Y.min() >= 0:
        Y_scale = Y_absmax / 0xFF
        Y_zero_point = 0
    else:
        Y_scale = Y_absmax / 0x7F
        Y_zero_point = 128

    # Input quantization, same scheme.
    X_absmax = np.array([np.absolute(X).max()]).astype(np.float32)
    if X.min() >= 0:
        X_scale = X_absmax / 0xFF
        X_zero_point = 0
    else:
        X_scale = X_absmax / 0x7F
        X_zero_point = 128

    # Per-output-channel weight quantization.
    w_absmax = np.array([
        np.absolute(w[i, ...]).max() for i in range(w.shape[0])
    ]).astype(np.float32)
    w_scale = w_absmax / 0x7F
    w_zero_point = 128
    # Int8 path expects NHWC weight layout.
    w = np.transpose(w, (0, 2, 3, 1)).astype(np.float32)
    w_bytes = np.rint([
        w[i, ...] / w_scale[i] for i in range(w.shape[0])
    ]).astype(np.int8) + w_zero_point

    w_filler = core.CreateOperator(
        "Int8GivenTensorFill",
        [], ["wi"],
        shape=w.shape,
        values=w_bytes.astype(np.uint8).tobytes(),
        Y_zero_point=w_zero_point,
        Y_scales=w_scale,
        device_option=dc[1],
    )
    # Bias scale is the product of weight and input scales (int32 bias).
    b_scale = w_scale * X_scale
    b_zero_point = 0
    b_bytes = np.rint([
        b[i] / b_scale[i] for i in range(b.shape[0])
    ]).astype(np.int32)
    b_filler = core.CreateOperator(
        "Int8GivenIntTensorFill",
        [], ["bi"],
        shape=b.shape,
        values=b_bytes,
        Y_zero_point=b_zero_point,
        Y_scales=b_scale,
        device_option=dc[1],
    )
    sw2nhwc = core.CreateOperator(
        "NCHW2NHWC", ["Xi"], ["Xi_nhwc"],
        device_option=dc[1])
    quantize_X = core.CreateOperator(
        "Int8Quantize",
        ["Xi_nhwc"], ["Xi_quantized"],
        engine="DNNLOWP",
        device_option=dc[1],
        Y_zero_point=X_zero_point,
        Y_scale=X_scale[0],
    )
    fc = core.CreateOperator(
        'Int8FC',
        ['Xi_quantized', 'wi', 'bi'], ["Y_out"],
        engine="DNNLOWP",
        device_option=dc[1],
        Y_zero_point=Y_zero_point,
        Y_scale=Y_scale[0],
    )
    net = caffe2_pb2.NetDef()
    net.op.extend([w_filler, b_filler, sw2nhwc, quantize_X, fc])
    workspace.FeedBlob("Xi", X, dc[1])
    workspace.RunNetOnce(net)
    Y_out = workspace.FetchBlob("Y_out")

    MSE = np.square(np.subtract(Y, Y_out)).mean()
    if MSE > 0.005:
        print(Y.flatten())
        print(Y_out.flatten())
        print(np.max(np.abs(Y_out - Y)))
        print("MSE", MSE)
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
def _test_binary_op_graph(self, name, seed):
    """Compare a binary op lowered through Onnxifi/Glow with its fp16
    C2 emulation ("<name>FakeFp16") over 10 rounds of random inputs.

    `name` is the Caffe2 operator name (e.g. "Add", "Div"); results must
    match exactly (np.allclose default tolerances) since both sides
    emulate the same fp16 arithmetic.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()
    # First dimension is the batch size
    dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))
    A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
    B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
    # Avoid dividing by 0
    B[np.abs(B) < 1e-3] = 1e-3
    print(A.shape, B.shape)

    # Net to be onnxified and run through Glow.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["A", "B"])
    pred_net.external_output.append("C")
    pred_net.op.add().CopyFrom(core.CreateOperator(name, ["A", "B"], ["C"]))

    # Reference net: fp16 emulation of the same op on the C2 side.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "ref"
    pred_net_ref.external_input.extend(["A", "B"])
    pred_net_ref.external_output.append("C_ref")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            name + "FakeFp16",
            ["A", "B"],
            ["C_ref"],
        ))

    shape_hints = {"A": A.shape, "B": B.shape}
    pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                            shape_hints,
                                            debug=True,
                                            adjust_batch=True,
                                            use_onnx=False)
    print(pred_net_onnxified)
    # The whole net must have been swallowed into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("A", A)
    workspace.FeedBlob("B", B)
    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    num_iterations = 10
    for _ in range(num_iterations):
        # Fresh inputs each round; NOTE(review): B is not re-clamped away
        # from zero here, unlike the initial feed above.
        A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
        B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
        workspace.FeedBlob("A", A)
        workspace.FeedBlob("B", B)
        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("C_ref")
        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("C")
        # Results should be identical since we are comparing with the C2 emulation
        if not np.allclose(Y_c2, Y_glow):
            diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
            print_test_debug_info(
                name, {
                    "dims": dims,
                    "A": A,
                    "B": B,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff": diff
                })
            assert (0)
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv.

    Builds an LSTM model on top of data_parallel_model (GPU or CPU
    depending on `gpu`), forces deterministic initialization, trains for
    10 iterations on seeded random data, and returns the input-to-hidden
    weights from device 0 so runs with different device counts can be
    compared.
    '''
    def input_builder_fun(model):
        # Data/targets are fed manually below.
        return None

    def model_build_fun(model, loss_scale):
        # All sequences have full length T.
        workspace.FeedBlob(
            core.ScopedBlobReference("seq_lengths"),
            np.array([self.T] * self.batch_per_device, dtype=np.int32)
        )
        model.param_init_net.ConstantFill(
            [],
            "hidden_init",
            value=0.0,
            shape=[1, self.batch_per_device, self.hidden_dim]
        )
        model.param_init_net.ConstantFill(
            [],
            "cell_init",
            value=0.0,
            shape=[1, self.batch_per_device, self.hidden_dim]
        )

        output, _last_hidden, _, _last_state, = rnn_cell.LSTM(
            model=model,
            input_blob="data",
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=self.input_dim,
            dim_out=self.hidden_dim,
            scope="partest",
        )

        # A silly loss function
        loss = model.AveragedLoss(
            model.Sub([output, "target"], "dist"),
            "loss",
        )
        loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ITER = model.Iter("ITER")
        # Negative LR because WeightedSum adds grad * LR.
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, param_grad, LR], param)

        # Per-device param count must be total params / device count.
        assert len(
            model.GetParams()) == len(model.params) // len(model._devices)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        name="recurrent_test{}".format(devices),
    )

    self.T = 8
    self.batch_size = 64
    self.input_dim = 8
    self.hidden_dim = 31
    self.batch_per_device = self.batch_size // len(devices)

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=devices,
        optimize_gradient_memory=True,
        cpu_device=not gpu,
    )

    # Change all initialization to be ConstantFills so that
    # the everything is deterministic
    for op in model.param_init_net.Proto().op:
        if op.type.endswith('Fill'):
            op.type = 'ConstantFill'

    # Each run has same input, independent of number of gpus
    np.random.seed(20150210)
    for i in range(0, 10):
        full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
        full_target = np.random.rand(self.T, self.batch_size, self.hidden_dim)

        # Shard along the batch dimension (axis 1 of [T, B, D]).
        for (j, g) in enumerate(devices):
            st = j * self.batch_per_device
            en = st + self.batch_per_device
            data = full_data[:, st:en, :].astype(np.float32)
            targets = full_target[:, st:en, :].astype(np.float32)

            with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, g), data)
                workspace.FeedBlob(
                    "{}_{}/target".format(model._device_prefix, g), targets)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("{}_0/partest/i2h_w".format(
        model._device_prefix))
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv.

    Same FC toy model as the GPU-only variant, but parallelized with
    data_parallel_model.Parallelize (GPU or CPU via `gpu`) and trained
    through an optimizer builder (SGD with gradient-norm clipping)
    instead of a hand-written parameter update. Returns fc weights from
    device 0 after 10 seeded iterations.
    '''
    def input_builder_fun(model):
        # Data is fed manually below.
        return None

    def model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def add_optimizer(model):
        # SGD with fixed LR and global gradient-norm clipping.
        return optimizer.build_sgd(
            model, 0.1, policy="fixed", max_gradient_norm=5.0)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test{}".format(devices),
    )
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        optimizer_builder_fun=add_optimizer,
        devices=devices,
        cpu_device=not gpu,
    )

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])
        batch_per_device = batch_size // len(devices)

        for (j, g) in enumerate(devices):
            st = j * batch_per_device
            en = st + batch_per_device
            data = full_data[st:en, :].astype(np.float32)
            labels = full_labels[st:en].astype(np.float32)

            with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, g), data)
                workspace.FeedBlob(
                    "{}_{}/label".format(model._device_prefix, g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
# Command-line options for the benchmark run. `parser` and `mynet` are
# defined earlier in the file (outside this chunk).
parser.add_argument('--proto_type', type=str, default='',
                    help='empty or async_scheduling')
parser.add_argument('--async_threads', type=int, default=0,
                    help='async_thread_pool_size')
parser.add_argument('--batch_size', type=int, default=1, help='Batch Size')
parser.add_argument('--steps', type=int, default=10,
                    help='Number of steps to measure.')
args, _ = parser.parse_known_args()

workspace.ResetWorkspace()
# Forward the requested async thread-pool size to Caffe2 via init flags.
workspace.GlobalInit([
    'caffe2',
    '--caffe2_log_level=2',
    '--caffe2_net_async_thread_pool_size=' + str(args.async_threads)
])

init_net = mynet.init_net
predict_net = mynet.predict_net
# you must name it something
predict_net.name = "googlenet_predict"
# Any non-empty --proto_type switches the net executor to async scheduling.
if args.proto_type != '':
    predict_net.type = 'async_scheduling'
    print('Using async scheduling.')
#predict_net.type = 'prof_dag'
def test_layernorm(self, seed):
    """Compare LayerNorm lowered through Onnxifi/Glow against the
    LayerNormFakeFP16NNPI C2 emulation; outputs must match when cast to
    fp16."""
    np.random.seed(seed)
    # Reset the workspace
    size = 4
    input_channels = 4
    batch_size = 1
    axis = 1
    epsilon = 1e-4
    workspace.ResetWorkspace()

    dims = np.array(([batch_size, input_channels, size, size]))
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    # gamma/beta cover all normalized (post-axis) dimensions.
    gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
    beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

    # Net to be onnxified.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "gamma", "beta"])
    pred_net.external_output.extend(["Y", "mean", "rstd"])
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNorm",
            ["X", "gamma", "beta"],
            ["Y", "mean", "rstd"],
            axis=1,
            epsilon=epsilon,
            elementwise_affine=True
        )
    )

    # fp16-emulation reference net.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred_ref"
    pred_net_ref.external_input.extend(["X", "gamma", "beta"])
    pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNormFakeFP16NNPI",
            ["X", "gamma", "beta"],
            ["Y", "mean", "rstd"],
            axis=1,
            epsilon=epsilon,
            elementwise_affine=True
        )
    )

    # NOTE(review): variable name is a typo for "shape_hints"; harmless
    # since it is passed positionally.
    shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hits,
        debug=True,
        adjust_batch=True,
        use_onnx=False
    )
    # The whole net must collapse into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("gamma", gamma)
    workspace.FeedBlob("beta", beta)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)
    # Both nets write the same output blob "Y"; the reference is fetched
    # before the onnxified net overwrites it.
    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
        print_test_debug_info(
            "layernorm",
            {
                "seed": seed,
                "size": size,
                "input_channels": input_channels,
                "batch_size": batch_size,
                "epsilon": epsilon,
                "axis": axis,
                "X": X,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "diff_Y": diff_Y,
            }
        )
        assert(0)
def load_save(self, src_device_type, src_gpu_id,
              dst_device_type, dst_gpu_id):
    """Round-trip blobs of every supported dtype through Save/Load.

    Feeds one 2x3 array per dtype on the source device, saves them to a
    temporary db, then exercises the Load operator in every combination
    of keep_device / load_all / explicit blob lists, checking that values,
    dtypes, and recorded device details survive the round trip.

    Fixes vs. original:
      * np.bool -> np.bool_ (the `np.bool` alias was removed in
        NumPy 1.24; it would raise AttributeError here).
      * tempfile.mkdtemp() is called before the try block so the
        `finally` cleanup cannot hit a NameError if directory creation
        itself fails.
    """
    workspace.ResetWorkspace()
    # One test array per serializable dtype.
    dtypes = [np.float16, np.float32, np.float64, np.bool_, np.int8,
              np.int16, np.int32, np.int64, np.uint8, np.uint16]
    arrays = [np.random.permutation(6).reshape(2, 3).astype(T)
              for T in dtypes]
    # GPU id is only meaningful for GPU device types.
    assume(core.IsGPUDeviceType(src_device_type) or src_gpu_id == 0)
    assume(core.IsGPUDeviceType(dst_device_type) or dst_gpu_id == 0)
    src_device_option = core.DeviceOption(
        src_device_type, src_gpu_id)
    dst_device_option = core.DeviceOption(
        dst_device_type, dst_gpu_id)

    for i, arr in enumerate(arrays):
        self.assertTrue(workspace.FeedBlob(str(i), arr, src_device_option))
        self.assertTrue(workspace.HasBlob(str(i)))

    # Create the temp dir OUTSIDE the try so the finally-cleanup never
    # references an unbound name if mkdtemp raises.
    tmp_folder = tempfile.mkdtemp()
    try:
        # Saves the blobs to a local db.
        op = core.CreateOperator(
            "Save",
            [str(i) for i in range(len(arrays))], [],
            absolute_path=1,
            db=os.path.join(tmp_folder, "db"), db_type=self._db_type)
        self.assertTrue(workspace.RunOperatorOnce(op))

        # Reset the workspace so that anything we load is surely loaded
        # from the serialized proto.
        workspace.ResetWorkspace()
        self.assertEqual(len(workspace.Blobs()), 0)

        def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll):
            """A helper subfunction to test keep and not keep."""
            op = core.CreateOperator(
                "Load",
                [], blobs,
                absolute_path=1,
                db=os.path.join(tmp_folder, "db"), db_type=self._db_type,
                device_option=dst_device_option,
                keep_device=keep_device,
                load_all=loadAll)
            self.assertTrue(workspace.RunOperatorOnce(op))
            for i, arr in enumerate(arrays):
                self.assertTrue(workspace.HasBlob(str(i)))
                fetched = workspace.FetchBlob(str(i))
                self.assertEqual(fetched.dtype, arr.dtype)
                np.testing.assert_array_equal(
                    workspace.FetchBlob(str(i)), arr)
                # Verify the blob remembers the expected device.
                proto = caffe2_pb2.BlobProto()
                proto.ParseFromString(workspace.SerializeBlob(str(i)))
                self.assertTrue(proto.HasField('tensor'))
                self.assertEqual(proto.tensor.device_detail.device_type,
                                 device_type)
                if core.IsGPUDeviceType(device_type):
                    self.assertEqual(proto.tensor.device_detail.device_id,
                                     gpu_id)

        blobs = [str(i) for i in range(len(arrays))]
        # Load using device option stored in the proto, i.e.
        # src_device_option
        _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
        # Load again, but this time load into dst_device_option.
        _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)
        # Load back to the src_device_option to see if both paths are able
        # to reallocate memory.
        _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
        # Reset the workspace, and load directly into the
        # dst_device_option.
        workspace.ResetWorkspace()
        _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)
        # Test load all which loads all blobs in the db into the
        # workspace.
        workspace.ResetWorkspace()
        _LoadTest(1, src_device_type, src_gpu_id, [], 1)
        # Load again making sure that overwrite functionality works.
        _LoadTest(1, src_device_type, src_gpu_id, [], 1)
        # Load again with different device.
        _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
        workspace.ResetWorkspace()
        _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
        workspace.ResetWorkspace()
        _LoadTest(1, src_device_type, src_gpu_id, blobs, 1)
        workspace.ResetWorkspace()
        _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 1)
    finally:
        # clean up temp folder.
        try:
            shutil.rmtree(tmp_folder)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
def testBlobNameOverrides(self):
    """Save blobs under overridden names and load them back.

    Checks that: Save rejects blob_name_overrides combined with
    strip_prefix; Save with overrides stores under the new names;
    load_all restores the new names; and source_blob_names lets Load
    remap stored names onto fresh target blobs.
    """
    original_names = ['blob_a', 'blob_b', 'blob_c']
    new_names = ['x', 'y', 'z']
    blobs = [np.random.permutation(6) for i in range(3)]
    for i, blob in enumerate(blobs):
        self.assertTrue(workspace.FeedBlob(original_names[i], blob))
        self.assertTrue(workspace.HasBlob(original_names[i]))
    self.assertEqual(len(workspace.Blobs()), 3)

    try:
        # Saves the blobs to a local db.
        tmp_folder = tempfile.mkdtemp()
        # Overrides together with strip_prefix must be rejected.
        with self.assertRaises(RuntimeError):
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "Save", original_names, [],
                    absolute_path=1,
                    strip_prefix='.temp',
                    blob_name_overrides=new_names,
                    db=os.path.join(tmp_folder, "db"),
                    db_type=self._db_type
                )
            )
        # Valid save: blobs stored under new_names.
        self.assertTrue(
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "Save", original_names, [],
                    absolute_path=1,
                    blob_name_overrides=new_names,
                    db=os.path.join(tmp_folder, "db"),
                    db_type=self._db_type
                )
            )
        )
        self.assertTrue(workspace.ResetWorkspace())
        self.assertEqual(len(workspace.Blobs()), 0)
        # load_all restores everything under the overridden names.
        self.assertTrue(
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "Load", [], [],
                    absolute_path=1,
                    db=os.path.join(tmp_folder, "db"),
                    db_type=self._db_type,
                    load_all=1
                )
            )
        )
        self.assertEqual(len(workspace.Blobs()), 3)
        for i, name in enumerate(new_names):
            self.assertTrue(workspace.HasBlob(name))
            self.assertTrue((workspace.FetchBlob(name) == blobs[i]).all())

        # moved here per @cxj's suggestion
        load_new_names = ['blob_x', 'blob_y', 'blob_z']
        # load 'x' into 'blob_x'
        self.assertTrue(
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "Load", [], load_new_names[0:1],
                    absolute_path=1,
                    db=os.path.join(tmp_folder, "db"),
                    db_type=self._db_type,
                    source_blob_names=new_names[0:1]
                )
            )
        )
        # we should have 'blob_a/b/c/' and 'blob_x' now
        self.assertEqual(len(workspace.Blobs()), 4)
        for i, name in enumerate(load_new_names[0:1]):
            self.assertTrue(workspace.HasBlob(name))
            self.assertTrue((workspace.FetchBlob(name) == blobs[i]).all())
        self.assertTrue(
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "Load", [], load_new_names[0:3],
                    absolute_path=1,
                    db=os.path.join(tmp_folder, "db"),
                    db_type=self._db_type,
                    source_blob_names=new_names[0:3]
                )
            )
        )
        # we should have 'blob_a/b/c/' and 'blob_x/y/z' now
        self.assertEqual(len(workspace.Blobs()), 6)
        for i, name in enumerate(load_new_names[0:3]):
            self.assertTrue(workspace.HasBlob(name))
            self.assertTrue((workspace.FetchBlob(name) == blobs[i]).all())
    finally:
        # clean up temp folder.
        try:
            shutil.rmtree(tmp_folder)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
def test_meta_net_def_net_runs(self):
    """Export a predictor with an extra_init_net to a minidb, reload it in
    a fresh workspace, and verify init / global-init / predict nets all
    behave as expected (including running with a reshaped input)."""
    # Feed the model parameters prepared by the fixture.
    for param, value in viewitems(self.params):
        workspace.FeedBlob(param, value)

    # extra_init_net fills "data" with ones in place.
    extra_init_net = core.Net('extra_init')
    extra_init_net.ConstantFill('data', 'data', value=1.0)
    pem = pe.PredictorExportMeta(
        predict_net=self.predictor_export_meta.predict_net,
        parameters=self.predictor_export_meta.parameters,
        inputs=self.predictor_export_meta.inputs,
        outputs=self.predictor_export_meta.outputs,
        shapes=self.predictor_export_meta.shapes,
        extra_init_net=extra_init_net,
        net_type='dag',
    )

    db_type = 'minidb'
    db_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".{}".format(db_type))
    pe.save_to_db(
        db_type=db_type,
        db_destination=db_file.name,
        predictor_export_meta=pem)

    # Fresh workspace: everything below must come from the db.
    workspace.ResetWorkspace()

    meta_net_def = pe.load_from_db(
        db_type=db_type,
        filename=db_file.name,
    )

    self.assertTrue("data" not in workspace.Blobs())
    self.assertTrue("y" not in workspace.Blobs())

    init_net = pred_utils.GetNet(meta_net_def, pc.PREDICT_INIT_NET_TYPE)

    # 0-fills externalblobs blobs and runs extra_init_net
    workspace.RunNetOnce(init_net)
    self.assertTrue("data" in workspace.Blobs())
    self.assertTrue("y" in workspace.Blobs())

    print(workspace.FetchBlob("data"))
    # "data" was set to ones by extra_init_net; "y" stays zero-filled.
    np.testing.assert_array_equal(
        workspace.FetchBlob("data"), np.ones(shape=(1, 5)))
    np.testing.assert_array_equal(
        workspace.FetchBlob("y"), np.zeros(shape=(1, 10)))

    # Load parameters from DB
    global_init_net = pred_utils.GetNet(
        meta_net_def, pc.GLOBAL_INIT_NET_TYPE)
    workspace.RunNetOnce(global_init_net)

    # Run the net with a reshaped input and verify we are
    # producing good numbers (with our custom implementation)
    workspace.FeedBlob("data", np.random.randn(2, 5).astype(np.float32))
    predict_net = pred_utils.GetNet(meta_net_def, pc.PREDICT_NET_TYPE)
    self.assertEqual(predict_net.type, 'dag')
    workspace.RunNetOnce(predict_net)
    # y must equal data @ y_w.T + y_b (a plain FC).
    np.testing.assert_array_almost_equal(
        workspace.FetchBlob("y"),
        workspace.FetchBlob("data").dot(self.params["y_w"].T) +
        self.params["y_b"])
def temp_workspace(name=b"temp_ws"):
    """Generator context manager that runs its body in a scratch workspace.

    Switches to the workspace `name` (creating it if needed), yields to
    the managed block, and then — even if the block raised — resets the
    scratch workspace and switches back to the previously-current one.

    Bug fix: the original did not wrap the yield in try/finally, so an
    exception inside the `with` body skipped cleanup and left the temp
    workspace active (and populated).
    """
    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace(name, True)
    try:
        yield
    finally:
        workspace.ResetWorkspace()
        workspace.SwitchWorkspace(old_ws_name)
def run_model(self, V, gpu_devices, cpu_indices):
    """Data-parallel training of a model with a sparse embedding table.

    `V` is the vocabulary size of the `vecs` table. When `cpu_indices`
    is true the Gather runs on CPU and a dense ScatterWeightedSum updates
    the CPU-side table; otherwise the table is copied to each GPU and
    updated with SparseMomentumSGDUpdate. Runs 10 seeded iterations and
    returns [updated table, fc weights from gpu 0].
    """
    def input_builder_fun(model):
        # Indices/labels are fed manually below.
        return None

    def model_build_fun(model, loss_scale):
        if cpu_indices:
            # Gather on CPU, then ship the gathered rows to GPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather(
                    [self.vecs, 'indices'], 'gathered_cpu')

            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
        else:
            # Keep a per-GPU copy of the table and gather on device.
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs",
            )
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(flattened, "fc", 16 * 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                # Dense param: plain SGD via WeightedSum.
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse param: momentum SGD on the touched rows only.
                param_momentum = model.param_init_net.ConstantFill(
                    [param],
                    param + '_momentum',
                    value=0.0,
                )
                model.net.SparseMomentumSGDUpdate(
                    [
                        param_grad.values,
                        param_momentum,
                        LR,
                        param,
                        param_grad.indices,
                    ],
                    [param_grad.values, param_momentum, param],
                    momentum=0.1,
                    nesterov=0,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )

    # CPU-side shared state: iteration counter, LR, and the vecs table.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            self.vecs = model.param_init_net.UniformFill(
                [], "vecs", shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    if cpu_indices:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum([
                        param, self.ONE_CPU,
                        param_grad.indices,
                        param_grad.values,
                        self.LR
                    ], self.vecs)
    else:
        # Mirror gpu 0's updated table back to the CPU blob.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        # Permutation guarantees indices are unique within the batch.
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16)
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en, :].astype(np.int32)
            labels = full_labels[st:en].astype(np.float32)

            # Indices live on CPU or GPU depending on the mode under test.
            device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            if not cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(self.vecs, orig_vecs)
            if not cpu_indices:
                for g in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(g),
                        orig_vecs,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2:
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                assert n == nu, "We cannot have duplicate indices"

    # Sanity check to see the vecs were updated
    self.assertFalse(
        np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
    return [
        workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
        workspace.FetchBlob("gpu_0/fc_w")
    ]
def CheckSimple(self, op, inputs, input_to_check, outputs_with_grads,
                grad_ops=None, input_device_options=None):
    """Checks the operator in a very simple fashion by stacking a sum
    of squares on the top.

    Inputs:
      op: the operator to be checked.
      inputs: the input data in numpy arrays.
      input_to_check: an index specifying which input blob we should check.
      outputs_with_grads: indices specifying which output blobs will we
          need to check gradients with. For these outputs, we will collect
          a squared sum and also feed in their gradients.
      grad_ops: the gradient operators. If not given, we will get the
          gradient operators from the gradient registry.
      input_device_options: an optional mapping from input names to
          DeviceOptions (to override the default DeviceOption)
    Outputs:
      (boolean, grad, grad_estimate): True if it passes, False if it does
          not pass, plus the analytic and numeric gradients.
    """
    if input_device_options is None:
        input_device_options = {}
    # Entering the checker workspace; restored at the end of the check.
    old_ws_name = workspace.CurrentWorkspace()
    if self._workspace_name != old_ws_name:
        workspace.SwitchWorkspace(self._workspace_name, True)

    op.device_option.CopyFrom(self._device_option)
    if grad_ops is None:
        # TODO(jiayq): use the gradient registration instead of the old
        # hack.
        # NOTE(review): when grad_ops IS supplied by the caller, g_input is
        # never assigned and the lookup below raises NameError — confirm
        # whether callers are expected to always pass grad_ops=None.
        grad_ops, g_input = core.GradientRegistry.GetGradientForOp(
            op, [s + '_grad' for s in op.output])

    # sanity check: we only support dense gradient checking in this checker
    # (fix: dropped a stray empty-string literal that was concatenated onto
    # the assertion message)
    assert all(type(g) is not core.GradientSlice for g in g_input), \
        "This checker does not support sparse gradient yet."

    dims_to_check = inputs[input_to_check].size
    # First, feed in the input.
    for i, arr in enumerate(inputs):
        workspace.FeedBlob(
            op.input[i], arr,
            input_device_options.get(op.input[i], self._device_option))

    # Get the loss and gradient for the original (unperturbed) input.
    input_name = op.input[input_to_check]
    grad_name = g_input[input_to_check]
    loss, grad = self.GetLossAndGrad(
        op, grad_ops, inputs[input_to_check], input_name, grad_name,
        outputs_with_grads
    )
    grad_estimate = np.zeros_like(inputs[input_to_check])
    if grad_estimate.shape != grad.shape:
        raise Exception(
            "Mismatched gradient shapes: estimated ({}), grad ({})".format(
                grad_estimate.shape, grad.shape))

    # Central-difference estimate of the gradient, one element at a time.
    for current_dim in range(dims_to_check):
        # Positive perturbation
        inputs[input_to_check].flat[current_dim] += self._stepsize
        pos_loss, _ = self.GetLossAndGrad(
            op, grad_ops, inputs[input_to_check], input_name, grad_name,
            outputs_with_grads
        )
        # Negative perturbation
        inputs[input_to_check].flat[current_dim] -= self._stepsize * 2
        neg_loss, _ = self.GetLossAndGrad(
            op, grad_ops, inputs[input_to_check], input_name, grad_name,
            outputs_with_grads
        )
        # Recover the original value
        inputs[input_to_check].flat[current_dim] += self._stepsize
        grad_estimate.flat[current_dim] = (
            pos_loss - neg_loss) / self._stepsize / 2

    # Now, check correctness of the analytic gradient vs the estimate.
    fail_mat = ~np.isclose(
        grad, grad_estimate, atol=self._threshold, rtol=self._threshold)
    if np.any(fail_mat):
        idx = np.flatnonzero(fail_mat)
        print('Failed. [idx, grad, grad_estimate] are:')
        print(np.vstack([idx, grad.flat[idx], grad_estimate.flat[idx]]).T)
        ret = False
    else:
        ret = True

    # After finishing, cleaning up things.
    if self._workspace_name != old_ws_name:
        # We reset the workspace to make sure everything intermediate is
        # cleaned up. Note that there is no need to delete a workspace -
        # when empty it takes a very limited amount of memory.
        workspace.ResetWorkspace()
        workspace.SwitchWorkspace(old_ws_name)
    return ret, grad, grad_estimate
def run_model(self, V, gpu_devices):
    """Build and run a sparse (Gather-based) FC model data-parallel over
    gpu_devices, and verify that each gathered gradient slice matches the
    number of indices fed. V is the vocabulary size of the gathered blobs.
    """
    def input_builder_fun(model):
        # No input readers; blobs are fed manually below.
        return None

    def model_build_fun(model, loss_scale):
        # Copy the three CPU-side vecs to GPU; vec_2 (the FC input) is
        # deliberately NOT registered as a trainable param.
        gpu_vecs_gathered = []
        gpu_vecs = []
        for num, vec in enumerate(self.vecs):
            gpu_vec = model.param_init_net.CopyCPUToGPU(
                vec,
                'gpuvec_{}'.format(num),
            )
            if num != 2:
                model.params.append(gpu_vec)
            gpu_vecs.append(gpu_vec)
        # Gather rows from every vec with the same 'indices' blob.
        for num, gpu_vec in enumerate(gpu_vecs):
            gpu_vec_gathered = model.net.Gather(
                [gpu_vec, 'indices'],
                ['gpu_vec_gathered_{}'.format(num)])
            gpu_vecs_gathered.append(gpu_vec_gathered)

        assert len(gpu_vecs_gathered) == 3

        # FC takes (input, weight, bias) == (gathered vec_2, vec_0, vec_1).
        fc = model.net.FC(
            [
                gpu_vecs_gathered[2],
                gpu_vecs_gathered[0],
                gpu_vecs_gathered[1],
            ],
            ['fc'],
        )
        _, loss = model.net.SoftmaxWithLoss(
            [fc, 'label'],
            ['ce_loss', 'avg_loss'],
            only_loss=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.net.Print(loss, [], limit=10)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse update; NOTE(review): the final ONE is used as the
                # scalar weight here (not LR) — presumably intentional for
                # this test, verify against the dense branch.
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad.values,
                        ONE,
                    ],
                    param,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )
    batch_size = 32
    batch_per_device = batch_size // len(gpu_devices)

    # CPU-side state: iteration counter, LR, and the three gatherable blobs.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            '''
            self.vecs consists of 3 big blobs on which we call Gather:
            1) FC weights, shape=(V, 16)
            2) FC bias, shape=(V)
            3) FC input, shape=(batch_per_device, 16)
            '''
            self.vecs = [
                model.param_init_net.UniformFill(
                    [], "vec_{}".format(num), shape=[V, 16])
                for num in range(2)
            ]
            self.vecs.append(
                model.param_init_net.UniformFill(
                    [],
                    "vec_2",
                    shape=[batch_per_device, 16]))
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs: copy the trained GPU copies (all but vec_2) back
    # to their CPU masters after each run of the net.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        for num, vec in enumerate(self.vecs[:-1]):
            model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

    # Each run has same input, independent of number of gpus
    # (the seed is re-applied every iteration on purpose).
    for i in range(0, 10):
        np.random.seed(2603)
        full_indices = np.random.permutation(V)[:batch_size].reshape(
            batch_size)
        full_labels = full_indices[:] % batch_per_device

        # Shard the batch across devices and feed indices/labels on-GPU.
        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en].astype(np.int32)
            labels = full_labels[st:en].astype(np.int32)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = [
                np.random.rand(V, 16).astype(np.float32),
                np.random.rand(V).astype(np.float32),
                np.random.rand(V, 16).astype(np.float32),
            ]
            for vec, orig_vec in zip(self.vecs, orig_vecs):
                workspace.FeedBlob(vec, orig_vec)
            # Overwrite every per-GPU copy with the same values too.
            for g in gpu_devices:
                for num, orig_vec in enumerate(orig_vecs):
                    workspace.FeedBlob(
                        "gpu_{}/gpuvec_{}".format(g, num),
                        orig_vec,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

        # Every gathered-gradient slice must have exactly one row per
        # index, otherwise the sparse update would read/write out of range.
        idx = workspace.FetchBlob('gpu_0/indices')
        grad_slices = [
            workspace.FetchBlob(
                'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num))
            for g in gpu_devices for num in range(2)
        ]
        for grad_slice in grad_slices:
            assert len(idx) == len(grad_slice), (
                'Number of indices {} is not same as number of gradient '
                'slices {}. This might lead to illegal memory access'.
                format(len(idx), len(grad_slice)))
def test_fc_with_axis(self, n, m, c, h, w, axis, gc, dc): X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 k = reduce((lambda x, y: x * y), [n, c, h, w][axis - 4:]) nn = reduce((lambda x, y: x * y), [n, c, h, w][:axis]) W = np.random.rand(m, k).astype(np.float32) - 0.5 b = np.random.rand(m).astype(np.float32) - 0.5 dY = np.random.rand(nn, m).astype(np.float32) - 0.5 op0 = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"], axis=axis, device_option=dc[0]) op0_bw = core.CreateOperator('FCGradient', ['X', 'W', 'dY'], ["dW", "db"], axis=axis, device_option=dc[0]) workspace.ResetWorkspace() workspace.FeedBlob('X', X, dc[0]) workspace.FeedBlob('W', W, dc[0]) workspace.FeedBlob('b', b, dc[0]) workspace.RunOperatorOnce(op0) Y0 = workspace.FetchBlob('Y') workspace.FeedBlob('dY', dY, dc[0]) workspace.RunOperatorOnce(op0_bw) dW0 = workspace.FetchBlob('dW') db0 = workspace.FetchBlob('db') op1 = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"], axis=axis, device_option=dc[1]) op1_bw = core.CreateOperator('FCGradient', ['X', 'W', 'dY'], ["dW", "db"], axis=axis, device_option=dc[1]) workspace.SwitchWorkspace("_device_check_", True) workspace.FeedBlob('X', X, dc[1]) workspace.FeedBlob('W', W, dc[1]) workspace.FeedBlob('b', b, dc[1]) workspace.RunOperatorOnce(op1) Y1 = workspace.FetchBlob('Y') workspace.FeedBlob('dY', dY, dc[1]) workspace.RunOperatorOnce(op1_bw) dW1 = workspace.FetchBlob('dW') db1 = workspace.FetchBlob('db') Y0 = Y0.flatten() Y1 = Y1.flatten() if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): print(Y1) print(Y0) print(np.max(np.abs(Y1 - Y0))) self.assertTrue(False) dW0 = dW0.flatten() dW1 = dW1.flatten() if not np.allclose(dW0, dW1, atol=0.01, rtol=0.01): print(dW1) print(dW0) print(np.max(np.abs(dW1 - dW0))) self.assertTrue(False) db0 = db0.flatten() db1 = db1.flatten() if not np.allclose(db0, db1, atol=0.01, rtol=0.01): print(db1) print(db0) print(np.max(np.abs(db1 - db0))) self.assertTrue(False)
def test_convolution_sync(self, net_type, num_workers, engine, gc, dc):
    """Build a binary tree of deterministic conv layers and run it `iters`
    times; every run must produce bit-identical gradients regardless of
    net execution type / worker count (checks scheduler synchronization).
    """
    m = ModelHelper(name="test_model")
    n = 1          # batch size
    d = 2          # channel count (in == out for every conv)
    depth = 3      # tree depth; leaves are the external inputs
    iters = 5      # number of repeated runs to compare
    h = 5
    w = 5
    workspace.ResetWorkspace()

    use_cudnn = engine == "CUDNN"

    np.random.seed(1701)
    # Build a binary tree of conv layers, summing at each node.
    # Blob naming: "<level>_<index>" with leaves at level `depth`.
    for i in reversed(range(depth)):
        for j in range(2**i):
            bottom_1 = "{}_{}".format(i + 1, 2 * j)
            bottom_2 = "{}_{}".format(i + 1, 2 * j + 1)
            mid_1 = "{}_{}_m".format(i + 1, 2 * j)
            mid_2 = "{}_{}_m".format(i + 1, 2 * j + 1)
            top = "{}_{}".format(i, j)
            w1, b1, w2, b2 = np.random.randn(4).tolist()
            # Both convs are constant-initialized and deterministic so
            # repeated runs are exactly reproducible.
            brew.conv(
                m,
                bottom_1,
                mid_1,
                dim_in=d,
                dim_out=d,
                kernel=3,
                weight_init=("ConstantFill", {
                    "value": w1
                }),
                bias_init=("ConstantFill", {
                    "value": b1
                }),
                cudnn_state=np.random.randint(0, 3),
                stride=1,
                pad=1,
                deterministic=1,
                use_cudnn=use_cudnn,
                engine=engine,
            )
            brew.conv(
                m,
                bottom_2,
                mid_2,
                dim_in=d,
                dim_out=d,
                kernel=3,
                stride=1,
                pad=1,
                weight_init=("ConstantFill", {
                    "value": w2
                }),
                bias_init=("ConstantFill", {
                    "value": b2
                }),
                deterministic=1,
                cudnn_state=np.random.randint(0, 3),
                use_cudnn=use_cudnn,
                engine=engine,
            )
            m.net.Sum([mid_1, mid_2], top)

    # Loss on the tree root, then gradients back to all leaves.
    m.net.Flatten(["0_0"], ["0_0_flat"])
    m.net.SquaredL2Distance(["0_0_flat", "label"], "xent")
    m.net.AveragedLoss("xent", "loss")
    input_to_grad = m.AddGradientOperators(["loss"])
    m.Proto().device_option.CopyFrom(gc)
    m.param_init_net.Proto().device_option.CopyFrom(gc)
    m.Proto().type = net_type
    m.Proto().num_workers = num_workers
    self.ws.run(m.param_init_net)

    def run():
        # Re-seed so every run feeds identical leaf inputs and label.
        import numpy as np
        np.random.seed(1701)
        input_blobs = ["{}_{}".format(depth, j) for j in range(2**depth)]
        for input_blob in input_blobs:
            self.ws.create_blob(input_blob).feed(
                np.random.randn(n, d, h, w).astype(np.float32),
                device_option=gc)
            self.ws.create_blob("label").feed(
                np.random.randn(n, d * h * w).astype(np.float32),
                device_option=gc)
        self.ws.run(m.net)
        gradients = [
            self.ws.blobs[str(input_to_grad[input_blob])].fetch()
            for input_blob in input_blobs
        ]
        return gradients

    outputs = [run() for _ in range(iters)]
    # All runs must agree exactly with the first, and match a golden
    # squared-sum value (pinned for this fixed seed/topology).
    for output in outputs[1:]:
        np.testing.assert_array_equal(outputs[0], output)
        np.testing.assert_allclose(
            np.sum(np.square(output)),
            1763719461732352.0,
            rtol=1e-5)
def __exit__(self, *args): if self.is_cleanup: workspace.ResetWorkspace() if self.ws_name is not None: workspace.SwitchWorkspace(self.org_ws)
def test_slws_fused_8bit_rowwise_all_same(self, seed):
    """Compare SparseLengthsWeightedSumFused8BitRowwise against its
    FakeFP16NNPI reference when every data row is identical and all
    indices are zero; the Onnxifi-lowered net must match exactly.
    """
    # Seed the RNG so segment shapes/weights are reproducible per example.
    np.random.seed(seed)
    workspace.ResetWorkspace()
    n = 1
    m = 2
    # Single row of constant values 0.1.
    data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1

    max_segments = 5
    max_segment_length = 200
    # number of segments to run
    num_lengths = np.random.randint(1, max_segments + 1)
    lengths = np.random.randint(0, max_segment_length + 1,
                                size=num_lengths).astype(np.int32)
    num_indices = np.sum(lengths)
    # All lookups hit row 0 (the only row).
    indices = np.zeros(num_indices, dtype=np.int64)
    weights = np.random.uniform(low=-0.5, high=0.5,
                                size=[len(indices)]).astype(np.float32)

    # Net under test (to be lowered through Onnxifi).
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Reference net: fake-FP16 NNPI emulation run on the host.
    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Quantize the float data to the fused 8-bit rowwise format.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                            ["quantized_data"]))

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # The whole pred net should have been swallowed into one Onnxifi op.
    num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                            for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    # On mismatch, dump full debug context before failing hard.
    if not np.allclose(Y_c2, Y_glow):
        print_test_debug_info(
            "slws_fused_8bit_rowwise",
            {
                "seed": seed,
                "indices": indices,
                "data": data,
                "lengths": lengths,
                "weights": weights,
                "Y_c2": Y_c2,
                "Y_glow": Y_glow,
                "diff": Y_glow - Y_c2,
                "rowwise_diff": (Y_glow - Y_c2)[:, 0],
            },
        )
        assert 0
def test_convolution_relu_fusion(self, stride, pad, kernel, size,
                                 input_channels, output_channels,
                                 batch_size, use_bias, group, gc, dc):
    """Check Conv+ReLU fusion on device dc[1] against an unfused
    Conv-then-Relu reference on dc[0], both for a manually-built
    ConvFusion op and for the graph-level auto-fusion pass.
    """
    # Reference: plain Conv followed by in-place Relu on dc[0].
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[0])
    relu = core.CreateOperator("Relu", ["Y0"], ["Y0"], device_option=dc[0])
    # Manual fusion for Conv + ReLU (fusion_type=1 selects Conv+ReLU).
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1"] if use_bias else ["X1", "w1"],
        ["Y1"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        fusion_type=1,
        device_option=dc[1])
    X = np.random.rand(batch_size, input_channels * group, size,
                       size).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    # Run everything in a scratch workspace; restored at the end.
    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    workspace.FeedBlob('X0', X, dc[0])
    workspace.FeedBlob('w0', w, dc[0])
    workspace.FeedBlob('b0', b, dc[0])
    workspace.RunOperatorOnce(conv)
    workspace.RunOperatorOnce(relu)
    Y0 = workspace.FetchBlob('Y0')

    # Manually fused op must reproduce the reference output.
    workspace.ResetWorkspace()
    workspace.FeedBlob('X1', X, dc[1])
    workspace.FeedBlob('w1', w, dc[1])
    workspace.FeedBlob('b1', b, dc[1])
    workspace.RunOperatorOnce(conv_fusion)
    Y1 = workspace.FetchBlob('Y1')

    if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
        print(Y1.flatten())
        print(Y0.flatten())
        print(np.max(np.abs(Y1 - Y0)))
        self.assertTrue(False)

    # Auto fusion for Conv + ReLU: build a two-op net on dc[1] and let the
    # optimizer collapse it into a single ConvFusion op.
    workspace.ResetWorkspace()
    old_net = caffe2_pb2.NetDef()
    conv_old = caffe2_pb2.OperatorDef()
    conv_old.CopyFrom(conv)
    conv_old.device_option.CopyFrom(dc[1])
    relu_old = caffe2_pb2.OperatorDef()
    relu_old.CopyFrom(relu)
    relu_old.device_option.CopyFrom(dc[1])
    old_net.op.extend([conv_old, relu_old])
    workspace.FeedBlob('X0', X, dc[1])
    workspace.FeedBlob('w0', w, dc[1])
    workspace.FeedBlob('b0', b, dc[1])
    net = core.Net("net")
    net.Proto().CopyFrom(old_net)
    optimizeForIDEEP(net)
    # The pass must leave exactly one op, and it must be the fused one.
    self.assertTrue(len(net.Proto().op) == 1)
    self.assertTrue(net.Proto().op[0].type == "ConvFusion")
    workspace.RunOperatorOnce(net.Proto().op[0])
    Y2 = workspace.FetchBlob('Y0')
    if not np.allclose(Y0, Y2, atol=0.01, rtol=0.01):
        print(Y2.flatten())
        print(Y0.flatten())
        print(np.max(np.abs(Y2 - Y0)))
        self.assertTrue(False)
    workspace.SwitchWorkspace(old_ws_name)
def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim,
                                 batch_size, max_weight):
    """Randomized comparison of SparseLengthsWeightedSumFused8BitRowwise
    (lowered through Onnxifi) against the FakeFP16NNPI reference op;
    any relative deviation at all is treated as a failure.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    lengths = np.random.choice(np.arange(1, num_rows),
                               batch_size).astype(np.int32)

    # One segment per batch element, each drawing `length` random rows.
    indices = []
    for length in lengths:
        indices.extend(np.random.choice(np.arange(1, num_rows), length))
    indices = np.asarray(indices).astype(np.int64)

    weights = np.random.uniform(
        low=0,
        high=max_weight,
        size=[len(indices)]).astype(np.float32)
    # Guard against generating an index list too large for the backend.
    assert (len(weights) < 64000)

    # Net under test (to be lowered through Onnxifi).
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Host-side fake-FP16 reference net.
    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Quantize the float table into the fused 8-bit rowwise layout.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                            ["quantized_data"]))

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # The whole pred net should be fused into a single Onnxifi op.
    num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                            for o in onnxified_net.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    # Relative difference; epsilon guards against division by zero.
    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if num_offenders > 0:
        print_test_debug_info(
            "slws_fused_8bit_rowwise_inv_scale",
            {
                "seed": seed,
                "num_rows": num_rows,
                "embedding_dim": embedding_dim,
                "batch_size": batch_size,
                "max_weight": max_weight,
                "indices": indices,
                "data": data.shape,
                "lengths": lengths,
                "weights": weights,
                "Y_glow": Y_glow,
                "Y_ref": Y_ref,
                "diff": diff,
                "rowwise_diff": np.max(diff, axis=1),
            },
        )
        assert 0
def testResetWorkspace(self): self.assertEqual( workspace.RunNetOnce(self.net.Proto().SerializeToString()), True) self.assertEqual(workspace.HasBlob("testblob"), True) self.assertEqual(workspace.ResetWorkspace(), True) self.assertEqual(workspace.HasBlob("testblob"), False)
def test_small_sls(self, seed):
    """Minimal two-row SparseLengthsWeightedSum case comparing the
    Onnxifi-lowered fused 8-bit op against the FakeFP16NNPI reference;
    any relative deviation fails the test.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 2
    DIM = 3
    # Values in [4, 8) so quantization error is exercised.
    data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)

    # A single segment covering both rows, in order.
    lengths = np.array([n], dtype=np.int32)
    indices = np.array(range(n), dtype=np.int64)
    weights = np.random.uniform(low=0.01, high=0.5,
                                size=[n]).astype(np.float32)

    # Net under test (to be lowered through Onnxifi).
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Host-side fake-FP16 reference net.
    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Quantize the float table into the fused 8-bit rowwise layout.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                            ["quantized_data"]))
    quantized_data = workspace.FetchBlob("quantized_data")

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=1,
        max_seq_size=n,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # The whole pred net should be fused into a single Onnxifi op.
    num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                            for o in onnxified_net.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    # Relative difference; epsilon guards against division by zero.
    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if num_offenders > 0:
        np.set_printoptions(precision=12)
        print(
            "ref",
            Y_ref.astype(np.float16).astype(np.float32),
            "glow",
            Y_glow.astype(np.float16).astype(np.float32),
        )
        print_test_debug_info(
            "slws_fused_8bit_rowwise_inv_scale",
            {
                "seed": seed,
                "indices": indices,
                "data": data,
                "quantized_data": quantized_data,
                "lengths": lengths,
                "weights": weights,
                "Y_glow": Y_glow,
                "Y_ref": Y_ref,
                "diff": diff,
                "rowwise_diff": np.max(diff, axis=1),
            },
        )
        assert 0
def setUp(self): workspace.ResetWorkspace() self.net = core.Net("test-net") self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0) self.net.RunAllOnGPU()
def test_ckpt_name_and_load_model_from_ckpts(self):
    """Two-part checkpoint test: (1) checkpoint DB names are generated as
    '<tmpdir>/<node_name>.<epoch>'; (2) blobs can be reloaded from the
    checkpoints of completed epochs but not from a nonexistent epoch.
    """
    try:
        num_nodes = 3
        tmpdir = tempfile.mkdtemp()
        # First, check if the checkpoint name generation mechanism is
        # correct.
        checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
        with Cluster():
            with Job() as job:
                for node_id in range(num_nodes):
                    build_pipeline(node_id)
            job.compile(LocalSession)
            checkpoint.init(job.nodes_to_checkpoint())

            for node_id in range(num_nodes):
                epoch = 5
                node_name = 'trainer_%d' % node_id
                expected_db_name = tmpdir + '/' + node_name + '.5'
                self.assertEquals(
                    checkpoint.get_ckpt_db_name(node_name, epoch),
                    expected_db_name)
        shutil.rmtree(tmpdir)

        # Next, check mechanism to load model from checkpoints.
        # Train each node separately so checkpoints get written.
        tmpdir = tempfile.mkdtemp()
        workspace.ResetWorkspace()
        for node_id in range(num_nodes):
            ws = workspace.C.Workspace()
            session = LocalSession(ws)
            checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
            with Cluster():
                with Job() as job:
                    build_pipeline(node_id)
                job.compile(LocalSession)
                job_runner = JobRunner(job, checkpoint)
                num_epochs = job_runner.train(session)
            self.assertEquals(num_epochs, len(EXPECTED_TOTALS))

            # There are 12 global blobs after finishing up the job runner.
            # (only blobs on init_group are checkpointed)
            self.assertEquals(len(ws.blobs), 12)

        # Fresh, empty workspace/session to load the checkpoints into.
        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        self.assertEquals(len(ws.blobs), 0)
        model_blob_names = [
            'trainer_1/task_2/GivenTensorInt64Fill:0',
            'trainer_2/task_2/GivenTensorInt64Fill:0'
        ]
        checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
        with Cluster():
            with Job() as job:
                for node_id in range(num_nodes):
                    build_pipeline(node_id)
            job.compile(LocalSession)
            job_runner = JobRunner(job, checkpoint)
            job_runner.load_blobs_from_checkpoints(
                blob_names=model_blob_names, epoch=1, session=session)

            # Check that we can successfully load from checkpoints of
            # epochs 1 to 4, but not epoch 5.
            for epoch in range(1, 5):
                self.assertTrue(
                    job_runner.load_blobs_from_checkpoints(
                        blob_names=model_blob_names,
                        epoch=epoch,
                        session=session))
                # Check that all the model blobs are loaded.
                for blob_name in model_blob_names:
                    self.assertTrue(ws.has_blob(blob_name))
                    self.assertEquals(
                        ws.fetch_blob(blob_name),
                        np.array([EXPECTED_TOTALS[epoch - 1]]))
            self.assertFalse(
                job_runner.load_blobs_from_checkpoints(
                    blob_names=model_blob_names,
                    epoch=5,
                    session=session))
    finally:
        # NOTE(review): if an exception fires between the mid-test rmtree
        # and the second mkdtemp, this cleanup hits an already-removed
        # directory — confirm whether that window matters.
        shutil.rmtree(tmpdir)
def testRootFolder(self): self.assertEqual(workspace.ResetWorkspace(), True) self.assertEqual(workspace.RootFolder(), ".") self.assertEqual( workspace.ResetWorkspace("/tmp/caffe-workspace-test"), True) self.assertEqual(workspace.RootFolder(), "/tmp/caffe-workspace-test")
def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim,
                                 batch_size, max_weight):
    """Randomized comparison of SparseLengthsWeightedSumFused4BitRowwise
    (lowered through Onnxifi) against the FakeFP16NNPI reference op,
    using small-magnitude data and signed weights.
    """
    workspace.ResetWorkspace()
    np.random.seed(seed)
    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    # Scale down so 4-bit quantization error stays small.
    data = data * 1e-3

    lengths = np.random.choice(np.arange(1, num_rows),
                               batch_size).astype(np.int32)
    # One segment per batch element, each drawing `length` random rows.
    indices = []
    for length in lengths:
        indices.extend(np.random.choice(np.arange(1, num_rows), length))
    indices = np.asarray(indices).astype(np.int64)

    # Weights centered around zero in [-max_weight/2, max_weight/2).
    weights = np.random.uniform(
        low=0, high=max_weight,
        size=[len(indices)]).astype(
            np.float32) - max_weight / 2.0

    # Net under test (to be lowered through Onnxifi).
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused4BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Host-side fake-FP16 reference net.
    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    # Quantize the float table into the fused 4-bit rowwise layout.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused4BitRowwiseQuantized", ["data"],
                            ["quantized_data"]))

    pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                            {},
                                            max_batch_size=batch_size,
                                            max_seq_size=np.max(lengths),
                                            debug=True,
                                            adjust_batch=True,
                                            use_onnx=False)
    # The whole pred net should be fused into a single Onnxifi op.
    num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                            for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob('Y')

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob('Y')

    # On mismatch, dump shapes and per-row differences before failing.
    if not np.allclose(Y_c2, Y_glow):
        print_test_debug_info(
            "slws_fused_4bit_rowwise",
            {
                "seed": seed,
                "indices": indices,
                "data": data.shape,
                "lengths": lengths,
                "weights": weights,
                "Y_c2": Y_c2.shape,
                "Y_glow": Y_glow.shape,
                "diff": Y_glow - Y_c2,
                "rowwise_diff": (Y_glow - Y_c2)[:, 0]
            })
        assert (0)