def main(bit_rate):
    # uncomment for debugging
    # np.random.seed(0)
    batchsize = 10 * 1000
    blocksize = 64
    print(batchsize, blocksize)
    input_data = np.random.rand(batchsize, blocksize).astype(np.float32)

    workspace.FeedBlob("input_data", input_data)

    net = core.Net("bench")
    op = core.CreateOperator(
        "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized",
        "input_data",
        "quantized_data",
        engine="GREEDY",
    )
    net.Proto().op.extend([op])
    workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
    workspace.CreateNet(net)
    iterations = 10
    workspace.BenchmarkNet(net.Proto().name, 1, iterations, True)

    net2 = core.Net("bench2")
    op = core.CreateOperator(
        "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized",
        "input_data",
        "quantized_data",
    )
    net2.Proto().op.extend([op])
    workspace.CreateNet(net2)
    workspace.BenchmarkNet(net2.Proto().name, 1, iterations, True)
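# A minimal driver for main() above (not from the original source); the
# seed and the swept bit rates are illustrative assumptions. BenchmarkNet
# returns per-iteration times in milliseconds: element 0 is the whole net,
# and with run_individual=True the remaining elements are per-operator
# times, which is why later examples index runtime[1] and runtime[2].
if __name__ == "__main__":
    np.random.seed(0)  # optional: reproducible input data
    for bit_rate in (2, 4, 8):  # hypothetical bit widths to sweep
        main(bit_rate)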
def testReLUSpeed(self):
    X = np.random.randn(128, 4096).astype(np.float32)
    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    net = core.Net("test")
    # Makes sure that we can run relu.
    net.Relu("X", "Y")
    net.Relu("X_mkl", "Y_mkl", device_option=mkl_do)
    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(
        workspace.FetchBlob("Y"),
        workspace.FetchBlob("Y_mkl"),
        atol=1e-10,
        rtol=1e-10)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    # The returned runtime is the time of
    # [whole_net, cpu_op, mkl_op]
    # so we will assume that the MKL one runs faster than the CPU one.
    # Note(Yangqing): in fact, it seems that in optimized mode, this is
    # not always guaranteed - MKL runs slower than the Eigen vectorized
    # version, so I am turning this assertion off.
    # self.assertTrue(runtime[1] >= runtime[2])
    print("Relu CPU runtime {}, MKL runtime {}.".format(
        runtime[1], runtime[2]))
def runOpBenchmark(
    device_option,
    op,
    inputs,
    input_device_options=None,
    iterations=10,
):
    if input_device_options is None:
        input_device_options = {}
    op = copy.deepcopy(op)
    op.device_option.CopyFrom(device_option)
    net = caffe2_pb2.NetDef()
    net.op.extend([op])
    net.name = op.name if op.name else "test"

    with temp_workspace():
        for (n, b) in zip(op.input, inputs):
            workspace.FeedBlob(
                n,
                b,
                device_option=input_device_options.get(n, device_option)
            )
        workspace.CreateNet(net)
        ret = workspace.BenchmarkNet(net.name, 1, iterations, True)
    return ret
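# A hypothetical invocation of runOpBenchmark() above, timing a single
# CPU Relu; the shape and iteration count are arbitrary assumptions.
op = core.CreateOperator("Relu", ["X"], ["Y"])
X = np.random.randn(128, 4096).astype(np.float32)
times = runOpBenchmark(
    core.DeviceOption(caffe2_pb2.CPU), op, [X], iterations=50)
print("whole net: {:.4f} ms/iter, Relu op: {:.4f} ms/iter".format(
    times[0], times[1]))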
def testAveragePoolingSpeed(self):
    # We randomly select a shape to test the speed. Intentionally we
    # test a batch size of 1 since this may be the most frequent use
    # case for MKL during deployment time.
    X = np.random.rand(1, 64, 224, 224).astype(np.float32)
    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    net = core.Net("test")
    # Makes sure that we can run average pooling.
    net.AveragePool("X", "Y", stride=2, kernel=3)
    net.AveragePool("X_mkl", "Y_mkl", stride=2, kernel=3,
                    device_option=mkl_do)
    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(workspace.FetchBlob("Y"),
                               workspace.FetchBlob("Y_mkl"),
                               atol=1e-2,
                               rtol=1e-2)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    print("AveragePool CPU runtime {}, MKL runtime {}.".format(
        runtime[1], runtime[2]))
def testSpatialBNTrainingSpeed(self):
    input_channel = 10
    X = np.random.rand(1, input_channel, 100, 100).astype(np.float32) - 0.5
    scale = np.random.rand(input_channel).astype(np.float32) + 0.5
    bias = np.random.rand(input_channel).astype(np.float32) - 0.5
    mean = np.random.randn(input_channel).astype(np.float32)
    var = np.random.rand(input_channel).astype(np.float32) + 0.5
    # mean = np.zeros(input_channel)
    # var = np.zeros(input_channel)

    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("scale", scale)
    workspace.FeedBlob("bias", bias)
    workspace.FeedBlob("mean", mean)
    workspace.FeedBlob("var", var)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    workspace.FeedBlob("scale_mkl", scale, device_option=mkl_do)
    workspace.FeedBlob("bias_mkl", bias, device_option=mkl_do)
    workspace.FeedBlob("mean_mkl", mean, device_option=mkl_do)
    workspace.FeedBlob("var_mkl", var, device_option=mkl_do)
    net = core.Net("test")
    # Makes sure that we can run SpatialBN in training mode.
    net.SpatialBN(["X", "scale", "bias", "mean", "var"],
                  ["Y", "mean", "var", "saved_mean", "saved_var"],
                  order="NCHW",
                  is_test=False,
                  epsilon=1e-5)
    net.SpatialBN(["X_mkl", "scale_mkl", "bias_mkl", "mean_mkl", "var_mkl"],
                  ["Y_mkl", "mean_mkl", "var_mkl",
                   "saved_mean_mkl", "saved_var_mkl"],
                  order="NCHW",
                  is_test=False,
                  epsilon=1e-5,
                  device_option=mkl_do)
    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(
        workspace.FetchBlob("Y"),
        workspace.FetchBlob("Y_mkl"),
        atol=1e-2,
        rtol=1e-2)
    np.testing.assert_allclose(
        workspace.FetchBlob("mean"),
        workspace.FetchBlob("mean_mkl"),
        atol=1e-2,
        rtol=1e-2)
    np.testing.assert_allclose(
        workspace.FetchBlob("var"),
        workspace.FetchBlob("var_mkl"),
        atol=1e-2,
        rtol=1e-2)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    print("SpatialBN CPU runtime {}, MKL runtime {}.".format(
        runtime[1], runtime[2]))
def testFCSpeed(self):
    # We randomly select a shape to test the speed. Intentionally we
    # test a batch size of 1 since this may be the most frequent use
    # case for MKL during deployment time.
    X = np.random.rand(1, 256, 6, 6).astype(np.float32) - 0.5
    # X = np.random.rand(32, 256 * 6 * 6).astype(np.float32) - 0.5
    W = np.random.rand(4096, 9216).astype(np.float32) - 0.5
    b = np.random.rand(4096).astype(np.float32) - 0.5
    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("W", W)
    workspace.FeedBlob("b", b)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
    workspace.FeedBlob("b_mkl", b, device_option=mkl_do)
    net = core.Net("test")
    # Makes sure that we can run FC.
    net.FC(["X", "W", "b"], "Y")
    net.FC(["X_mkl", "W_mkl", "b_mkl"], "Y_mkl", device_option=mkl_do)
    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(workspace.FetchBlob("Y"),
                               workspace.FetchBlob("Y_mkl"),
                               atol=1e-2,
                               rtol=1e-2)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    print("FC CPU runtime {}, MKL runtime {}.".format(
        runtime[1], runtime[2]))
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order, arg.cudnn_ws)
    model.Proto().type = arg.net_type
    model.Proto().num_workers = arg.num_workers

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    if arg.model == "MLP":
        input_shape = [arg.batch_size, input_size]
    model.param_init_net.GaussianFill(
        [],
        "data",
        shape=input_shape,
        mean=0.0,
        std=1.0
    )
    model.param_init_net.UniformIntFill(
        [],
        "label",
        shape=[arg.batch_size, ],
        min=0,
        max=999
    )

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators(["loss"])
        AddParameterUpdate(model)
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.'
            )

    if not arg.cpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    if arg.engine:
        for op in model.net.Proto().op:
            op.engine = arg.engine

    if arg.dump_model:
        # Writes out the pbtxt for benchmarks on e.g. Android
        with open(
            "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w"
        ) as fid:
            fid.write(str(model.param_init_net.Proto()))
        with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
            fid.write(str(model.net.Proto()))

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.BenchmarkNet(
        model.net.Proto().name, arg.warmup_iterations, arg.iterations,
        arg.layer_wise_benchmark)
def Benchmark(args, model_map):
    arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'ws_nbytes_limit': args.cudnn_ws * 1024 * 1024,
    }
    model = model_helper.ModelHelper(name=args.model, arg_scope=arg_scope)

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    def add_image_input(model):
        AddNullInput(
            model,
            batch_size=batch_per_device,
            img_size=model_map[args.model][1],
            dtype=args.dtype,
        )

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=partial(model_map[args.model][0],
                                         dtype=args.dtype),
        optimizer_builder_fun=add_optimizer if not args.forward_only else None,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        optimize_gradient_memory=False,
        cpu_device=args.cpu,
        num_threads_per_device=args.num_workers_per_device,
    )

    if not args.forward_only:
        data_parallel_model.OptimizeGradientMemory(model, {}, set(), False)

    model.Proto().type = args.net_type

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    ms_per_iter = workspace.BenchmarkNet(
        model.net.Proto().name, args.warmup_iterations, args.iterations,
        args.layer_wise_benchmark)
    print("number of images/sec: {}".format(
        round(args.batch_size * 1000 / ms_per_iter[0], 2)))
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order)

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    model.param_init_net.GaussianFill(
        [],
        "data",
        shape=input_shape,
        mean=0.0,
        std=1.0
    )
    model.param_init_net.UniformIntFill(
        [],
        "label",
        shape=[arg.batch_size, ],
        min=0,
        max=999
    )

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators()
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.'
            )

    if not arg.cpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    if arg.dump_model:
        # Writes out the pbtxt for benchmarks on e.g. Android
        with open(
            "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w"
        ) as fid:
            fid.write(str(model.param_init_net.Proto()))
        with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
            fid.write(str(model.net.Proto()))

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    for i in range(arg.warmup_iterations):
        workspace.RunNet(model.net.Proto().name)

    start = time.time()
    for i in range(arg.iterations):
        workspace.RunNet(model.net.Proto().name)
    print('Spent: {}'.format((time.time() - start) / arg.iterations))
    if arg.layer_wise_benchmark:
        print('Layer-wise benchmark.')
        workspace.BenchmarkNet(model.net.Proto().name, 1, arg.iterations, True)
def benchmark_mul_gradient(args):
    workspace.FeedBlob("dC", np.random.rand(args.m, args.n).astype(np.float32))
    workspace.FeedBlob("A", np.random.rand(args.m, args.n).astype(np.float32))
    workspace.FeedBlob("B", np.random.rand(args.m).astype(np.float32))

    net = core.Net("mynet")
    net.MulGradient(["dC", "A", "B"], ["dA", "dB"], broadcast=True, axis=0)
    workspace.CreateNet(net)

    workspace.BenchmarkNet(net.Name(), 1, args.iteration, True)
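# Assumed CLI wiring for benchmark_mul_gradient() above; the flag names are
# guessed from the attributes it reads (args.m, args.n, args.iteration) and
# are not from the original source. Requires argparse to be imported.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MulGradient benchmark")
    parser.add_argument("--m", type=int, default=1024)
    parser.add_argument("--n", type=int, default=256)
    parser.add_argument("--iteration", type=int, default=100)
    args = parser.parse_args()
    workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
    benchmark_mul_gradient(args)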
def benchmark_concat(num_inputs, input_dim, axis, add_axis, iterations):
    input_names = [f"input{i}" for i in range(num_inputs)]
    for n in input_names:
        workspace.FeedBlob(n, np.random.randn(*input_dim).astype(np.float32))

    net = core.Net("benchmark_net")
    net.Concat(input_names, ["output", "split_info"],
               axis=axis, add_axis=add_axis)
    workspace.CreateNet(net)

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    print(f"{num_inputs * np.prod(input_dim) * 4 / runtimes[1] / 1e6} GB/s")
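# Example sweep with assumed shapes (not from the original source). The
# print above divides input bytes moved (4 bytes per float32 element) by
# the Concat op's time in ms; bytes / (ms * 1e6) comes out in GB/s, so
# the reported number is effective memory bandwidth.
workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
benchmark_concat(num_inputs=8, input_dim=[64, 1024], axis=1,
                 add_axis=0, iterations=1000)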
def benchmark_sparse_lengths_sum(
        dtype_str,
        categorical_limit,
        embedding_size,
        average_len,
        batch_size,
        iterations):
    print('Preparing lookup table. ' + str(datetime.datetime.now()))
    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    if dtype_str == 'uint8':
        scale_bias = np.random.rand(categorical_limit, 2).astype(np.float32)
        workspace.FeedBlob("scale_bias", scale_bias.astype(np.float32))
    elif dtype_str == 'uint8_fused':
        scale_bias = np.random.randint(255, size=(categorical_limit, 8))
        data = np.concatenate([data, scale_bias], axis=1)

    print('Data has shape {} {}'.format(data.shape, datetime.datetime.now()))
    workspace.FeedBlob("X", data.astype(DTYPES[dtype_str]))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(average_len * 0.75),
            int(average_len * 1.25),
            batch_size).astype(np.int32)
        indices = np.random.randint(
            0, categorical_limit, np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    net = core.Net("mynet")
    net.Python(f)([], ["indices", "lengths"])
    if dtype_str == "uint8":
        net.SparseLengthsSum8BitsRowwise(
            ["X", "indices", "lengths", "scale_bias"], "Y")
    elif dtype_str == "uint8_fused":
        net.SparseLengthsSumFused8BitRowwise(["X", "indices", "lengths"], "Y")
    else:
        net.SparseLengthsSum(["X", "indices", "lengths"], "Y")
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print('Preparation finished. ' + str(datetime.datetime.now()))
    workspace.BenchmarkNet(net.Name(), 1, iterations, True)
def testConvReluLRNSpeed(self):
    # We randomly select a shape to test the speed. Intentionally we
    # test a batch size of 1 since this may be the most frequent use
    # case for MKL during deployment time.
    X = np.random.rand(1, 3, 224, 224).astype(np.float32) - 0.5
    W = np.random.rand(64, 3, 11, 11).astype(np.float32) - 0.5
    b = np.random.rand(64).astype(np.float32) - 0.5

    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("W", W)
    workspace.FeedBlob("b", b)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
    workspace.FeedBlob("b_mkl", b, device_option=mkl_do)

    net = core.Net("test")

    net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=11)
    net.Conv(["X_mkl", "W_mkl", "b_mkl"], "C_mkl",
             pad=1, stride=1, kernel=11, device_option=mkl_do)
    net.Relu("C", "R")
    net.Relu("C_mkl", "R_mkl", device_option=mkl_do)
    net.LRN("R", ["Y", "Y_Scale"], size=5, alpha=0.001, beta=0.75,
            bias=2.0, order="NCHW")
    net.LRN("R_mkl", ["Y_mkl", "Y_Scale_mkl"], size=5, alpha=0.001,
            beta=0.75, bias=2.0, order="NCHW", device_option=mkl_do)

    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(workspace.FetchBlob("Y"),
                               workspace.FetchBlob("Y_mkl"),
                               atol=1e-2,
                               rtol=1e-2)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
def testConvReluMaxPoolFcSpeed(self):
    # We randomly select a shape to test the speed. Intentionally we
    # test a batch size of 1 since this may be the most frequent use
    # case for MKL during deployment time.
    X = np.random.rand(1, 256, 13, 13).astype(np.float32) - 0.5
    W = np.random.rand(256, 256, 3, 3).astype(np.float32) - 0.5
    b = np.random.rand(256).astype(np.float32) - 0.5
    w_fc = np.random.rand(4096, 9216).astype(np.float32) - 0.5
    b_fc = np.random.rand(4096).astype(np.float32) - 0.5

    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("W", W)
    workspace.FeedBlob("b", b)
    workspace.FeedBlob("w_fc", w_fc)
    workspace.FeedBlob("b_fc", b_fc)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
    workspace.FeedBlob("b_mkl", b, device_option=mkl_do)
    workspace.FeedBlob("w_fc_mkl", w_fc, device_option=mkl_do)
    workspace.FeedBlob("b_fc_mkl", b_fc, device_option=mkl_do)

    net = core.Net("test")

    net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=3)
    net.Relu("C", "R")
    net.MaxPool("R", "P", stride=2, kernel=3)
    net.FC(["P", "w_fc", "b_fc"], "Y")

    net.Conv(["X_mkl", "W_mkl", "b_mkl"], "C_mkl",
             pad=1, stride=1, kernel=3, device_option=mkl_do)
    net.Relu("C_mkl", "R_mkl", device_option=mkl_do)
    net.MaxPool("R_mkl", "P_mkl", stride=2, kernel=3,
                device_option=mkl_do)
    net.FC(["P_mkl", "w_fc_mkl", "b_fc_mkl"], "Y_mkl",
           device_option=mkl_do)

    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(workspace.FetchBlob("Y"),
                               workspace.FetchBlob("Y_mkl"),
                               atol=1e-2,
                               rtol=1e-2)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
def benchmark(args):
    print('Batch size: {}'.format(args.batch_size))
    mf = ModelDownloader()
    init_net, pred_net, value_info = mf.get_c2_model(args.model)
    input_shapes = {
        k: [args.batch_size] + v[-1][1:]
        for (k, v) in value_info.items()
    }
    print("input info: {}".format(input_shapes))
    external_inputs = {}
    for k, v in input_shapes.items():
        external_inputs[k] = np.random.randn(*v).astype(np.float32)

    if args.device == 'CPU':
        device_option = core.DeviceOption(caffe2_pb2.CPU)
    elif args.device == 'MKL':
        device_option = core.DeviceOption(caffe2_pb2.MKLDNN)
    elif args.device == 'IDEEP':
        device_option = core.DeviceOption(caffe2_pb2.IDEEP)
    else:
        raise Exception("Unknown device: {}".format(args.device))
    print("Device option: {}, {}".format(args.device, device_option))
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)

    # Hack to initialize weights into the MKL/IDEEP context
    workspace.RunNetOnce(init_net)
    bb = workspace.Blobs()
    weights = {}
    for b in bb:
        weights[b] = workspace.FetchBlob(b)
    for k, v in external_inputs.items():
        weights[k] = v
    workspace.ResetWorkspace()

    with core.DeviceScope(device_option):
        for name, blob in weights.items():
            # print("{}".format(name))
            workspace.FeedBlob(name, blob, device_option)
        workspace.CreateNet(pred_net)
        start = time.time()
        res = workspace.BenchmarkNet(pred_net.name,
                                     args.warmup_iterations,
                                     args.iterations,
                                     args.layer_wise_benchmark)
        print("FPS: {:.2f}".format(1 / res[0] * 1000 * args.batch_size))
def compare_fcs(B, M, N, num_runs, mapping_options=None):
    X = np.random.rand(B, M).astype(np.float32) - 0.5
    W = np.random.rand(N, M).astype(np.float32) - 0.5
    b = np.random.rand(N).astype(np.float32) - 0.5
    with core.DeviceScope(core.DeviceOption(1)):
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("b", b)
    net = core.Net("test")
    with core.DeviceScope(core.DeviceOption(1)):
        net.FC(["X", "W", "b"], "Y_baseline")
        net.TcOp(
            ["X", "W", "b"],
            "Y_TC",
            tc_def=FC_LANG,
            tc_name="func_fc",
            mapping_options=(mapping_options.serialize()
                             if mapping_options else None),
            check_sizes=True,
        )
    workspace.CreateNet(net)
    workspace.RunNet(net)

    baseline_value = workspace.blobs["Y_baseline"]
    tc_value = workspace.blobs["Y_TC"]
    np.testing.assert_allclose(
        baseline_value,
        tc_value,
        rtol=1e-4,
        atol=1e-4,
    )

    runtimes = workspace.BenchmarkNet(
        net.Name(),
        0,  # warmup was already done by RunNet above
        num_runs,
        True,  # run individual ops
    )[1:]
    print(runtimes)
def benchmark_sparse_lengths_sum(
        dtype_str,
        categorical_limit,
        embedding_size,
        average_len,
        batch_size,
        iterations):
    print('Preparing lookup table. ' + str(datetime.datetime.now()))
    # We will use a constant, but non-trivial value so we save initialization
    # time.
    arr = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    arr *= 17.01

    dtype_table = {
        'float': np.float32,
        'float16': np.float16
    }

    workspace.FeedBlob("X", arr.astype(dtype_table[dtype_str]))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(average_len * 0.75),
            int(average_len * 1.25),
            batch_size).astype(np.int32)
        indices = np.random.randint(
            0, categorical_limit, np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    net = core.Net("mynet")
    net.Python(f)([], ["indices", "lengths"])
    net.SparseLengthsSum(["X", "indices", "lengths"], "Y")
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print('Preparation finished. ' + str(datetime.datetime.now()))
    workspace.BenchmarkNet(net.Name(), 1, iterations, True)
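# A hypothetical driver for the float/float16 benchmark above; the table
# and batch sizes are illustrative, not from the original source.
workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
benchmark_sparse_lengths_sum(
    dtype_str="float",          # or "float16"
    categorical_limit=1000000,  # rows in the embedding table
    embedding_size=64,
    average_len=30,             # mean pooled segment length
    batch_size=512,
    iterations=100,
)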
def testReLUConsistencyWithCPU(self):
    X = np.random.randn(128, 4096).astype(np.float32)
    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    model = cnn.CNNModelHelper()
    # Makes sure that we can run relu.
    model.Relu("X", "Y")
    model.Relu("X_mkl", "Y_mkl", device_option=mkl_do)
    workspace.CreateNet(model.net)
    workspace.RunNet(model.net)
    # makes sure that the results are good.
    np.testing.assert_allclose(workspace.FetchBlob("Y"),
                               workspace.FetchBlob("Y_mkl"),
                               atol=1e-10,
                               rtol=1e-10)
    runtime = workspace.BenchmarkNet(model.net.Proto().name, 1, 10, True)
    # The returned runtime is the time of
    # [whole_net, cpu_op, mkl_op]
    # so we will assume that the MKL one runs faster than the CPU one.
    self.assertTrue(runtime[1] >= runtime[2])
    print("CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
def main():
    args = parser.parse_args()
    args.gpu_id = 0

    model = model_helper.ModelHelper(name="le_net", init_params=False)

    # Bring in the init net from init_net.pb
    init_net_proto = caffe2_pb2.NetDef()
    with open(args.c2_init, "rb") as f:
        init_net_proto.ParseFromString(f.read())
    model.param_init_net = core.Net(init_net_proto)
    # model.param_init_net.AppendNet(core.Net(init_net_proto))

    # Bring in the predict net from predict_net.pb
    predict_net_proto = caffe2_pb2.NetDef()
    with open(args.c2_predict, "rb") as f:
        predict_net_proto.ParseFromString(f.read())
    model.net = core.Net(predict_net_proto)
    # model.net.AppendNet(core.Net(predict_net_proto))

    # CUDA performance not impressive
    # device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id)
    # model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
    # model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)

    input_blob = model.net.external_inputs[0]
    model.param_init_net.GaussianFill(
        [],
        input_blob.GetUnscopedName(),
        shape=(args.batch_size, 3, args.img_size, args.img_size),
        mean=0.0,
        std=1.0)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net, overwrite=True)
    workspace.BenchmarkNet(model.net.Proto().name, 5, 20, True)
def testLRNSpeed(self):
    # We randomly select a shape to test the speed. Intentionally we
    # test a batch size of 1 since this may be the most frequent use
    # case for MKL during deployment time.
    X = np.random.rand(1, 2, 224, 224).astype(np.float32)
    mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
    # Makes sure that feed works.
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
    net = core.Net("test")
    # Makes sure that we can run LRN.
    net.LRN("X", ["Y", "Y_Scale"], size=5, alpha=0.001, beta=0.75,
            bias=2.0, order="NCHW")
    net.LRN("X_mkl", ["Y_mkl", "Y_Scale_mkl"], size=5, alpha=0.001,
            beta=0.75, bias=2.0, order="NCHW", device_option=mkl_do)
    workspace.CreateNet(net)
    workspace.RunNet(net)
    # makes sure that the results are good.
    np.testing.assert_allclose(workspace.FetchBlob("Y"),
                               workspace.FetchBlob("Y_mkl"),
                               atol=1e-2,
                               rtol=1e-2)
    runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    print("LRN CPU runtime {}, MKL runtime {}.".format(
        runtime[1], runtime[2]))
def print_benchmark(shape):
    print("==> Counting FLOPS")
    model = model_helper.ModelHelper(name="model", init_params=False)
    init_net_proto = caffe2_pb2.NetDef()
    with open('weights/model.init.pb', "rb") as f:
        init_net_proto.ParseFromString(f.read())
    model.param_init_net = core.Net(init_net_proto)
    predict_net_proto = caffe2_pb2.NetDef()
    with open('weights/model.predict.pb', "rb") as f:
        predict_net_proto.ParseFromString(f.read())
    model.net = core.Net(predict_net_proto)
    model.param_init_net.GaussianFill(
        [],
        model.net.external_inputs[0].GetUnscopedName(),
        shape=shape,
        mean=0.0,
        std=1.0)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.BenchmarkNet(model.net.Proto().name, 5, 100, True)
    print("==> Done")
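# An assumed call site for print_benchmark() above, using a common
# ImageNet-style NCHW input shape (illustrative, not from the original).
workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
print_benchmark(shape=[1, 3, 224, 224])  # batch 1, RGB, 224x224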
device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
with core.NameScope("imonaboat"):
    with core.DeviceScope(device_opt):
        data, label = AddInput(
            train_model, batch_size=1,
            db='/caffe/train', db_type='lmdb')
        softmax = AddLeNetModel(train_model, data)
print('Created training model.')

# The parameter initialization network only needs to be run once.
workspace.RunNetOnce(train_model.param_init_net)
# creating the network
workspace.CreateNet(train_model.net)
# set the number of iterations and track the accuracy & loss
total_iters = 20
start = time.time()
print('Start')
for i in range(total_iters):
    st = time.time()
    # workspace.RunNet(train_model.net.Proto().name)
    workspace.BenchmarkNet(train_model.net.Proto().name, 0, 1, True)
    e = time.time()
    print(i + 1, ': iteration time {}'.format(e - st))
end = time.time()
print('Time: {}'.format(end - start))
def network_eval(args):
    """ Runs network benchmarking on either a single or multiple nodes """
    # Define some parameters for the model instantiation
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            # 1048576 = 2 ^ 20 (1 MB)
            'ws_nbytes_limit': (args.cudnn_ws_lim * 1048576),
        }
    # Create the model for evaluation
    evaluation_model = model_helper.ModelHelper(
        name='evaluation_model', arg_scope=train_arg_scope)
    evaluation_model.Proto().num_workers = 16

    # Default the model for accuracy testing to None
    accuracy_time_model = None

    # Compute batch and epoch sizes
    # Per CPU / GPU batch size
    per_local_device_batch = (
        args.batch_size // len(args.gpu_devices)
    ) if args.gpu_devices else args.batch_size
    # Total batch size (over all the devices)
    global_batch_size = args.batch_size * args.num_shards
    # Number of epoch iterations
    epoch_iters = args.epoch_size // global_batch_size
    # Adjust the true number of examples per epoch
    args.epoch_size = global_batch_size * epoch_iters

    if args.training_data:
        log.info("Running experiments with user provided data: %s",
                 args.training_data)

        # Create a reader, which can also help distribute data when running
        # on multiple nodes
        reader = evaluation_model.CreateDB(
            "reader",
            db=args.training_data,
            db_type=args.db_type,
            num_shards=args.num_shards,
            shard_id=args.shard_id,
        )

        def image_input(model):
            AddImageInput(model, reader, per_local_device_batch,
                          min(args.height, args.width), args.data_type,
                          args.use_cpu)
    else:
        input_shape = [args.batch_size, args.channels, args.height,
                       args.width]
        log.info("Running experiments with synthetic data w/ shape: %s",
                 input_shape)

        def image_input(model):
            AddSyntheticInput(model, args.data_type, input_shape,
                              args.num_labels)

    # Create the network, and normalize the loss
    def create_model(model, loss_scale):
        initializer = (PseudoFP16Initializer
                       if args.data_type == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=False,
                            float16_compute=False):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.channels,
                num_labels=args.num_labels,
                # num_groups=args.resnext_num_groups,
                # num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True)

        # If we're using 2-byte floats, then inflate to the 4-byte
        # representation
        if args.data_type == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        # Compute the softmax probabilities and the loss
        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])

        # Normalize the loss, and compute the top_k accuracies for
        # k in {1, 5}
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        """
        Optimizer function called once for the entire model, as opposed to
        once for each CPU / GPU individually. The optimizer will be a
        stepwise weight decay.

        :return: return the optimizer
        """
        stepsz = int(30 * args.epoch_size / args.batch_size / args.num_shards)
        stepsz = stepsz if stepsz else 100

        optimizer.add_weight_decay(model, 1e-4)
        # opt = optimizer.build_multi_precision_sgd(
        opt = optimizer.build_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1)
        return opt

    def add_parameter_update(model):
        """
        Add a simple gradient based parameter update with stepwise adaptive
        learning rate.
        """
        # This counts the number of iterations we are making
        ITER = brew.iter(model, "iter")
        # Adds a LR to the model, updated using a simple step policy every
        # 10k steps; gamma is an update parameter
        LR = model.LearningRate(ITER, "LR",
                                base_lr=-args.base_learning_rate,
                                policy="step", stepsize=1000, gamma=0.999)
        # This is a constant used in the following loop
        ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1],
                                                value=1.0)
        # Here we are essentially applying the gradients to the weights
        # (using the classical method)
        for param in model.params:
            param_grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, param_grad, LR], param)

    def add_post_sync_ops(model):
        """ Add ops applied after initial parameter sync. """
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    if args.num_shards > 1:
        log.info("Distributed benchmarking is enabled")
        log.info("Num shards: %d", args.num_shards)
        log.info("My shard ID: %d", args.shard_id)
        if args.redis_host:
            log.info("Using Redis server at %s:%d",
                     args.redis_host, args.redis_port)
        else:
            log.info("Rendezvous at: %s", args.rendezvous_path)

        # Prepare the required parameters for distribution
        store_handler = "store_handler"

        # We'll use the shared file system for rendezvous
        if args.redis_host:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.rendezvous_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=args.shard_id,
                          num_shards=args.num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=args.network_interface,
                          exit_nets=None)

        # Parallelize the model (data parallel)
        data_parallel_model.Parallelize(
            evaluation_model,
            input_builder_fun=image_input,
            forward_pass_builder_fun=create_model,
            optimizer_builder_fun=None if not args.backward else (
                add_optimizer if not args.per_device_optimization else None),
            param_update_builder_fun=None if not args.backward else (
                add_parameter_update if args.per_device_optimization
                else None),
            post_sync_builder_fun=add_post_sync_ops if args.post_sync
            else None,
            devices=(args.gpu_devices if not args.use_cpu else [0]),
            rendezvous=rendezvous,
            # Although this is a parameter (broadcast params) of this
            # function, it is currently not implemented in Caffe2's source
            # code
            broadcast_computed_params=args.broadcast_params,
            optimize_gradient_memory=args.optimize_gradient_memory,
            dynamic_memory_management=args.dynamic_memory_management,
            max_concurrent_distributed_ops=args.max_distributed_ops,
            num_threads_per_device=args.max_threads,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
            ideep=args.use_ideep,
            shared_model=args.shared_model,
            combine_spatial_bn=args.use_cpu,
        )

        if args.backward:
            data_parallel_model.OptimizeGradientMemory(
                evaluation_model, {}, set(), False)

        instantiate_and_create_net(evaluation_model)

        # If we're testing for the time it takes to reach a particular
        # accuracy, then we'll need to create a new model just for this
        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, \
                "We must have testing data if we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model',
                arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when
            # running on multiple nodes
            test_reader = accuracy_time_model.CreateDB(
                "test_reader", db=args.testing_data, db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model, test_reader, per_local_device_batch,
                              min(args.height, args.width), args.data_type,
                              args.use_cpu, is_test=True)

            # Create the test model per se
            data_parallel_model.Parallelize(
                accuracy_time_model,
                input_builder_fun=test_image_input,
                forward_pass_builder_fun=create_model,
                post_sync_builder_fun=add_post_sync_ops if args.post_sync
                else None,
                param_update_builder_fun=None,
                devices=(args.gpu_devices if not args.use_cpu else [0]),
                cpu_device=args.use_cpu)

            instantiate_and_create_net(accuracy_time_model)
    else:
        print("Single node benchmarking is enabled")

        # Build the training model
        if args.use_cpu:
            image_input(evaluation_model)
            create_model(evaluation_model, 1.0)
            if args.backward:
                evaluation_model.AddGradientOperators(["loss"])
                add_parameter_update(evaluation_model)
        else:
            # We're running this on a single GPU on a single node, so create
            # the net under the GPU's device scope
            with core.DeviceScope(
                    core.DeviceOption(caffe2_pb2.CUDA, args.gpu_devices[0])):
                image_input(evaluation_model)
                create_model(evaluation_model, 1.0)
                if args.backward:
                    evaluation_model.AddGradientOperators(["loss"])
                    add_parameter_update(evaluation_model)

        instantiate_and_create_net(evaluation_model)

        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, \
                "We must have testing data if we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model',
                arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when
            # running on multiple nodes
            test_reader = accuracy_time_model.CreateDB(
                "test_reader", db=args.testing_data, db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model, test_reader, per_local_device_batch,
                              min(args.height, args.width), args.data_type,
                              args.use_cpu, is_test=True)

            # Create the test model per se
            test_image_input(accuracy_time_model)
            create_model(accuracy_time_model, 1.0)
            instantiate_and_create_net(accuracy_time_model)

    if not args.test_accuracy:
        workspace.BenchmarkNet(evaluation_model.net.Proto().name,
                               args.warmup_rounds, args.eval_rounds,
                               args.per_layer_eval)
    else:
        # Create a log for time to accuracy testing
        expname = "time_to_acc_model_%s_gpu%d_b%d_L%d_lr%.2f_shard%d" % (
            args.model_name,
            len(args.gpu_devices) if not args.use_cpu else 1,
            args.batch_size,
            args.num_labels,
            args.base_learning_rate,
            args.shard_id)

        explog = experiment_util.ModelTrainerLog(expname, args)

        # Run the epochs
        elapsed_training_time = 0.0
        for i in range(args.epoch_count):
            elapsed_training_time, on_target = RunEpoch(
                args, i, evaluation_model, accuracy_time_model, explog,
                elapsed_training_time)

            if args.terminate_on_target and on_target:
                log.info("Have reached the target accuracy: {} in {} seconds."
                         .format(args.target_accuracy,
                                 elapsed_training_time))
                break
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.device)
    model.Proto().type = arg.net_type
    model.Proto().num_workers = arg.num_workers

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    if arg.model == "MLP":
        input_shape = [arg.batch_size, input_size]
    model.param_init_net.GaussianFill(
        [],
        "data",
        shape=input_shape,
        mean=0.0,
        std=1.0
    )

    # IDEEP/MKL doesn't support int, so have to use numpy
    if arg.device == 'MKL' or arg.device == 'IDEEP':
        label = np.random.randint(
            low=0, high=1000, size=(arg.batch_size, )).astype(np.int32)
        workspace.FeedBlob("label", label)
    else:
        model.param_init_net.UniformIntFill(
            [],
            "label",
            shape=[arg.batch_size, ],
            min=0,
            max=999
        )

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    elif arg.device == 'MKL':
        raise Exception('forward-backward not supported yet in MKL')
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators(["loss"])
        AddParameterUpdate(model)
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.'
            )

    if arg.device == 'IDEEP':
        model.param_init_net.RunAllOnIDEEP()
        model.net.RunAllOnIDEEP()
    elif arg.device == 'MKL':
        model.param_init_net.RunAllOnMKL()
        model.net.RunAllOnMKL()
    elif arg.device == 'CUDA':
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()
    print('Running on device: {}'.format(arg.device))

    if arg.engine:
        for op in model.net.Proto().op:
            op.engine = arg.engine

    if arg.dump_model:
        # Writes out the pbtxt for benchmarks on e.g. Android
        with open(
            "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w"
        ) as fid:
            fid.write(str(model.param_init_net.Proto()))
        with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
            fid.write(str(model.net.Proto()))

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    results = workspace.BenchmarkNet(
        model.net.Proto().name,
        arg.warmup_iterations,
        arg.iterations,
        arg.layer_wise_benchmark)
    print('FPS: {}'.format(arg.batch_size * 1000 / results[0]))
    # i = init_net.Cast([i], to=itype)
    i = workspace.FeedBlob("ind", np.load('ind.npy'))
    l = init_net.ConstantFill(
        [],
        shape=[isize // args.pooling],
        value=args.pooling,
        dtype=core.DataType.INT32,
    )
    # net.SparseLengthsSum([d, i, l], name=name, engine=engine)
    net.SparseLengthsSum(["weights", "ind", l], name=name, engine=engine)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--table-size', type=int, default=10**7,
                        help='embedding table size')
    parser.add_argument('--batch-size', type=int, default=1024,
                        help='batch size')
    parser.add_argument('--pooling', type=int, default=20,
                        help='pooling')
    parser.add_argument('--column', type=int, default=64,
                        help='number of columns in the embedding table')
    args, extra_args = parser.parse_known_args()

    benchSparseSegmentSum()

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'] + extra_args)
    workspace.RunNetOnce(init_net)
    workspace.CreateNet(net)
    workspace.BenchmarkNet(net.Proto().name, 100, 10000, True)
def benchmark_sparse_lengths_sum(
    categorical_limit,
    embedding_size,
    average_len,
    batch_size,
    iterations,
    flush_cache,
    bit_rate=st.sampled_from([2, 4]),
):
    print("Preparing lookup table. " + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    init_net = core.Net("init_net")
    op = core.CreateOperator(
        "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", "X", "X_q")
    init_net.Proto().op.extend([op])
    workspace.FeedBlob("X", data)

    print("Data has shape {} {}".format(data.shape, datetime.datetime.now()))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(average_len * 0.75),
            int(average_len * 1.25),
            batch_size).astype(np.int32)
        indices = np.random.randint(
            0, categorical_limit, np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    init_net.Python(f)([], ["indices", "lengths"])
    workspace.RunNetOnce(init_net)

    net = core.Net("mynet")
    if flush_cache:
        l3_cache_size = 30 * 2**20 // 4
        workspace.FeedBlob(
            "huge_blob", np.random.randn(l3_cache_size).astype(np.float32))
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)
    op = core.CreateOperator(
        "SparseLengthsSumFused" + str(bit_rate) + "BitRowwise",
        ["X_q", "indices", "lengths"],
        "Y",
    )
    net.Proto().op.extend([op])
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print("Preparation finished. " + str(datetime.datetime.now()))

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    print("{} billion sums per sec".format(
        embedding_size * workspace.FetchBlob("indices").size
        / runtimes[2 if flush_cache else 1]
        / 1e6))
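# An assumed invocation of the n-bit benchmark above (not from the original
# source). bit_rate should be passed explicitly: the st.sampled_from default
# is a hypothesis strategy, which is only meaningful under a @given decorator.
workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
benchmark_sparse_lengths_sum(
    categorical_limit=1000000,
    embedding_size=64,
    average_len=30,
    batch_size=512,
    iterations=100,
    flush_cache=True,  # time the op with a cold L3 cache
    bit_rate=4,        # 2- and 4-bit fused rowwise kernels exist
)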
def benchmark_sparse_normalize(
    categorical_limit,
    embedding_size,
    average_len,
    batch_size,
    iterations,
    flush_cache,
    fp16,
):
    print("Preparing lookup table. " + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    init_net = core.Net("init_net")
    if fp16:
        op = core.CreateOperator("FloatToHalf", "X", "X_fp16")
        init_net.Proto().op.extend([op])
    l3_cache_size = 30 * 2**20 // 4

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(average_len * 0.75),
            int(average_len * 1.25),
            batch_size).astype(np.int32)
        indices = np.random.randint(
            0, categorical_limit, np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)

    workspace.FeedBlob("X", data)
    workspace.FeedBlob(
        "huge_blob", np.random.randn(l3_cache_size).astype(np.float32))

    print("Data has shape {} {}".format(data.shape, datetime.datetime.now()))

    init_net.Python(f)([], ["indices"])
    workspace.RunNetOnce(init_net)

    net = core.Net("mynet")
    # The cache flush must run before the measured op, so that the normalize
    # op lands in slot 2 of the BenchmarkNet results (as indexed below).
    if flush_cache:
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)
    op = core.CreateOperator(
        "Float16SparseNormalize" if fp16 else "SparseNormalize",
        ["X_fp16", "indices"] if fp16 else ["X", "indices"],
        "X_fp16" if fp16 else "X",
    )
    net.Proto().external_input.append("X")
    net.Proto().external_input.append("X_fp16")
    net.Proto().external_input.append("indices")
    net.Proto().op.extend([op])
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print("Preparation finished. " + str(datetime.datetime.now()))

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    print("{} ms".format(runtimes[2 if flush_cache else 1]))
    print("indices size: " + str(workspace.FetchBlob("indices").size))
    print("{} GB/sec".format(
        (2 if fp16 else 4) * embedding_size
        * workspace.FetchBlob("indices").size
        / runtimes[2 if flush_cache else 1]
        / 1e6))
def compare_fcs(M, K, N, args):
    X = np.random.rand(M, K).astype(np.float32) - 0.5
    W = np.random.rand(N, K).astype(np.float32) - 0.5
    b = np.random.rand(N).astype(np.float32) - 0.5

    def fc(X, W, b):
        return np.dot(X, np.transpose(W)) + b

    ground = np.array(fc(X, W, b))
    Y_scale = (ground.max() - ground.min()) / 255
    print("min ", ground.min(), " max ", ground.max(), " scale ", Y_scale)
    print("l3_cache_size ", args.l3_cache_size * 4 / 2 ** 20, "MB")

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("W", W)
    workspace.FeedBlob("WT", W.T)
    workspace.FeedBlob("b", b)
    workspace.FeedBlob(
        "huge_blob", np.random.randn(args.l3_cache_size).astype(np.float32)
    )

    net = core.Net("test")

    net.FC(["X", "W", "b"], "Y_default")
    net.FCTransposed(["X", "WT", "b"], "Y_pretranspose")

    if args.quantize_input:
        quantize_X = core.CreateOperator(
            "Quantize", ["X"], ["X_q"], engine="DNNLOWP")
        net.Proto().op.extend([quantize_X])
        quantize_W = core.CreateOperator(
            "Quantize", ["W"], ["W_q"], engine="DNNLOWP")
        net.Proto().op.extend([quantize_W])

    fc_i8_rowwise = core.CreateOperator(
        "Int8FCRowWise",
        ["X_q", "W", "b"] if args.quantize_input else ["X", "W", "b"],
        "Y_rowwise_dnnlowp",
        dequantize_output=0 if args.quantize_output else 1,
        Y_scale=Y_scale,
        Y_zero_point=0,
        engine="DNNLOWP",
    )
    net.Proto().op.extend([fc_i8_rowwise])

    fc_i8 = core.CreateOperator(
        "Int8FC",
        ["X_q", "W_q", "b"] if args.quantize_input else ["X", "W", "b"],
        "Y_dnnlowp",
        dequantize_output=0 if args.quantize_output else 1,
        Y_scale=Y_scale,
        Y_zero_point=0,
        engine="DNNLOWP",
    )
    net.Proto().op.extend([fc_i8])

    pack_w = core.CreateOperator("FbGemmPack", ["W"], "W_packed")
    net.Proto().op.extend([pack_w])
    fc_fp16 = core.CreateOperator(
        "FbFCPacked", ["X", "W_packed", "b"], ["Y_fp16"])
    net.Proto().op.extend([fc_fp16])

    # Rebuild the op list, interleaving a cache-wiping Scale after each op.
    ops = [op for op in net.Proto().op]
    del net.Proto().op[:]
    for op in ops:
        net.Proto().op.extend([op])
        # wipe caches
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)

    workspace.CreateNet(net)
    workspace.RunNet(net)

    # makes sure that results are good.
    outputs = [op.output[0] for op in net.Proto().op if "FC" in op.type]
    for Y in outputs:
        if "i8" in Y or "fp16" in Y or "dnnlowp" in Y:
            continue
        np.testing.assert_allclose(
            workspace.FetchBlob(outputs[0]),
            workspace.FetchBlob(Y),
            atol=1e-2,
            rtol=1e-2,
        )

    runtimes = workspace.BenchmarkNet(
        net.Name(),
        1,  # warmup
        args.num_runs,
        True,  # run individual ops
    )[1:]
    results = {
        op.output[0]: runtime
        for op, runtime in zip(net.Proto().op, runtimes)
        if "FC" in op.type
    }

    def get_gflops(time, m, k, n):
        # 2*m*n*k FLOPs per FC; time is in ms, so this yields GFLOP/s
        # rounded to one decimal place.
        return round(m * n * k * 2 / time / 10 ** 6 * 10) / 10

    results = [
        (out, time, "{} GFLOPS".format(get_gflops(time, M, K, N)))
        for out, time in results.items()
    ]
    # results = sorted(results, key=operator.itemgetter(1))
    print("input shape M, N, K: {} {} {}".format(M, N, K))
    for output, time, flops in results:
        print("{}: {:.4f} ms {}".format(output, time, flops))
def benchmark_sparse_lengths_sum(
    dtype_str,
    categorical_limit,
    embedding_size,
    average_len,
    batch_size,
    iterations,
    flush_cache,
):
    print("Preparing lookup table. " + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    if dtype_str == "uint8":
        scale_bias = np.random.rand(categorical_limit, 2).astype(np.float32)
        workspace.FeedBlob("scale_bias", scale_bias.astype(np.float32))
    elif dtype_str == "uint8_fused":
        scale_bias = np.random.randint(255, size=(categorical_limit, 8))
        data = np.concatenate([data, scale_bias], axis=1)

    print("Data has shape {} {}".format(data.shape, datetime.datetime.now()))
    workspace.FeedBlob("X", data.astype(DTYPES[dtype_str]))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(np.round(average_len * 0.75)),
            int(np.round(average_len * 1.25)) + 1,
            batch_size,
        ).astype(np.int32)
        indices = np.random.randint(
            0, categorical_limit, np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    init_net = core.Net("init_net")
    init_net.Python(f)([], ["indices", "lengths"])
    workspace.RunNetOnce(init_net)

    net = core.Net("mynet")
    if flush_cache:
        l3_cache_size = 30 * 2**20 // 4
        workspace.FeedBlob(
            "huge_blob", np.random.randn(l3_cache_size).astype(np.float32))
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)
    if dtype_str == "uint8":
        net.SparseLengthsSum8BitsRowwise(
            ["X", "indices", "lengths", "scale_bias"], "Y")
    elif dtype_str == "uint8_fused":
        net.SparseLengthsSumFused8BitRowwise(["X", "indices", "lengths"], "Y")
    else:
        net.SparseLengthsSum(["X", "indices", "lengths"], "Y")
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print("Preparation finished. " + str(datetime.datetime.now()))
    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    # runtimes[0] is the whole net; with flush_cache the Scale op occupies
    # slot 1, so the SparseLengthsSum op's time is at index 2.
    print("{} billion sums per sec".format(
        embedding_size * workspace.FetchBlob("indices").size
        / runtimes[2 if flush_cache else 1]
        / 1e6))
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order)

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    model.param_init_net.GaussianFill(
        [],
        "data",
        shape=input_shape,
        mean=0.0,
        std=1.0
    )
    model.param_init_net.UniformIntFill(
        [],
        "label",
        shape=[arg.batch_size, ],
        min=0,
        max=999
    )

    # Note: even when we are running things on CPU, adding a few engine
    # related arguments will not hurt since the CPU operator registry will
    # simply ignore these options and go the default path.
    for op in model.net.Proto().op:
        if op.type == 'Conv' or op.type == 'ConvFp16':
            op.engine = 'CUDNN'
            # op.arg.add().CopyFrom(
            #     utils.MakeArgument('ws_nbytes_limit', arg.cudnn_limit))
            op.arg.add().CopyFrom(utils.MakeArgument('exhaustive_search', 1))
            op.arg.add().CopyFrom(
                utils.MakeArgument('shared_ws_name', 'cudnn_workspace'))
        elif op.type in [
                'MaxPool', 'MaxPoolFp16', 'AveragePool', 'AveragePoolFp16',
                'Relu', 'ReluFp16', 'Softmax', 'SoftmaxFp16']:
            op.engine = 'CUDNN'

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators()
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.'
            )

    if not arg.cpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    for i in range(arg.warmup_iterations):
        workspace.RunNet(model.net.Proto().name)

    start = time.time()
    for i in range(arg.iterations):
        workspace.RunNet(model.net.Proto().name)
    print('Spent: {}'.format((time.time() - start) / arg.iterations))

    if arg.layer_wise_benchmark:
        print('Layer-wise benchmark.')
        workspace.BenchmarkNet(model.net.Proto().name, 1, arg.iterations, True)

    # Writes out the pbtxt for benchmarks on e.g. Android
    with open(
        "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w"
    ) as fid:
        fid.write(str(model.param_init_net.Proto()))
    with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
        fid.write(str(model.net.Proto()))