def main(bit_rate):
    # uncomment for debugging
    # np.random.seed(0)
    batchsize = 10 * 1000
    blocksize = 64
    print(batchsize, blocksize)
    input_data = np.random.rand(batchsize, blocksize).astype(np.float32)

    workspace.FeedBlob("input_data", input_data)

    net = core.Net("bench")
    op = core.CreateOperator(
        "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized",
        "input_data",
        "quantized_data",
        engine="GREEDY",
    )
    net.Proto().op.extend([op])
    workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
    workspace.CreateNet(net)
    iterations = 10
    workspace.BenchmarkNet(net.Proto().name, 1, iterations, True)

    net2 = core.Net("bench2")
    op = core.CreateOperator(
        "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized",
        "input_data",
        "quantized_data",
    )
    net2.Proto().op.extend([op])

    workspace.CreateNet(net2)
    workspace.BenchmarkNet(net2.Proto().name, 1, iterations, True)
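# Hypothetical driver, not part of the original snippet: the operator name is
# assembled as "FloatToFused<bit_rate>BitRowwiseQuantized", and 2-, 4-, and
# 8-bit variants exist in Caffe2, so a plausible invocation is:
if __name__ == "__main__":
    main(4)  # benchmarks FloatToFused4BitRowwiseQuantized with and without GREEDY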
    def testReLUSpeed(self):
        X = np.random.randn(128, 4096).astype(np.float32)
        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        net = core.Net("test")
        # Makes sure that we can run relu.
        net.Relu("X", "Y")
        net.Relu("X_mkl", "Y_mkl", device_option=mkl_do)
        workspace.CreateNet(net)
        workspace.RunNet(net)
        # makes sure that the results are good.
        np.testing.assert_allclose(
            workspace.FetchBlob("Y"),
            workspace.FetchBlob("Y_mkl"),
            atol=1e-10,
            rtol=1e-10)
        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)

        # The returned runtime is the time of
        # [whole_net, cpu_op, mkl_op]
        # so we will assume that the MKL one runs faster than the CPU one.

        # Note(Yangqing): in fact, it seems that in optimized mode, this is
        # not always guaranteed - MKL runs slower than the Eigen vectorized
        # version, so I am turning this assertion off.
        #self.assertTrue(runtime[1] >= runtime[2])

        print("Relu CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
def runOpBenchmark(
    device_option,
    op,
    inputs,
    input_device_options=None,
    iterations=10,
):
    if input_device_options is None:
        input_device_options = {}
    op = copy.deepcopy(op)
    op.device_option.CopyFrom(device_option)
    net = caffe2_pb2.NetDef()
    net.op.extend([op])
    net.name = op.name if op.name else "test"

    with temp_workspace():
        for (n, b) in zip(op.input, inputs):
            workspace.FeedBlob(
                n,
                b,
                device_option=input_device_options.get(n, device_option)
            )
        workspace.CreateNet(net)
        ret = workspace.BenchmarkNet(net.name, 1, iterations, True)
    return ret
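# A minimal usage sketch (assumed, not from the original source): benchmark a
# single CPU Relu with runOpBenchmark; the shapes and iteration count are
# illustrative only.
relu_op = core.CreateOperator("Relu", ["X"], ["Y"])
relu_times = runOpBenchmark(
    core.DeviceOption(caffe2_pb2.CPU),
    relu_op,
    [np.random.rand(128, 4096).astype(np.float32)],
    iterations=20,
)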
    def testAveragePoolingSpeed(self):
        # We randomly select a shape to test the speed. Intentionally we
        # test a batch size of 1 since this may be the most frequent use
        # case for MKL during deployment time.
        X = np.random.rand(1, 64, 224, 224).astype(np.float32)
        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        net = core.Net("test")
        # Makes sure that we can run average pooling.
        net.AveragePool("X", "Y", stride=2, kernel=3)
        net.AveragePool("X_mkl",
                        "Y_mkl",
                        stride=2,
                        kernel=3,
                        device_option=mkl_do)
        workspace.CreateNet(net)
        workspace.RunNet(net)
        # makes sure that the results are good.
        np.testing.assert_allclose(workspace.FetchBlob("Y"),
                                   workspace.FetchBlob("Y_mkl"),
                                   atol=1e-2,
                                   rtol=1e-2)
        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)

        print("Averagepooling CPU runtime {}, MKL runtime {}.".format(
            runtime[1], runtime[2]))
    def testSpatialBNTrainingSpeed(self):
        input_channel = 10
        X = np.random.rand(1, input_channel, 100, 100).astype(np.float32) - 0.5
        scale = np.random.rand(input_channel).astype(np.float32) + 0.5
        bias = np.random.rand(input_channel).astype(np.float32) - 0.5
        mean = np.random.randn(input_channel).astype(np.float32)
        var = np.random.rand(input_channel).astype(np.float32) + 0.5

        #mean = np.zeros(input_channel)
        #var = np.zeros(input_channel)

        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("scale", scale)
        workspace.FeedBlob("bias", bias)
        workspace.FeedBlob("mean", mean)
        workspace.FeedBlob("var", var)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        workspace.FeedBlob("scale_mkl", scale, device_option=mkl_do)
        workspace.FeedBlob("bias_mkl", bias, device_option=mkl_do)
        workspace.FeedBlob("mean_mkl", mean, device_option=mkl_do)
        workspace.FeedBlob("var_mkl", var, device_option=mkl_do)
        net = core.Net("test")
        # Makes sure that we can run SpatialBN.
        net.SpatialBN(["X", "scale", "bias", "mean", "var"],
            ["Y", "mean", "var", "saved_mean", "saved_var"],
            order="NCHW",
            is_test=False,
            epsilon=1e-5)
        net.SpatialBN(["X_mkl", "scale_mkl", "bias_mkl","mean_mkl","var_mkl"],
            ["Y_mkl", "mean_mkl", "var_mkl", "saved_mean_mkl", "saved_var_mkl"],
            order="NCHW",
            is_test=False,
            epsilon=1e-5,
            device_option=mkl_do)

        workspace.CreateNet(net)
        workspace.RunNet(net)

        # makes sure that the results are good.
        np.testing.assert_allclose(
            workspace.FetchBlob("Y"),
            workspace.FetchBlob("Y_mkl"),
            atol=1e-2,
            rtol=1e-2)
        np.testing.assert_allclose(
            workspace.FetchBlob("mean"),
            workspace.FetchBlob("mean_mkl"),
            atol=1e-2,
            rtol=1e-2)
        np.testing.assert_allclose(
            workspace.FetchBlob("var"),
            workspace.FetchBlob("var_mkl"),
            atol=1e-2,
            rtol=1e-2)

        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)

        print("FC CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
    def testFCSpeed(self):
        # We randomly select a shape to test the speed. Intentionally we
        # test a batch size of 1 since this may be the most frequent use
        # case for MKL during deployment time.
        X = np.random.rand(1, 256, 6, 6).astype(np.float32) - 0.5
        #X = np.random.rand(32, 256*6*6).astype(np.float32) - 0.5
        W = np.random.rand(4096, 9216).astype(np.float32) - 0.5
        b = np.random.rand(4096).astype(np.float32) - 0.5
        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("b", b)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
        workspace.FeedBlob("b_mkl", b, device_option=mkl_do)
        net = core.Net("test")
        # Makes sure that we can run FC.
        net.FC(["X", "W", "b"], "Y")
        net.FC(["X_mkl", "W_mkl", "b_mkl"], "Y_mkl", device_option=mkl_do)

        workspace.CreateNet(net)
        workspace.RunNet(net)
        # makes sure that the results are good.
        np.testing.assert_allclose(workspace.FetchBlob("Y"),
                                   workspace.FetchBlob("Y_mkl"),
                                   atol=1e-2,
                                   rtol=1e-2)
        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)

        print("FC CPU runtime {}, MKL runtime {}.".format(
            runtime[1], runtime[2]))
Example #7
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order, arg.cudnn_ws)
    model.Proto().type = arg.net_type
    model.Proto().num_workers = arg.num_workers

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    if arg.model == "MLP":
        input_shape = [arg.batch_size, input_size]

    model.param_init_net.GaussianFill([],
                                      "data",
                                      shape=input_shape,
                                      mean=0.0,
                                      std=1.0)
    model.param_init_net.UniformIntFill([],
                                        "label",
                                        shape=[
                                            arg.batch_size,
                                        ],
                                        min=0,
                                        max=999)

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators(["loss"])
        AddParameterUpdate(model)
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.')

    if not arg.cpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    if arg.engine:
        for op in model.net.Proto().op:
            op.engine = arg.engine

    if arg.dump_model:
        # Writes out the pbtxt for benchmarks on e.g. Android
        with open("{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size),
                  "w") as fid:
            fid.write(str(model.param_init_net.Proto()))
        with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
            fid.write(str(model.net.Proto()))

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.BenchmarkNet(model.net.Proto().name, arg.warmup_iterations,
                           arg.iterations, arg.layer_wise_benchmark)
def Benchmark(args, model_map):

    arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'ws_nbytes_limit': args.cudnn_ws * 1024 * 1024,
    }
    model = model_helper.ModelHelper(name=args.model, arg_scope=arg_scope)

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    def add_image_input(model):
        AddNullInput(
            model,
            batch_size=batch_per_device,
            img_size=model_map[args.model][1],
            dtype=args.dtype,
        )

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=partial(model_map[args.model][0],
                                         dtype=args.dtype),
        optimizer_builder_fun=add_optimizer if not args.forward_only else None,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        optimize_gradient_memory=False,
        cpu_device=args.cpu,
        num_threads_per_device=args.num_workers_per_device,
    )

    if not args.forward_only:
        data_parallel_model.OptimizeGradientMemory(model, {}, set(), False)

    model.Proto().type = args.net_type

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    ms_per_iter = workspace.BenchmarkNet(model.net.Proto().name,
                                         args.warmup_iterations,
                                         args.iterations,
                                         args.layer_wise_benchmark)
    print("number of images/sec: {}".format(
        round(args.batch_size * 1000 / ms_per_iter[0], 2)))
Example #9
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order)

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    model.param_init_net.GaussianFill([],
                                      "data",
                                      shape=input_shape,
                                      mean=0.0,
                                      std=1.0)
    model.param_init_net.UniformIntFill([],
                                        "label",
                                        shape=[
                                            arg.batch_size,
                                        ],
                                        min=0,
                                        max=999)

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators(["loss"])
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.')

    if not arg.cpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    if arg.dump_model:
        # Writes out the pbtxt for benchmarks on e.g. Android
        with open("{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size),
                  "w") as fid:
            fid.write(str(model.param_init_net.Proto()))
        with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
            fid.write(str(model.net.Proto()))

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    for i in range(arg.warmup_iterations):
        workspace.RunNet(model.net.Proto().name)

    start = time.time()
    for i in range(arg.iterations):
        workspace.RunNet(model.net.Proto().name)
    print('Spent: {}'.format((time.time() - start) / arg.iterations))
    if arg.layer_wise_benchmark:
        print('Layer-wise benchmark.')
        workspace.BenchmarkNet(model.net.Proto().name, 1, arg.iterations, True)
def benchmark_mul_gradient(args):
    workspace.FeedBlob("dC", np.random.rand(args.m, args.n).astype(np.float32))
    workspace.FeedBlob("A", np.random.rand(args.m, args.n).astype(np.float32))
    workspace.FeedBlob("B", np.random.rand(args.m).astype(np.float32))

    net = core.Net("mynet")
    net.MulGradient(["dC", "A", "B"], ["dA", "dB"], broadcast=True, axis=0)
    workspace.CreateNet(net)

    workspace.BenchmarkNet(net.Name(), 1, args.iteration, True)
def benchmark_concat(num_inputs, input_dim, axis, add_axis, iterations):
    input_names = [f"input{i}" for i in range(num_inputs)]
    for n in input_names:
        workspace.FeedBlob(n, np.random.randn(*input_dim).astype(np.float32))

    net = core.Net("benchmark_net")
    net.Concat(input_names, ["output", "split_info"],
               axis=axis,
               add_axis=add_axis)
    workspace.CreateNet(net)

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    print(f"{num_inputs * np.prod(input_dim) * 4 / runtimes[1] / 1e6} GB/s")
def benchmark_sparse_lengths_sum(dtype_str, categorical_limit, embedding_size,
                                 average_len, batch_size, iterations):
    print('Preparing lookup table. ' + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    if dtype_str == 'uint8':
        scale_bias = np.random.rand(categorical_limit, 2).astype(np.float32)
        workspace.FeedBlob("scale_bias", scale_bias.astype(np.float32))
    elif dtype_str == 'uint8_fused':
        scale_bias = np.random.randint(255, size=(categorical_limit, 8))
        data = np.concatenate([data, scale_bias], axis=1)

    print('Data has shape {} {}'.format(data.shape, datetime.datetime.now()))
    workspace.FeedBlob("X", data.astype(DTYPES[dtype_str]))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(int(average_len * 0.75),
                                    int(average_len * 1.25),
                                    batch_size).astype(np.int32)
        indices = np.random.randint(0, categorical_limit,
                                    np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    net = core.Net("mynet")
    net.Python(f)([], [
        "indices",
        "lengths",
    ])
    if dtype_str == "uint8":
        net.SparseLengthsSum8BitsRowwise(
            ["X", "indices", "lengths", "scale_bias"], "Y")
    elif dtype_str == "uint8_fused":
        net.SparseLengthsSumFused8BitRowwise(["X", "indices", "lengths"], "Y")
    else:
        net.SparseLengthsSum(["X", "indices", "lengths"], "Y")
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print('Preparation finished. ' + str(datetime.datetime.now()))

    workspace.BenchmarkNet(net.Name(), 1, iterations, True)
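# Hypothetical invocation; DTYPES is assumed to be a module-level map from
# dtype strings to numpy dtypes (e.g. 'float' -> np.float32), with 'uint8' and
# 'uint8_fused' selecting the quantized operators above.
benchmark_sparse_lengths_sum('float', categorical_limit=10**6,
                             embedding_size=64, average_len=20,
                             batch_size=100, iterations=10)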
Example #13
    def testConvReluLRNSpeed(self):
        # We randomly select a shape to test the speed. Intentionally we
        # test a batch size of 1 since this may be the most frequent use
        # case for MKL during deployment time.
        X = np.random.rand(1, 3, 224, 224).astype(np.float32) - 0.5
        W = np.random.rand(64, 3, 11, 11).astype(np.float32) - 0.5
        b = np.random.rand(64).astype(np.float32) - 0.5

        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("b", b)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
        workspace.FeedBlob("b_mkl", b, device_option=mkl_do)

        net = core.Net("test")

        net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=11)
        net.Conv(["X_mkl", "W_mkl", "b_mkl"],
                 "C_mkl",
                 pad=1,
                 stride=1,
                 kernel=11,
                 device_option=mkl_do)
        net.Relu("C", "R")
        net.Relu("C_mkl", "R_mkl", device_option=mkl_do)
        net.LRN("R", ["Y", "Y_Scale"],
                size=5,
                alpha=0.001,
                beta=0.75,
                bias=2.0,
                order="NCHW")
        net.LRN("R_mkl", ["Y_mkl", "Y_Scale_mkl"],
                size=5,
                alpha=0.001,
                beta=0.75,
                bias=2.0,
                order="NCHW",
                device_option=mkl_do)

        workspace.CreateNet(net)
        workspace.RunNet(net)
        # makes sure that the results are good.
        np.testing.assert_allclose(workspace.FetchBlob("Y"),
                                   workspace.FetchBlob("Y_mkl"),
                                   atol=1e-2,
                                   rtol=1e-2)
        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    def testConvReluMaxPoolFcSpeed(self):
        # We randomly select a shape to test the speed. Intentionally we
        # test a batch size of 1 since this may be the most frequent use
        # case for MKL during deployment time.
        X = np.random.rand(1, 256, 13, 13).astype(np.float32) - 0.5
        W = np.random.rand(256, 256, 3, 3).astype(np.float32) - 0.5
        b = np.random.rand(256).astype(np.float32) - 0.5

        w_fc = np.random.rand(4096, 9216).astype(np.float32) - 0.5
        b_fc = np.random.rand(4096).astype(np.float32) - 0.5
        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("b", b)
        workspace.FeedBlob("w_fc", w_fc)
        workspace.FeedBlob("b_fc", b_fc)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
        workspace.FeedBlob("b_mkl", b, device_option=mkl_do)
        workspace.FeedBlob("w_fc_mkl", w_fc, device_option=mkl_do)
        workspace.FeedBlob("b_fc_mkl", b_fc, device_option=mkl_do)

        net = core.Net("test")

        net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=3)
        net.Relu("C", "R")
        net.MaxPool("R", "P", stride=2, kernel=3)
        net.FC(["P", "w_fc", "b_fc"], "Y")

        net.Conv(["X_mkl", "W_mkl", "b_mkl"],
                 "C_mkl",
                 pad=1,
                 stride=1,
                 kernel=3,
                 device_option=mkl_do)
        net.Relu("C_mkl", "R_mkl", device_option=mkl_do)
        net.MaxPool("R_mkl", "P_mkl", stride=2, kernel=3, device_option=mkl_do)
        net.FC(["P_mkl", "w_fc_mkl", "b_fc_mkl"],
               "Y_mkl",
               device_option=mkl_do)

        workspace.CreateNet(net)
        workspace.RunNet(net)
        # makes sure that the results are good.
        np.testing.assert_allclose(workspace.FetchBlob("Y"),
                                   workspace.FetchBlob("Y_mkl"),
                                   atol=1e-2,
                                   rtol=1e-2)
        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
Example #15
def benchmark(args):
    print('Batch size: {}'.format(args.batch_size))
    mf = ModelDownloader()
    init_net, pred_net, value_info = mf.get_c2_model(args.model)
    input_shapes = {
        k: [args.batch_size] + v[-1][1:]
        for (k, v) in value_info.items()
    }
    print("input info: {}".format(input_shapes))
    external_inputs = {}
    for k, v in input_shapes.items():
        external_inputs[k] = np.random.randn(*v).astype(np.float32)

    if args.device == 'CPU':
        device_option = core.DeviceOption(caffe2_pb2.CPU)
    elif args.device == 'MKL':
        device_option = core.DeviceOption(caffe2_pb2.MKLDNN)
    elif args.device == 'IDEEP':
        device_option = core.DeviceOption(caffe2_pb2.IDEEP)
    else:
        raise Exception("Unknown device: {}".format(args.device))
    print("Device option: {}, {}".format(args.device, device_option))
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)

    # Hack to initialize weights in the MKL/IDEEP context
    workspace.RunNetOnce(init_net)
    bb = workspace.Blobs()
    weights = {}
    for b in bb:
        weights[b] = workspace.FetchBlob(b)
    for k, v in external_inputs.items():
        weights[k] = v
    workspace.ResetWorkspace()

    with core.DeviceScope(device_option):
        for name, blob in weights.items():
            #print("{}".format(name))
            workspace.FeedBlob(name, blob, device_option)
        workspace.CreateNet(pred_net)
        start = time.time()
        res = workspace.BenchmarkNet(pred_net.name, args.warmup_iterations,
                                     args.iterations,
                                     args.layer_wise_benchmark)
        print("FPS: {:.2f}".format(1 / res[0] * 1000 * args.batch_size))
def compare_fcs(B, M, N, num_runs, mapping_options=None):
    X = np.random.rand(B, M).astype(np.float32) - 0.5
    W = np.random.rand(N, M).astype(np.float32) - 0.5
    b = np.random.rand(N).astype(np.float32) - 0.5

    with core.DeviceScope(core.DeviceOption(1)):  # device type 1 == CUDA
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("b", b)

    net = core.Net("test")

    with core.DeviceScope(core.DeviceOption(1)):
        net.FC(["X", "W", "b"], "Y_baseline")
        net.TcOp(
            ["X", "W", "b"],
            "Y_TC",
            tc_def=FC_LANG,
            tc_name="func_fc",
            mapping_options=(mapping_options.serialize()
                             if mapping_options else None),
            check_sizes=True,
        )

    workspace.CreateNet(net)
    workspace.RunNet(net)

    baseline_value = workspace.blobs["Y_baseline"]
    tc_value = workspace.blobs["Y_TC"]
    np.testing.assert_allclose(
        baseline_value,
        tc_value,
        rtol=1e-4,
        atol=1e-4,
    )

    runtimes = workspace.BenchmarkNet(
        net.Name(),
        0,  # warmup was already done
        num_runs,
        True,  # run individual ops
    )[1:]

    print(runtimes)
def benchmark_sparse_lengths_sum(
        dtype_str,
        categorical_limit,
        embedding_size,
        average_len,
        batch_size,
        iterations):
    print('Preparing lookup table. ' + str(datetime.datetime.now()))
    # We will use a constant, but non-trivial value so we save initialization
    # time.
    arr = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    arr *= 17.01

    dtype_table = {
        'float': np.float32,
        'float16': np.float16
    }
    workspace.FeedBlob("X", arr.astype(dtype_table[dtype_str]))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(average_len * 0.75),
            int(average_len * 1.25),
            batch_size).astype(np.int32)
        indices = np.random.randint(
            0, categorical_limit, np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    net = core.Net("mynet")
    net.Python(f)([], ["indices", "lengths"])
    net.SparseLengthsSum(["X", "indices", "lengths"], "Y")
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print('Preparation finished. ' + str(datetime.datetime.now()))

    workspace.BenchmarkNet(net.Name(), 1, iterations, True)
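# Example invocation (values assumed): switch dtype_str between 'float' and
# 'float16' to compare fp32 vs. fp16 table storage.
benchmark_sparse_lengths_sum('float16', categorical_limit=10**6,
                             embedding_size=64, average_len=20,
                             batch_size=100, iterations=10)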
Example #18
    def testReLUConsistencyWithCPU(self):
        X = np.random.randn(128, 4096).astype(np.float32)
        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        model = cnn.CNNModelHelper()
        # Makes sure that we can run relu.
        model.Relu("X", "Y")
        model.Relu("X_mkl", "Y_mkl", device_option=mkl_do)
        workspace.CreateNet(model.net)
        workspace.RunNet(model.net)
        # makes sure that the results are good.
        np.testing.assert_allclose(workspace.FetchBlob("Y"),
                                   workspace.FetchBlob("Y_mkl"),
                                   atol=1e-10,
                                   rtol=1e-10)
        runtime = workspace.BenchmarkNet(model.net.Proto().name, 1, 10, True)
        # The returned runtime is the time of
        # [whole_net, cpu_op, mkl_op]
        # so we will assume that the MKL one runs faster than the CPU one.
        self.assertTrue(runtime[1] >= runtime[2])
        print("CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
def main():
    args = parser.parse_args()
    args.gpu_id = 0

    model = model_helper.ModelHelper(name="le_net", init_params=False)

    # Bring in the init net from init_net.pb
    init_net_proto = caffe2_pb2.NetDef()
    with open(args.c2_init, "rb") as f:
        init_net_proto.ParseFromString(f.read())
    model.param_init_net = core.Net(
        init_net_proto
    )  # model.param_init_net.AppendNet(core.Net(init_net_proto)) #

    # bring in the predict net from predict_net.pb
    predict_net_proto = caffe2_pb2.NetDef()
    with open(args.c2_predict, "rb") as f:
        predict_net_proto.ParseFromString(f.read())
    model.net = core.Net(
        predict_net_proto)  # model.net.AppendNet(core.Net(predict_net_proto))

    # CUDA performance not impressive
    #device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id)
    #model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
    #model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)

    input_blob = model.net.external_inputs[0]
    model.param_init_net.GaussianFill([],
                                      input_blob.GetUnscopedName(),
                                      shape=(args.batch_size, 3, args.img_size,
                                             args.img_size),
                                      mean=0.0,
                                      std=1.0)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net, overwrite=True)
    workspace.BenchmarkNet(model.net.Proto().name, 5, 20, True)
Example #20
    def testLRNSpeed(self):
        # We randomly select a shape to test the speed. Intentionally we
        # test a batch size of 1 since this may be the most frequent use
        # case for MKL during deployment time.
        X = np.random.rand(1, 2, 224, 224).astype(np.float32)
        mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
        # Makes sure that feed works.
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
        net = core.Net("test")
        # Makes sure that we can run LRN.
        net.LRN("X", ["Y", "Y_Scale"],
                size=5,
                alpha=0.001,
                beta=0.75,
                bias=2.0,
                order="NCHW")
        net.LRN("X_mkl", ["Y_mkl", "Y_Scale_mkl"],
                size=5,
                alpha=0.001,
                beta=0.75,
                bias=2.0,
                order="NCHW",
                device_option=mkl_do)
        workspace.CreateNet(net)
        workspace.RunNet(net)

        # makes sure that the results are good.
        np.testing.assert_allclose(workspace.FetchBlob("Y"),
                                   workspace.FetchBlob("Y_mkl"),
                                   atol=1e-2,
                                   rtol=1e-2)
        runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)

        print("LRN CPU runtime {}, MKL runtime {}.".format(
            runtime[1], runtime[2]))
Example #21
def print_benchmark(shape):
    print("==> Counting FLOPS")
    model = model_helper.ModelHelper(name="model", init_params=False)

    init_net_proto = caffe2_pb2.NetDef()
    with open('weights/model.init.pb', "rb") as f:
        init_net_proto.ParseFromString(f.read())
    model.param_init_net = core.Net(init_net_proto)

    predict_net_proto = caffe2_pb2.NetDef()
    with open('weights/model.predict.pb', "rb") as f:
        predict_net_proto.ParseFromString(f.read())
    model.net = core.Net(predict_net_proto)

    model.param_init_net.GaussianFill(
        [],
        model.net.external_inputs[0].GetUnscopedName(),
        shape=shape,
        mean=0.0,
        std=1.0)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.BenchmarkNet(model.net.Proto().name, 5, 100, True)
    print("==> Done")
Example #22
device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
with core.NameScope("imonaboat"):
    with core.DeviceScope(device_opt):
        data, label = AddInput(train_model,
                               batch_size=1,
                               db='/caffe/train',
                               db_type='lmdb')
        softmax = AddLeNetModel(train_model, data)

print('Created training model.')

# The parameter initialization network only needs to be run once.
workspace.RunNetOnce(train_model.param_init_net)
# creating the network
workspace.CreateNet(train_model.net)
# set the number of iterations and track the accuracy & loss
total_iters = 20

start = time.time()
print('Start')
for i in range(total_iters):
    st = time.time()
    #workspace.RunNet(train_model.net.Proto().name)
    workspace.BenchmarkNet(train_model.net.Proto().name, 0, 1, True)
    e = time.time()
    print(i + 1, ': iteration time {}'.format(e - st))

end = time.time()
print('Time: {}'.format(end - start))
Example #23
def network_eval(args):
    """
    Runs network benchmarking on either a single or multiple nodes
    """
    # Define some parameters for the model instantiation
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            # 1048576 = 2 ^ 20 (1 MB)
            'ws_nbytes_limit': (args.cudnn_ws_lim * 1048576),
        }
    # Create the model for evaluation
    evaluation_model = model_helper.ModelHelper(name='evaluation_model',
                                                arg_scope=train_arg_scope)

    evaluation_model.Proto().num_workers = 16

    # Default the model for accuracy testing to None
    accuracy_time_model = None

    # Compute batch and epoch sizes
    # Per CPU / GPU batch size
    per_local_device_batch = (
        args.batch_size //
        len(args.gpu_devices)) if args.gpu_devices else args.batch_size
    # Total batch size (over all the devices)
    global_batch_size = args.batch_size * args.num_shards
    # Number of epoch iterations
    epoch_iters = args.epoch_size // global_batch_size
    # Adjust the true number of examples per epoch
    args.epoch_size = global_batch_size * epoch_iters

    if args.training_data:
        log.info("Running experiments with user provided data: %s",
                 args.training_data)

        # Create a reader, which can also help distribute data when running on multiple nodes
        reader = evaluation_model.CreateDB(
            "reader",
            db=args.training_data,
            db_type=args.db_type,
            num_shards=args.num_shards,
            shard_id=args.shard_id,
        )

        def image_input(model):
            AddImageInput(model, reader, per_local_device_batch,
                          min(args.height, args.width), args.data_type,
                          args.use_cpu)
    else:
        input_shape = [args.batch_size, args.channels, args.height, args.width]
        log.info("Running experiments with synthetic data w/ shape: %s",
                 input_shape)

        def image_input(model):
            AddSyntheticInput(model, args.data_type, input_shape,
                              args.num_labels)

    # Create the network, and normalize the loss
    def create_model(model, loss_scale):
        initializer = (PseudoFP16Initializer
                       if args.data_type == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=False,
                            float16_compute=False):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.channels,
                num_labels=args.num_labels,
                # num_groups=args.resnext_num_groups,
                # num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True)

        # If we're using 2-byte floats (float16), inflate to the 4-byte representation
        if args.data_type == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        # Compute the softmax probabilities and the loss
        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])

        # Normalize the loss, and compute the top-k accuracies for k in {1, 5}
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        """
        Optimizer function called once for the entire model, as opposed for each 
        CPU / GPU individually. The optimizer will be a stepwise weight decay.

        :return: return the optimizer
        """
        stepsz = int(30 * args.epoch_size / args.batch_size / args.num_shards)
        stepsz = stepsz if stepsz else 100

        optimizer.add_weight_decay(model, 1e-4)
        # opt = optimizer.build_multi_precision_sgd(

        opt = optimizer.build_sgd(model,
                                  args.base_learning_rate,
                                  momentum=0.9,
                                  nesterov=1,
                                  policy="step",
                                  stepsize=stepsz,
                                  gamma=0.1)
        return opt

    def add_parameter_update(model):
        """
        Add a simple gradient based parameter update with stepwise adaptive learning rate.
        """
        # This counts the number of iterations we are making
        ITER = brew.iter(model, "iter")
        # Adds an LR blob to the model, updated with a step policy every 1000 iterations; gamma is the decay factor
        LR = model.LearningRate(ITER,
                                "LR",
                                base_lr=-args.base_learning_rate,
                                policy="step",
                                stepsize=1000,
                                gamma=0.999)
        # This is a constant used in the following loop
        ONE = model.param_init_net.ConstantFill([],
                                                "ONE",
                                                shape=[1],
                                                value=1.0)
        # Here we are essentially applying the gradients to the weights (using the classical method)
        for param in model.params:
            param_grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, param_grad, LR], param)

    def add_post_sync_ops(model):
        """
        Add ops applied after initial parameter sync.
        """
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    if args.num_shards > 1:
        log.info("Distributed benchmarking is enabled")
        log.info("Num shards: %d", args.num_shards)
        log.info("My shard ID: %d", args.shard_id)
        if args.redis_host:
            log.info("Using Redis server at %s:%d", args.redis_host,
                     args.redis_port)
        else:
            log.info("Rendevous at: %s", args.rendezvous_path)

        # Prepare the required parameters for distribution
        store_handler = "store_handler"

        # Use Redis for rendezvous if a host was given; otherwise fall back to
        # the shared file system
        if args.redis_host:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.rendezvous_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=args.shard_id,
                          num_shards=args.num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=args.network_interface,
                          exit_nets=None)

        # Parallelize the model (data parallel)
        data_parallel_model.Parallelize(
            evaluation_model,
            input_builder_fun=image_input,
            forward_pass_builder_fun=create_model,
            optimizer_builder_fun=None if not args.backward else
            (add_optimizer if not args.per_device_optimization else None),
            param_update_builder_fun=None if not args.backward else
            (add_parameter_update if args.per_device_optimization else None),
            post_sync_builder_fun=add_post_sync_ops
            if args.post_sync else None,
            devices=(args.gpu_devices if not args.use_cpu else [0]),
            rendezvous=rendezvous,
            # Although this is a parameter (broadcast params) of this function, it is
            # currently not implemented in Caffe2's source code
            broadcast_computed_params=args.broadcast_params,
            optimize_gradient_memory=args.optimize_gradient_memory,
            dynamic_memory_management=args.dynamic_memory_management,
            max_concurrent_distributed_ops=args.max_distributed_ops,
            num_threads_per_device=args.max_threads,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
            ideep=args.use_ideep,
            shared_model=args.shared_model,
            combine_spatial_bn=args.use_cpu,
        )

        if args.backward:
            data_parallel_model.OptimizeGradientMemory(evaluation_model, {},
                                                       set(), False)

        instantiate_and_create_net(evaluation_model)

        # If we're testing for the time it takes to reach a particular accuracy, then we'll need to create
        # a new model just for this
        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, "We must have testing data if we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model',
                arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when running on multiple nodes
            test_reader = accuracy_time_model.CreateDB("test_reader",
                                                       db=args.testing_data,
                                                       db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model,
                              test_reader,
                              per_local_device_batch,
                              min(args.height, args.width),
                              args.data_type,
                              args.use_cpu,
                              is_test=True)

            # Create the test model per se
            data_parallel_model.Parallelize(
                accuracy_time_model,
                input_builder_fun=test_image_input,
                forward_pass_builder_fun=create_model,
                post_sync_builder_fun=add_post_sync_ops
                if args.post_sync else None,
                param_update_builder_fun=None,
                devices=(args.gpu_devices if not args.use_cpu else [0]),
                cpu_device=args.use_cpu)

            instantiate_and_create_net(accuracy_time_model)
    else:
        print("Single node benchmarking is enabled")

        # Build the training model
        if args.use_cpu:
            image_input(evaluation_model)
            create_model(evaluation_model, 1.0)
            if args.backward:
                evaluation_model.AddGradientOperators(["loss"])
                add_parameter_update(evaluation_model)
        else:
            # We're running on a single GPU on a single node, so build the net
            # under that GPU's device scope
            with core.DeviceScope(
                    core.DeviceOption(caffe2_pb2.CUDA, args.gpu_devices[0])):
                image_input(evaluation_model)
                create_model(evaluation_model, 1.0)
                if args.backward:
                    evaluation_model.AddGradientOperators(["loss"])
                    add_parameter_update(evaluation_model)

        instantiate_and_create_net(evaluation_model)

        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, "We must have testing data if we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model',
                arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when running on multiple nodes
            test_reader = accuracy_time_model.CreateDB("test_reader",
                                                       db=args.testing_data,
                                                       db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model,
                              test_reader,
                              per_local_device_batch,
                              min(args.height, args.width),
                              args.data_type,
                              args.use_cpu,
                              is_test=True)

            # Create the test model per se
            test_image_input(accuracy_time_model)
            create_model(accuracy_time_model, 1.0)

            instantiate_and_create_net(accuracy_time_model)

    if not args.test_accuracy:
        workspace.BenchmarkNet(evaluation_model.net.Proto().name,
                               args.warmup_rounds, args.eval_rounds,
                               args.per_layer_eval)
    else:
        # Create a log for time to accuracy testing
        expname = "time_to_acc_model_%s_gpu%d_b%d_L%d_lr%.2f_shard%d" % (
            args.model_name, len(args.gpu_devices) if not args.use_cpu else 1,
            args.batch_size, args.num_labels, args.base_learning_rate,
            args.shard_id)

        explog = experiment_util.ModelTrainerLog(expname, args)

        # Run the epochs
        elapsed_training_time = 0.0
        for i in range(args.epoch_count):
            elapsed_training_time, on_target = RunEpoch(
                args, i, evaluation_model, accuracy_time_model, explog,
                elapsed_training_time)

            if args.terminate_on_target and on_target:
                log.info("Have reached the target accuracy: {} in {} seconds.".
                         format(args.target_accuracy, elapsed_training_time))
                break
Example #24
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.device)
    model.Proto().type = arg.net_type
    model.Proto().num_workers = arg.num_workers

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    if arg.model == "MLP":
        input_shape = [arg.batch_size, input_size]

    model.param_init_net.GaussianFill([],
                                      "data",
                                      shape=input_shape,
                                      mean=0.0,
                                      std=1.0)
    # IDEEP/MKL doesn't support int fill ops, so we have to feed an int32
    # numpy array instead
    if arg.device == 'MKL' or arg.device == 'IDEEP':
        label = np.random.randint(low=0, high=1000,
                                  size=(arg.batch_size, )).astype(np.int32)
        workspace.FeedBlob("label", label)
    else:
        model.param_init_net.UniformIntFill([],
                                            "label",
                                            shape=[
                                                arg.batch_size,
                                            ],
                                            min=0,
                                            max=999)

    if arg.forward_only:
        print('{}: running forward only.'.format(arg.model))
    elif arg.device == 'MKL':
        raise Exception('forward-backward not supported yet in MKL')
    else:
        print('{}: running forward-backward.'.format(arg.model))
        model.AddGradientOperators(["loss"])
        AddParameterUpdate(model)
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.')

    if arg.device == 'IDEEP':
        model.param_init_net.RunAllOnIDEEP()
        model.net.RunAllOnIDEEP()
    elif arg.device == 'MKL':
        model.param_init_net.RunAllOnMKL()
        model.net.RunAllOnMKL()
    elif arg.device == 'CUDA':
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()
    print('Running on device: {}'.format(arg.device))

    if arg.engine:
        for op in model.net.Proto().op:
            op.engine = arg.engine

    if arg.dump_model:
        # Writes out the pbtxt for benchmarks on e.g. Android
        with open("{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size),
                  "w") as fid:
            fid.write(str(model.param_init_net.Proto()))
        with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
            fid.write(str(model.net.Proto()))

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    results = workspace.BenchmarkNet(model.net.Proto().name,
                                     arg.warmup_iterations, arg.iterations,
                                     arg.layer_wise_benchmark)
    print('FPS: {}'.format(arg.batch_size * 1000 / results[0]))
Example #25
                #i = init_net.Cast([i], to=itype)
                i = workspace.FeedBlob("ind", np.load('ind.npy'))
                l = init_net.ConstantFill(
                    [],
                    shape=[isize // args.pooling],
                    value=args.pooling,
                    dtype=core.DataType.INT32,
                )
                #net.SparseLengthsSum([d, i, l], name=name, engine=engine)
                net.SparseLengthsSum(["weights", "ind", l], name=name, engine=engine)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--table-size', type=int, default=10**7,
                        help='embedding table size')
    parser.add_argument('--batch-size', type=int, default=1024,
                        help='batch size')
    parser.add_argument('--pooling', type=int, default=20,
                        help='pooling')
    parser.add_argument('--column', type=int, default=64,
                        help='number of columns in the embedding table')
    args, extra_args = parser.parse_known_args()

    benchSparseSegmentSum()

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'] + extra_args)
    workspace.RunNetOnce(init_net)
    workspace.CreateNet(net)
    workspace.BenchmarkNet(net.Proto().name, 100, 10000, True)
def benchmark_sparse_lengths_sum(
        categorical_limit,
        embedding_size,
        average_len,
        batch_size,
        iterations,
        flush_cache,
        bit_rate=4,  # the fused operators used below support bit rates of 2 and 4
):
    print("Preparing lookup table. " + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    init_net = core.Net("init_net")
    op = core.CreateOperator(
        "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", "X", "X_q")
    init_net.Proto().op.extend([op])
    workspace.FeedBlob("X", data)

    print("Data has shape {} {}".format(data.shape, datetime.datetime.now()))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(int(average_len * 0.75),
                                    int(average_len * 1.25),
                                    batch_size).astype(np.int32)
        indices = np.random.randint(0, categorical_limit,
                                    np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    init_net.Python(f)([], ["indices", "lengths"])
    workspace.RunNetOnce(init_net)

    net = core.Net("mynet")
    if flush_cache:
        l3_cache_size = 30 * 2**20 // 4  # 30 MB worth of float32 elements
        workspace.FeedBlob("huge_blob",
                           np.random.randn(l3_cache_size).astype(np.float32))
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)
    op = core.CreateOperator(
        "SparseLengthsSumFused" + str(bit_rate) + "BitRowwise",
        ["X_q", "indices", "lengths"],
        "Y",
    )
    net.Proto().op.extend([op])
    workspace.CreateNet(net)

    # Set random seed, so that repeated runs will keep the same sequence of
    # random indices.
    np.random.seed(1701)

    print("Preparation finished. " + str(datetime.datetime.now()))

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    print("{} billion sums per sec".format(
        embedding_size * workspace.FetchBlob("indices").size /
        runtimes[2 if flush_cache else 1] / 1e6))
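# Illustrative call (values assumed): benchmark 4-bit fused rows, flushing the
# L3 cache between iterations to measure cold-cache performance.
benchmark_sparse_lengths_sum(categorical_limit=10**6, embedding_size=64,
                             average_len=20, batch_size=100, iterations=10,
                             flush_cache=True, bit_rate=4)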
Example #27
def benchmark_sparse_normalize(
    categorical_limit,
    embedding_size,
    average_len,
    batch_size,
    iterations,
    flush_cache,
    fp16,
):
    print("Preparing lookup table. " + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    init_net = core.Net("init_net")
    if fp16:
        op = core.CreateOperator("FloatToHalf", "X", "X_fp16")
        init_net.Proto().op.extend([op])
    l3_cache_size = 30 * 2**20 // 4  # 30 MB worth of float32 elements

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(int(average_len * 0.75),
                                    int(average_len * 1.25),
                                    batch_size).astype(np.int32)
        indices = np.random.randint(0, categorical_limit,
                                    np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)

    workspace.FeedBlob("X", data)
    workspace.FeedBlob("huge_blob",
                       np.random.randn(l3_cache_size).astype(np.float32))

    print("Data has shape {} {}".format(data.shape, datetime.datetime.now()))

    init_net.Python(f)([], ["indices"])
    # Seed numpy before running init_net so that repeated runs generate the
    # same sequence of random indices.
    np.random.seed(1701)
    workspace.RunNetOnce(init_net)

    net = core.Net("mynet")
    op = core.CreateOperator(
        "Float16SparseNormalize" if fp16 else "SparseNormalize",
        ["X_fp16", "indices"] if fp16 else ["X", "indices"],
        "X_fp16" if fp16 else "X",
    )
    net.Proto().external_input.append("X")
    net.Proto().external_input.append("X_fp16")
    net.Proto().external_input.append("indices")
    net.Proto().op.extend([op])
    if flush_cache:
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)

    workspace.CreateNet(net)

    print("Preparation finished. " + str(datetime.datetime.now()))

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)

    print("{} ms".format(runtimes[2 if flush_cache else 1]))
    print("indice_size: " + str(workspace.FetchBlob("indices").size))
    print("{} GB/sec".format((2 if fp16 else 4) * embedding_size *
                             workspace.FetchBlob("indices").size /
                             runtimes[2 if flush_cache else 1] / 1e6))
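# A minimal invocation sketch for benchmark_sparse_normalize; all values are
# illustrative placeholders.
if __name__ == "__main__":
    benchmark_sparse_normalize(
        categorical_limit=1000 * 1000,
        embedding_size=64,
        average_len=30,
        batch_size=100,
        iterations=20,
        flush_cache=True,  # the flush op runs first; its time is excluded
        fp16=False,
    )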
def compare_fcs(M, K, N, args):
    X = np.random.rand(M, K).astype(np.float32) - 0.5
    W = np.random.rand(N, K).astype(np.float32) - 0.5
    b = np.random.rand(N).astype(np.float32) - 0.5

    def fc(X, W, b):
        return np.dot(X, np.transpose(W)) + b

    ground = np.array(fc(X, W, b))
    Y_scale = (ground.max() - ground.min()) / 255
    print("min ", ground.min(), " max ", ground.max(), " scale ", Y_scale)
    print("l3_cache_size ", args.l3_cache_size * 4 / 2 ** 20, "MB")
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("W", W)
    workspace.FeedBlob("WT", W.T)
    workspace.FeedBlob("b", b)
    workspace.FeedBlob(
        "huge_blob", np.random.randn(args.l3_cache_size).astype(np.float32)
    )

    net = core.Net("test")

    net.FC(["X", "W", "b"], "Y_default")
    net.FCTransposed(["X", "WT", "b"], "Y_pretranspose")

    if args.quantize_input:
        quantize_X = core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP")
        net.Proto().op.extend([quantize_X])
        quantize_W = core.CreateOperator("Quantize", ["W"], ["W_q"], engine="DNNLOWP")
        net.Proto().op.extend([quantize_W])

    fc_i8_rowwise = core.CreateOperator(
        "Int8FCRowWise",
        ["X_q", "W", "b"] if args.quantize_input else ["X", "W", "b"],
        "Y_rowwise_dnnlowp",
        dequantize_output=0 if args.quantize_output else 1,
        Y_scale=Y_scale,
        Y_zero_point=0,
        engine="DNNLOWP",
    )
    net.Proto().op.extend([fc_i8_rowwise])

    fc_i8 = core.CreateOperator(
        "Int8FC",
        ["X_q", "W_q", "b"] if args.quantize_input else ["X", "W", "b"],
        "Y_dnnlowp",
        dequantize_output=0 if args.quantize_output else 1,
        Y_scale=Y_scale,
        Y_zero_point=0,
        engine="DNNLOWP",
    )
    net.Proto().op.extend([fc_i8])

    pack_w = core.CreateOperator("FbGemmPack", ["W"], "W_packed")
    net.Proto().op.extend([pack_w])
    fc_fp16 = core.CreateOperator("FbFCPacked", ["X", "W_packed", "b"], ["Y_fp16"])
    net.Proto().op.extend([fc_fp16])

    # Re-order the net so that a cache-wiping Scale over huge_blob runs after
    # every measured op, giving each FC variant a cold cache.
    ops = [op for op in net.Proto().op]
    del net.Proto().op[:]
    for op in ops:
        net.Proto().op.extend([op])
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)  # wipe caches

    workspace.CreateNet(net)
    workspace.RunNet(net)

    # makes sure that results are good.
    outputs = [op.output[0] for op in net.Proto().op if "FC" in op.type]
    for Y in outputs:
        if "i8" in Y or "fp16" in Y or "dnnlowp" in Y:
            continue
        np.testing.assert_allclose(
            workspace.FetchBlob(outputs[0]),
            workspace.FetchBlob(Y),
            atol=1e-2,
            rtol=1e-2,
        )

    runtimes = workspace.BenchmarkNet(
        net.Name(), 1, args.num_runs, True  # 1 warmup run; time individual ops
    )[1:]

    results = {
        op.output[0]: runtime
        for op, runtime in zip(net.Proto().op, runtimes)
        if "FC" in op.type
    }

    def get_gflops(time, m, k, n):
        # time is in ms: 2*m*k*n flops / time / 1e6 = GFLOP/s, rounded to
        # one decimal place.
        return round(m * n * k * 2 / time / 10 ** 6 * 10) / 10

    results = [
        (out, time, "{} GFLOPS".format(get_gflops(time, M, K, N)))
        for out, time in results.items()
    ]
    # results = sorted(results, key=operator.itemgetter(1))

    print("input shape M, N, K: {} {} {}".format(M, N, K))
    for output, time, flops in results:
        print("{}: {:.4f} ms {}".format(output, time, flops))
def benchmark_sparse_lengths_sum(
    dtype_str,
    categorical_limit,
    embedding_size,
    average_len,
    batch_size,
    iterations,
    flush_cache,
):
    print("Preparing lookup table. " + str(datetime.datetime.now()))

    # We will use a constant, but non-trivial value so we save initialization
    # time.
    data = np.ones([categorical_limit, embedding_size], dtype=np.float32)
    data *= 17.01

    if dtype_str == "uint8":
        scale_bias = np.random.rand(categorical_limit, 2).astype(np.float32)
        workspace.FeedBlob("scale_bias", scale_bias.astype(np.float32))
    elif dtype_str == "uint8_fused":
        # The fused format stores the per-row scale and bias (4 bytes each)
        # in 8 extra columns appended to every row.
        scale_bias = np.random.randint(255, size=(categorical_limit, 8))
        data = np.concatenate([data, scale_bias], axis=1)

    print("Data has shape {} {}".format(data.shape, datetime.datetime.now()))
    workspace.FeedBlob("X", data.astype(DTYPES[dtype_str]))

    # In order to produce truly random lengths and indices, we will embed a
    # Python operator in the net to generate them.
    def f(_, outputs):
        lengths = np.random.randint(
            int(np.round(average_len * 0.75)),
            int(np.round(average_len * 1.25)) + 1,
            batch_size,
        ).astype(np.int32)
        indices = np.random.randint(0, categorical_limit,
                                    np.sum(lengths)).astype(np.int64)
        outputs[0].feed(indices)
        outputs[1].feed(lengths)

    init_net = core.Net("init_net")
    init_net.Python(f)([], ["indices", "lengths"])
    # Seed numpy before running init_net so that repeated runs generate the
    # same sequence of random lengths and indices.
    np.random.seed(1701)
    workspace.RunNetOnce(init_net)

    net = core.Net("mynet")
    if flush_cache:
        l3_cache_size = 30 * 2**20 // 4  # float32 elements covering ~30 MiB
        workspace.FeedBlob("huge_blob",
                           np.random.randn(l3_cache_size).astype(np.float32))
        net.Scale("huge_blob", "huge_blob_2x", value=2.0)
    if dtype_str == "uint8":
        net.SparseLengthsSum8BitsRowwise(
            ["X", "indices", "lengths", "scale_bias"], "Y")
    elif dtype_str == "uint8_fused":
        net.SparseLengthsSumFused8BitRowwise(["X", "indices", "lengths"], "Y")
    else:
        net.SparseLengthsSum(["X", "indices", "lengths"], "Y")
    workspace.CreateNet(net)

    print("Preparation finished. " + str(datetime.datetime.now()))

    runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True)
    print("{} billion sums per cycle".format(
        embedding_size * workspace.FetchBlob("indices").size /
        runtimes[2 if flush_cache else 1] / 1e6))
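# A minimal invocation sketch for the dtype-parameterized benchmark above.
# It assumes the module-level DTYPES dict maps dtype_str to a numpy dtype
# (e.g. "float" -> np.float32); all sizes are illustrative placeholders.
if __name__ == "__main__":
    benchmark_sparse_lengths_sum(
        dtype_str="float",
        categorical_limit=1000 * 1000,
        embedding_size=64,
        average_len=30,
        batch_size=100,
        iterations=20,
        flush_cache=False,
    )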
def Benchmark(model_gen, arg):
    model, input_size = model_gen(arg.order)

    # In order to be able to run everything without feeding more stuff, let's
    # add the data and label blobs to the parameter initialization net as well.
    if arg.order == "NCHW":
        input_shape = [arg.batch_size, 3, input_size, input_size]
    else:
        input_shape = [arg.batch_size, input_size, input_size, 3]
    model.param_init_net.GaussianFill([],
                                      "data",
                                      shape=input_shape,
                                      mean=0.0,
                                      std=1.0)
    model.param_init_net.UniformIntFill([],
                                        "label",
                                        shape=[
                                            arg.batch_size,
                                        ],
                                        min=0,
                                        max=999)

    # Note: even when we are running things on CPU, adding a few engine-related
    # arguments will not hurt, since the CPU operator registry will simply
    # ignore these options and take the default path.
    for op in model.net.Proto().op:
        if op.type == 'Conv' or op.type == 'ConvFp16':
            op.engine = 'CUDNN'
            #op.arg.add().CopyFrom(utils.MakeArgument('ws_nbytes_limit', arg.cudnn_limit))
            op.arg.add().CopyFrom(utils.MakeArgument('exhaustive_search', 1))
            op.arg.add().CopyFrom(
                utils.MakeArgument('shared_ws_name', 'cudnn_workspace'))
        elif op.type in [
                'MaxPool', 'MaxPoolFp16', 'AveragePool', 'AveragePoolFp16',
                'Relu', 'ReluFp16', 'Softmax', 'SoftmaxFp16'
        ]:
            op.engine = 'CUDNN'
    if arg.forward_only:
        print(arg.model, ': running forward only.')
    else:
        print(arg.model, ': running forward-backward.')
        model.AddGradientOperators()
        if arg.order == 'NHWC':
            print(
                '==WARNING==\n'
                'NHWC order with CuDNN may not be supported yet, so I might\n'
                'exit suddenly.')

    if not arg.cpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    for i in range(arg.warmup_iterations):
        workspace.RunNet(model.net.Proto().name)

    start = time.time()
    for i in range(arg.iterations):
        workspace.RunNet(model.net.Proto().name)
    print('Spent: ', (time.time() - start) / arg.iterations)
    if arg.layer_wise_benchmark:
        print('Layer-wise benchmark.')
        workspace.BenchmarkNet(model.net.Proto().name, 1, arg.iterations, True)
    # Writes out the pbtxt for benchmarks on e.g. Android
    with open("{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size),
              "w") as fid:
        fid.write(str(model.param_init_net.Proto()))
    with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid:
        fid.write(str(model.net.Proto()))
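# A minimal invocation sketch for Benchmark. model_gen is any callable that
# returns (model, input_size) for a given order -- the original script defines
# such builders elsewhere, so the name in the commented call is a placeholder.
# The Namespace mirrors only the attributes Benchmark reads; all values are
# illustrative.
if __name__ == "__main__":
    import argparse

    arg = argparse.Namespace(
        model="AlexNet",
        order="NCHW",
        batch_size=32,
        cpu=False,
        forward_only=True,
        warmup_iterations=5,
        iterations=10,
        layer_wise_benchmark=False,
    )
    # Benchmark(AlexNet, arg)  # supply a real (model, input_size) builder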