Beispiel #1
0
def run_pooling_operators_benchmarks(
        ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
    """Runs nd.Pooling benchmarks on 1D and 2D inputs.

    Every combination of pool type ('avg', 'max', 'sum') and global_pool
    flag (0, 1) is run against two 1D shapes (layout 'NCW') and two 2D
    shapes (layout 'NCHW'), always with run_backward=True.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 10
        Number of times to run for warmup
    runs: int, default 50
        Number of runs to capture benchmark results

    Returns
    -------
    Output of merge_map_list applied to the 1D results followed by the 2D results.

    """
    def _bench(op_inputs):
        # Single place where the benchmark settings shared by all runs live.
        return run_performance_test([nd.Pooling],
                                    run_backward=True,
                                    dtype=dtype,
                                    ctx=ctx,
                                    inputs=[op_inputs],
                                    warmup=warmup,
                                    runs=runs)

    results_1d = []
    results_2d = []
    for mode in ['avg', 'max', 'sum']:
        for use_global in [0, 1]:
            # 1D and 2D runs stay interleaved per (mode, use_global) pair.
            for shape in [(32, 3, 256), (32, 3, 64)]:
                results_1d.extend(_bench({"data": shape,
                                          "kernel": 3,
                                          "pool_type": mode,
                                          "global_pool": use_global,
                                          "stride": 1,
                                          "pad": 1,
                                          "layout": 'NCW'}))
            for shape in [(32, 3, 256, 256), (32, 3, 64, 64)]:
                results_2d.extend(_bench({"data": shape,
                                          "kernel": (3, 3),
                                          "pool_type": mode,
                                          "global_pool": use_global,
                                          "stride": (1, 1),
                                          "pad": (0, 0),
                                          "layout": 'NCHW'}))

    # Combined results: all 1D entries first, then all 2D entries
    return merge_map_list(results_1d + results_2d)
Beispiel #2
0
def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32'):
    """Run every MXNet operator (NDArray) benchmark suite and merge the results.

    Each suite is invoked with the same ``ctx`` and ``dtype`` keyword
    arguments, in the order listed below.

    Returns
    -------
    Output of merge_map_list applied to the list of per-suite results.
    """
    shared_args = {"ctx": ctx, "dtype": dtype}

    # List elements are evaluated top to bottom, so suites run in this order.
    suite_results = [
        # ---------------- Tensor operator benchmarks ----------------
        # Unary operations
        run_mx_unary_operators_benchmarks(**shared_args),
        # Binary broadcast and element-wise operations
        run_mx_binary_broadcast_operators_benchmarks(**shared_args),
        run_mx_binary_element_wise_operators_benchmarks(**shared_args),
        # GEMM operations
        run_gemm_operators_benchmarks(**shared_args),
        # Random sampling operations
        run_mx_random_sampling_operators_benchmarks(**shared_args),
        # Reduction operations
        run_mx_reduction_operators_benchmarks(**shared_args),
        # Sorting and searching operations
        run_sorting_searching_operators_benchmarks(**shared_args),
        # Array rearrange operations
        run_rearrange_operators_benchmarks(**shared_args),

        # ------------------ NN operator benchmarks ------------------
        # Basic NN operations
        run_nn_basic_operators_benchmarks(**shared_args),
        # Activation operations
        run_activation_operators_benchmarks(**shared_args),
        # Pooling operations
        run_pooling_operators_benchmarks(**shared_args),
        # Convolution operations
        run_convolution_operators_benchmarks(**shared_args),
        # Optimizer operations
        run_optimizer_operators_benchmarks(**shared_args),
        # Transpose convolution operations
        run_transpose_convolution_operators_benchmarks(**shared_args),
    ]

    # Prepare final results
    return merge_map_list(suite_results)
Beispiel #3
0
def run_convolution_operators_benchmarks(
        ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
    """Runs nd.Convolution benchmarks on 1D and 2D inputs.

    Two 1D shapes (layout 'NCW') are run first, followed by two 2D shapes
    (layout 'NCHW'); every run uses 64 filters and run_backward=True.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 10
        Number of times to run for warmup
    runs: int, default 50
        Number of runs to capture benchmark results

    Returns
    -------
    Output of merge_map_list applied to the 1D results followed by the 2D results.

    """
    def _bench(op_inputs):
        # Benchmark settings common to the 1D and 2D cases.
        return run_performance_test([nd.Convolution],
                                    run_backward=True,
                                    dtype=dtype,
                                    ctx=ctx,
                                    inputs=[op_inputs],
                                    warmup=warmup,
                                    runs=runs)

    # Conv1D: one benchmark call per data shape, results flattened in order
    results_1d = [
        entry
        for shape in [(32, 3, 256), (32, 3, 64)]
        for entry in _bench({"data": shape,
                             "weight": (64, 3, 3),
                             "bias": (64, ),
                             "kernel": (3, ),
                             "stride": (1, ),
                             "dilate": (1, ),
                             "pad": (0, ),
                             "num_filter": 64,
                             "layout": 'NCW'})
    ]

    # Conv2D: same scheme with 2D kernel/stride/dilate/pad
    results_2d = [
        entry
        for shape in [(32, 3, 256, 256), (32, 3, 64, 64)]
        for entry in _bench({"data": shape,
                             "weight": (64, 3, 3, 3),
                             "bias": (64, ),
                             "kernel": (3, 3),
                             "stride": (1, 1),
                             "dilate": (1, 1),
                             "pad": (0, 0),
                             "num_filter": 64,
                             "layout": 'NCHW'})
    ]

    return merge_map_list(results_1d + results_2d)
Beispiel #4
0
def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Runs Pooling (1D and 2D) and ROIPooling benchmarks.

    Pooling is run for every combination of pool type ('avg', 'max', 'sum')
    and global_pool flag (0, 1) against two 1D and two 2D shapes. ROIPooling
    is then run against two 4D shapes. All runs use run_backward=True.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Output of merge_map_list applied to the 1D pooling, 2D pooling and
    ROI pooling results, concatenated in that order.

    """
    def _bench(op, op_inputs):
        # Benchmark settings shared by every run in this function.
        return run_performance_test([op],
                                    run_backward=True,
                                    dtype=dtype,
                                    ctx=ctx,
                                    profiler=profiler,
                                    inputs=[op_inputs],
                                    warmup=warmup,
                                    runs=runs)

    results_1d = []
    results_2d = []
    for mode in ['avg', 'max', 'sum']:
        for use_global in [0, 1]:
            # 1D and 2D runs stay interleaved per (mode, use_global) pair.
            for shape in [(32, 3, 256), (32, 3, 64)]:
                results_1d.extend(_bench(MX_OP_MODULE.Pooling,
                                         {"data": shape,
                                          "kernel": 3,
                                          "pool_type": mode,
                                          "global_pool": use_global,
                                          "stride": 1,
                                          "pad": 1}))
            for shape in [(32, 3, 256, 256), (32, 3, 64, 64)]:
                results_2d.extend(_bench(MX_OP_MODULE.Pooling,
                                         {"data": shape,
                                          "kernel": (3, 3),
                                          "pool_type": mode,
                                          "global_pool": use_global,
                                          "stride": (1, 1),
                                          "pad": (0, 0)}))

    # ROI pooling runs happen only after all regular pooling runs are done
    results_roi = []
    for shape in [(32, 3, 256, 256), (32, 3, 64, 64)]:
        results_roi.extend(_bench(MX_OP_MODULE.ROIPooling,
                                  {"data": shape,
                                   "rois": (32, 5),
                                   "pooled_size": (2, 2),
                                   "spatial_scale": .5}))

    return merge_map_list(results_1d + results_2d + results_roi)
def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(),
                                                   profiler='native',
                                                   dtype='float32',
                                                   warmup=10,
                                                   runs=50):
    """Runs Deconvolution benchmarks on 1D and 2D inputs.

    Two 1D shapes (layout 'NCW') are run first, followed by two 2D shapes
    (layout 'NCHW'); every run uses 64 filters, a bias, and
    run_backward=True. Only the 1D inputs pass an explicit "adj" value.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 10
        Number of times to run for warmup
    runs: int, default 50
        Number of runs to capture benchmark results

    Returns
    -------
    Output of merge_map_list applied to the 1D results followed by the 2D results.

    """
    def _bench(op_inputs):
        # Benchmark settings common to the 1D and 2D cases.
        return run_performance_test([MX_OP_MODULE.Deconvolution],
                                    run_backward=True,
                                    dtype=dtype,
                                    ctx=ctx,
                                    profiler=profiler,
                                    inputs=[op_inputs],
                                    warmup=warmup,
                                    runs=runs)

    # Conv1DTranspose Benchmarks
    results_1d = []
    for shape in [(32, 3, 256), (32, 3, 64)]:
        results_1d.extend(_bench({"data": shape,
                                  "weight": (3, 64, 3),
                                  "bias": (64, ),
                                  "kernel": (3, ),
                                  "stride": (1, ),
                                  "dilate": (1, ),
                                  "pad": (0, ),
                                  "adj": (0, ),
                                  "num_filter": 64,
                                  "no_bias": False,
                                  "layout": 'NCW'}))

    # Conv2DTranspose Benchmarks
    results_2d = []
    for shape in [(32, 3, 256, 256), (32, 3, 64, 64)]:
        results_2d.extend(_bench({"data": shape,
                                  "weight": (3, 64, 3, 3),
                                  "bias": (64, ),
                                  "kernel": (3, 3),
                                  "stride": (1, 1),
                                  "dilate": (1, 1),
                                  "pad": (0, 0),
                                  "num_filter": 64,
                                  "no_bias": False,
                                  "layout": 'NCHW'}))

    return merge_map_list(results_1d + results_2d)
def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks for amp_multicast and for every operator returned by
    get_all_unary_operators(), using the given context, precision (dtype) and
    input data size (int64_tensor).

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # amp_multicast is benchmarked on its own because it takes its data as a
    # positional argument ("args"); pick the input set matching int64_tensor.
    if int64_tensor == 'on':
        amp_inputs = [{"args": [(2**32, 1)], "num_outputs": 1}]
    else:
        amp_inputs = [{"args": [(1024, 1024)], "num_outputs": 1},
                      {"args": [(10000, 1)], "num_outputs": 1}]

    amp_results = run_performance_test([MX_OP_MODULE.amp_multicast],
                                       run_backward=True,
                                       dtype=dtype,
                                       ctx=ctx,
                                       profiler=profiler,
                                       inputs=amp_inputs,
                                       warmup=warmup,
                                       runs=runs)

    # Remaining unary operators go through the generic benchmark runner
    unary_ops = get_all_unary_operators()
    unary_results = run_op_benchmarks(unary_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)

    return merge_map_list(amp_results + [unary_results])
Beispiel #7
0
def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
    """Runs benchmarks for the GEMM operators (dot, batch_dot) in MXNet with
    the given context and precision (dtype).

    Each operator is run on three input sets: plain operands, operands with
    transpose_b set, and operands with both transpose_a and transpose_b set.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # Settings shared by both operator runs
    shared_args = {"run_backward": True,
                   "dtype": dtype,
                   "ctx": ctx,
                   "warmup": warmup,
                   "runs": runs}

    # dot
    dot_inputs = [
        {"lhs": (1024, 1024), "rhs": (1024, 1024)},
        {"lhs": (1000, 10), "rhs": (1000, 10), "transpose_b": True},
        {"lhs": (1000, 1), "rhs": (100, 1000), "transpose_a": True, "transpose_b": True},
    ]
    dot_results = run_performance_test([MX_OP_MODULE.dot],
                                       inputs=dot_inputs,
                                       **shared_args)

    # batch_dot: same operand shapes with a leading batch dimension of 32
    batch_dot_inputs = [
        {"lhs": (32, 1024, 1024), "rhs": (32, 1024, 1024)},
        {"lhs": (32, 1000, 10), "rhs": (32, 1000, 10), "transpose_b": True},
        {"lhs": (32, 1000, 1), "rhs": (32, 100, 1000), "transpose_a": True, "transpose_b": True},
    ]
    batch_dot_results = run_performance_test([MX_OP_MODULE.batch_dot],
                                             inputs=batch_dot_inputs,
                                             **shared_args)

    # Combined results for GEMM operators
    return merge_map_list(dot_results + batch_dot_results)
def run_linalg_operators_benchmarks(ctx=mx.cpu(),
                                    dtype='float32',
                                    profiler='native',
                                    warmup=25,
                                    runs=100):
    """Runs benchmarks for linalg_potrf and for every operator returned by
    get_all_linalg_operators(), using the given context and precision (dtype).

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # linalg_potrf requires a positive definite matrix as input, so it is
    # benchmarked separately on two hand-picked matrices (forward only).
    identity_2x2 = [[1, 0], [0, 1]]
    tridiagonal_3x3 = [[2, -1, 0],
                       [-1, 2, -1],
                       [0, -1, 2]]
    # NOTE(review): the operator is passed bare here rather than wrapped in a
    # list - presumably run_performance_test accepts both; confirm.
    potrf_results = run_performance_test(MX_OP_MODULE.linalg_potrf,
                                         run_backward=False,
                                         dtype=dtype,
                                         ctx=ctx,
                                         profiler=profiler,
                                         inputs=[{"A": identity_2x2},
                                                 {"A": tridiagonal_3x3}],
                                         warmup=warmup,
                                         runs=runs)

    # Remaining linear algebra operators go through the generic runner
    linalg_ops = get_all_linalg_operators()
    linalg_results = run_op_benchmarks(linalg_ops, dtype, ctx,
                                       profiler, warmup, runs)

    return merge_map_list(potrf_results + [linalg_results])
def run_activation_operators_benchmarks(
        ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
    """Runs benchmarks for the activation operators LeakyReLU (leaky, elu,
    selu and prelu variants), hard_sigmoid, softmax and log_softmax with the
    given context and precision (dtype).

    Every operator variant is run on the same three data shapes.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 10
        Number of times to run for warmup
    runs: int, default 50
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    data_shapes = [(1024, 1024), (10000, 1), (10000, 100)]

    def _bench(ops, op_inputs):
        # Benchmark settings shared by every activation run.
        return run_performance_test(ops,
                                    run_backward=True,
                                    dtype=dtype,
                                    ctx=ctx,
                                    inputs=op_inputs,
                                    warmup=warmup,
                                    runs=runs)

    # Relu and its variations, grouped by act_type: leaky, elu, selu, prelu
    leaky_relu_inputs = [{"data": shape, "act_type": variant, "slope": 0.1}
                         for variant in ("leaky", "elu")
                         for shape in data_shapes]
    leaky_relu_inputs += [{"data": shape, "act_type": "selu"}
                          for shape in data_shapes]
    # prelu additionally takes a gamma of shape (1, <second data dimension>)
    leaky_relu_inputs += [{"data": shape, "act_type": "prelu", "gamma": (1, shape[1])}
                          for shape in data_shapes]
    relu_results = _bench([nd.LeakyReLU], leaky_relu_inputs)

    # Sigmoid => Covered as part of Unary ops
    # Hard_Sigmoid
    hard_sigmoid_inputs = [{"data": shape, "alpha": 0.25, "beta": 0.5}
                           for shape in data_shapes]
    hard_sigmoid_results = _bench([nd.hard_sigmoid], hard_sigmoid_inputs)

    # Softmax, LogSoftmax
    softmax_inputs = [{"data": shape, "axis": -1, "temperature": 0.5}
                      for shape in data_shapes]
    softmax_results = _bench([nd.softmax, nd.log_softmax], softmax_inputs)

    # Combined results
    return merge_map_list(relu_results + hard_sigmoid_results + softmax_results)
Beispiel #10
0
def run_join_split_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Benchmark the MXNet join & split operators: concat, split and stack.

    Each operator is looked up by name on ``MX_OP_MODULE`` and timed through
    ``run_performance_test``. Only the forward pass is measured
    (``run_backward=False``) for all three operators.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Not read by this function; the same inputs are used for any value.
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    def _random_triplet():
        # Three freshly sampled (100, 100) normal arrays keyed args0..args2.
        return {"args0": nd.random_normal(shape=(100, 100)),
                "args1": nd.random_normal(shape=(100, 100)),
                "args2": nd.random_normal(shape=(100, 100))}

    # Settings shared by every call below; backward is disabled for all ops.
    shared_kwargs = dict(run_backward=False,
                         dtype=dtype,
                         ctx=ctx,
                         profiler=profiler,
                         warmup=warmup,
                         runs=runs)

    # concat
    concat_res = run_performance_test([getattr(MX_OP_MODULE, "concat")],
                                      inputs=[_random_triplet()],
                                      **shared_kwargs)

    # split
    split_inputs = [
        {"data": (1024, 1024), "num_outputs": 2},
        {"data": (10000, 1), "num_outputs": 1},
        {"data": (10000, 100), "num_outputs": 10},
    ]
    split_res = run_performance_test([getattr(MX_OP_MODULE, "split")],
                                     inputs=split_inputs,
                                     **shared_kwargs)

    # stack (inputs are sampled anew, after the split benchmark has run)
    stack_res = run_performance_test([getattr(MX_OP_MODULE, "stack")],
                                     inputs=[_random_triplet()],
                                     **shared_kwargs)

    return merge_map_list(concat_res + split_res + stack_res)
def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the pooling
    operators in MXNet.

    Pooling is benchmarked for every combination of pool type
    ('avg', 'max', 'sum') and global_pool flag (0, 1) on 1D and 2D inputs.
    ROIPooling is benchmarked once per input shape, after the Pooling runs.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    pool_types = ['avg', 'max', 'sum']
    global_pool_types = [0, 1]

    standard_data_list_pool1d = [(32, 3, 256), (32, 3, 64)]
    int64_tensor_data_list_pool1d = [(1, 1, 2**32)]
    standard_data_list_pool2d = [(32, 3, 256, 256), (32, 3, 64, 64)]
    int64_tensor_data_list_pool2d = [(2**28, 1, 4, 4)]
    standard_data_list_roipool = [(32, 3, 256, 256), (32, 3, 64, 64)]
    int64_tensor_data_list_roipool = [(32, 3, 2**13, 2**13)]

    # Any value other than the exact string 'on' selects the standard shapes.
    if int64_tensor == 'on':
        data_list_pool1d = int64_tensor_data_list_pool1d
        data_list_pool2d = int64_tensor_data_list_pool2d
        data_list_roipool = int64_tensor_data_list_roipool
    else:
        data_list_pool1d = standard_data_list_pool1d
        data_list_pool2d = standard_data_list_pool2d
        data_list_roipool = standard_data_list_roipool

    # Run 1D and 2D Pooling performance runs
    pool1d_benchmark_res = []
    pool2d_benchmark_res = []
    for pool_type in pool_types:
        for global_pool in global_pool_types:
            for pool1d_data in data_list_pool1d:
                pool1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")],
                                                             run_backward=True,
                                                             dtype=dtype,
                                                             ctx=ctx,
                                                             profiler=profiler,
                                                             inputs=[{"data": pool1d_data,
                                                                      "kernel": 3,
                                                                      "pool_type": pool_type,
                                                                      "global_pool": global_pool,
                                                                      "stride": 1,
                                                                      "pad": 1}
                                                                    ],
                                                             warmup=warmup,
                                                             runs=runs)
            for pool2d_data in data_list_pool2d:
                pool2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")],
                                                             run_backward=True,
                                                             dtype=dtype,
                                                             ctx=ctx,
                                                             profiler=profiler,
                                                             inputs=[{"data": pool2d_data,
                                                                      "kernel": (3, 3),
                                                                      "pool_type": pool_type,
                                                                      "global_pool": global_pool,
                                                                      "stride": (1, 1),
                                                                      "pad": (0, 0)}
                                                                    ],
                                                             warmup=warmup,
                                                             runs=runs)

    # Run ROI Pooling performance runs.
    # The ROIPooling inputs use neither pool_type nor global_pool, so this
    # section runs once, outside the loops above. Previously it was nested in
    # them and its result list was reset on every iteration, which repeated
    # the same benchmark six times and kept only the last set of results.
    roipool_benchmark_res = []
    for roipool_data in data_list_roipool:
        roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")],
                                                      run_backward=True,
                                                      dtype=dtype,
                                                      ctx=ctx,
                                                      profiler=profiler,
                                                      inputs=[{"data": roipool_data,
                                                               "rois": (32, 5),
                                                               "pooled_size": (2, 2),
                                                               "spatial_scale": .5}
                                                             ],
                                                      warmup=warmup,
                                                      runs=runs)

    # Prepare combined results
    mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res + roipool_benchmark_res)
    return mx_pooling_op_results
def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', int64_tensor='off', dtype='float32', warmup=25, runs=100):
    """Benchmark the transpose convolution operator (``Deconvolution``) on 1D and 2D inputs.

    The operator is looked up on ``MX_OP_MODULE`` and timed, forward and
    backward, once per input shape via ``run_performance_test``.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # Pick (data shapes, weight shape, kernel) for the 1D and 2D cases.
    # Only the exact string 'on' selects the large-tensor configuration.
    if int64_tensor == 'on':
        shapes_1d, weight_1d, kernel_1d = [(2**30, 1, 4)], (1, 1, 1), (1,)
        shapes_2d, weight_2d, kernel_2d = [(2**28, 1, 4, 4)], (1, 1, 1, 1), (1, 1)
    else:
        shapes_1d, weight_1d, kernel_1d = [(32, 3, 256), (32, 3, 64)], (3, 1, 3), (3,)
        shapes_2d, weight_2d, kernel_2d = [(32, 3, 256, 256), (32, 3, 64, 64)], (3, 1, 3, 3), (3, 3)

    # Conv1DTranspose Benchmarks
    results_1d = []
    for data_shape in shapes_1d:
        inputs_1d = {"data": data_shape,
                     "weight": weight_1d,
                     "bias": (1,),
                     "kernel": kernel_1d,
                     "stride": (1,),
                     "dilate": (1,),
                     "pad": (0,),
                     "num_filter": 1,
                     "no_bias": False,
                     "layout": 'NCW'}
        results_1d.extend(
            run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")],
                                 run_backward=True,
                                 dtype=dtype,
                                 ctx=ctx,
                                 profiler=profiler,
                                 inputs=[inputs_1d],
                                 warmup=warmup,
                                 runs=runs))

    # Conv2DTranspose Benchmarks (no explicit "dilate" entry in this case)
    results_2d = []
    for data_shape in shapes_2d:
        inputs_2d = {"data": data_shape,
                     "weight": weight_2d,
                     "bias": (1,),
                     "kernel": kernel_2d,
                     "stride": (1, 1),
                     "pad": (0, 0),
                     "num_filter": 1,
                     "no_bias": False,
                     "layout": 'NCHW'}
        results_2d.extend(
            run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")],
                                 run_backward=True,
                                 dtype=dtype,
                                 ctx=ctx,
                                 profiler=profiler,
                                 inputs=[inputs_2d],
                                 warmup=warmup,
                                 runs=runs))

    # Prepare combined results
    return merge_map_list(results_1d + results_2d)
def run_nn_basic_operators_benchmarks(ctx=mx.cpu(),
                                      dtype='float32',
                                      profiler='native',
                                      warmup=25,
                                      runs=100):
    """Benchmark the FullyConnected, Dropout and BatchNorm operators.

    Each operator is looked up by name on ``MX_OP_MODULE`` and timed, forward
    and backward, through ``run_performance_test``.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    The result of ``merge_map_list`` applied to the concatenated
    FullyConnected, Dropout and BatchNorm result lists.

    """
    def _benchmark(op_name, input_sets):
        # Time one operator with the settings shared by every call here.
        return run_performance_test([getattr(MX_OP_MODULE, op_name)],
                                    run_backward=True,
                                    dtype=dtype,
                                    ctx=ctx,
                                    profiler=profiler,
                                    inputs=input_sets,
                                    warmup=warmup,
                                    runs=runs)

    # FullyConnected operator benchmarks: once with flatten=True and once
    # with flatten=False, each with a weight shape chosen for that mode.
    fully_connected_inputs = [
        {
            "data": (32, 3, 256, 256),
            "num_hidden": 64,
            "weight": (64, 3 * 256 * 256),
            "bias": (64, ),
            "flatten": True
        },
        {
            "data": (32, 3, 256, 256),
            "num_hidden": 64,
            "weight": (64, 256),
            "bias": (64, ),
            "flatten": False
        },
    ]
    fc_res = _benchmark("FullyConnected", fully_connected_inputs)

    # Dropout benchmarks
    dropout_inputs = [
        {"data": (32, 3, 256, 256), "p": 0.5, "mode": "always"},
        {"data": (10000, 10), "p": 0.5, "mode": "always"},
    ]
    dropout_res = _benchmark("Dropout", dropout_inputs)

    # BatchNorm benchmarks
    batchnorm_inputs = [
        {
            "data": (32, 3, 256, 256),
            "gamma": (3, ),
            "beta": (3, ),
            "moving_mean": (3, ),
            "moving_var": (3, )
        },
        {
            "data": (32, 3, 10000, 10),
            "gamma": (3, ),
            "beta": (3, ),
            "moving_mean": (3, ),
            "moving_var": (3, )
        },
    ]
    batchnorm_res = _benchmark("BatchNorm", batchnorm_inputs)

    # Prepare combined results
    return merge_map_list(fc_res + dropout_res + batchnorm_res)
def run_gemm_operators_benchmarks(ctx=mx.cpu(),
                                  dtype='float32',
                                  profiler='native',
                                  warmup=25,
                                  runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the GEMM
    operators (dot, batch_dot, khatri_rao) in MXNet.

    khatri_rao is skipped on any GPU context, in which case it contributes no
    entries to the merged results.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # Benchmark tests for dot operator
    dot_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "dot")],
                                             run_backward=True,
                                             dtype=dtype,
                                             ctx=ctx,
                                             inputs=[{
                                                 "lhs": (1024, 1024),
                                                 "rhs": (1024, 1024)
                                             }, {
                                                 "lhs": (1000, 10),
                                                 "rhs": (1000, 10),
                                                 "transpose_b": True
                                             }, {
                                                 "lhs": (1000, 1),
                                                 "rhs": (100, 1000),
                                                 "transpose_a": True,
                                                 "transpose_b": True
                                             }],
                                             warmup=warmup,
                                             runs=runs,
                                             profiler=profiler)
    # Benchmark tests for batch_dot operator
    batch_dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "batch_dot")],
        run_backward=True,
        dtype=dtype,
        ctx=ctx,
        inputs=[{
            "lhs": (32, 1024, 1024),
            "rhs": (32, 1024, 1024)
        }, {
            "lhs": (32, 1000, 10),
            "rhs": (32, 1000, 10),
            "transpose_b": True
        }, {
            "lhs": (32, 1000, 1),
            "rhs": (32, 100, 1000),
            "transpose_a": True,
            "transpose_b": True
        }],
        warmup=warmup,
        runs=runs,
        profiler=profiler)
    # Operator khatri_rao is not yet implemented for GPU.
    # Check the device type rather than comparing against mx.gpu(): that
    # comparison only matches GPU device 0, so contexts such as mx.gpu(1)
    # would still attempt to run khatri_rao on a GPU.
    khatri_rao_benchmark_res = []
    if ctx.device_type != 'gpu':
        # Benchmark tests for khatri_rao operator (forward pass only)
        khatri_rao_benchmark_res = run_performance_test(
            [getattr(MX_OP_MODULE, "khatri_rao")],
            run_backward=False,
            dtype=dtype,
            ctx=ctx,
            inputs=[{
                "args": [(32, 32), (32, 32)]
            }, {
                "args": [(64, 64), (64, 64)]
            }],
            warmup=warmup,
            runs=runs,
            profiler=profiler)

    # Prepare combined results for GEMM operators
    mx_gemm_op_results = merge_map_list(dot_benchmark_res +
                                        batch_dot_benchmark_res +
                                        khatri_rao_benchmark_res)
    return mx_gemm_op_results
Beispiel #15
0
def run_gemm_operators_benchmarks(ctx=mx.cpu(),
                                  dtype='float32',
                                  profiler='native',
                                  int64_tensor='off',
                                  warmup=25,
                                  runs=100):
    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the GEMM
    operators (dot, batch_dot, khatri_rao) in MXNet.

    khatri_rao is skipped on any GPU context, in which case it contributes no
    entries to the merged results.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    standard_inputs_dot = [{
        "lhs": (1024, 1024),
        "rhs": (1024, 1024)
    }, {
        "lhs": (1000, 10),
        "rhs": (1000, 10),
        "transpose_b": True
    }, {
        "lhs": (1000, 1),
        "rhs": (100, 1000),
        "transpose_a": True,
        "transpose_b": True
    }]
    int64_tensor_inputs_dot = [{
        "lhs": (2**16, 2**16),
        "rhs": (2**16, 2**16)
    }, {
        "lhs": (4, 2**30),
        "rhs": (4, 2**30),
        "transpose_b": True
    }, {
        "lhs": (2**28, 16),
        "rhs": (16, 2**28),
        "transpose_a": True,
        "transpose_b": True
    }]
    standard_inputs_batch_dot = [{
        "lhs": (32, 1024, 1024),
        "rhs": (32, 1024, 1024)
    }, {
        "lhs": (32, 1000, 10),
        "rhs": (32, 1000, 10),
        "transpose_b": True
    }, {
        "lhs": (32, 1000, 1),
        "rhs": (32, 100, 1000),
        "transpose_a": True,
        "transpose_b": True
    }]
    int64_tensor_inputs_batch_dot = [{
        "lhs": (1, 2**16, 2**16),
        "rhs": (1, 2**16, 2**16)
    }, {
        "lhs": (1, 4, 2**30),
        "rhs": (1, 4, 2**30),
        "transpose_b": True
    }, {
        "lhs": (1, 2**28, 16),
        "rhs": (1, 16, 2**28),
        "transpose_a": True,
        "transpose_b": True
    }]
    standard_inputs_khatri_rao = [{
        "args": [(32, 32), (32, 32)]
    }, {
        "args": [(64, 64), (64, 64)]
    }]
    int64_tensor_inputs_khatri_rao = [{"args": [(2**32, 1), (2**32, 1)]}]

    # Any value other than the exact string 'on' selects the standard inputs.
    if int64_tensor == 'on':
        inputs_dot = int64_tensor_inputs_dot
        inputs_batch_dot = int64_tensor_inputs_batch_dot
        inputs_khatri_rao = int64_tensor_inputs_khatri_rao
    else:
        inputs_dot = standard_inputs_dot
        inputs_batch_dot = standard_inputs_batch_dot
        inputs_khatri_rao = standard_inputs_khatri_rao

    # Benchmark tests for dot and batch_dot operators
    dot_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "dot")],
                                             run_backward=True,
                                             dtype=dtype,
                                             ctx=ctx,
                                             inputs=inputs_dot,
                                             warmup=warmup,
                                             runs=runs,
                                             profiler=profiler)

    batch_dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "batch_dot")],
        run_backward=True,
        dtype=dtype,
        ctx=ctx,
        inputs=inputs_batch_dot,
        warmup=warmup,
        runs=runs,
        profiler=profiler)
    # Operator khatri_rao is not yet implemented for GPU.
    # Check the device type rather than comparing against mx.gpu(): that
    # comparison only matches GPU device 0, so contexts such as mx.gpu(1)
    # would still attempt to run khatri_rao on a GPU.
    khatri_rao_benchmark_res = []
    if ctx.device_type != 'gpu':
        # Benchmark tests for khatri_rao operator (forward pass only)
        khatri_rao_benchmark_res = run_performance_test(
            [getattr(MX_OP_MODULE, "khatri_rao")],
            run_backward=False,
            dtype=dtype,
            ctx=ctx,
            inputs=inputs_khatri_rao,
            warmup=warmup,
            runs=runs,
            profiler=profiler)

    # Prepare combined results for GEMM operators
    mx_gemm_op_results = merge_map_list(dot_benchmark_res +
                                        batch_dot_benchmark_res +
                                        khatri_rao_benchmark_res)
    return mx_gemm_op_results
Beispiel #16
0
def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(),
                                      dtype='float32',
                                      profiler='native',
                                      int64_tensor='off',
                                      warmup=25,
                                      runs=100):
    """Run all the MXNet operators (NDArray) benchmarks.

    Returns
    -------
    Dictionary of benchmark results.
    """
    mxnet_operator_benchmark_results = []

    # *************************MXNET TENSOR OPERATOR BENCHMARKS*****************************

    # Run all Unary operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_unary_operators_benchmarks(ctx=ctx,
                                          dtype=dtype,
                                          profiler=profiler,
                                          int64_tensor=int64_tensor,
                                          warmup=warmup,
                                          runs=runs))

    # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_binary_broadcast_operators_benchmarks(ctx=ctx,
                                                     dtype=dtype,
                                                     profiler=profiler,
                                                     int64_tensor=int64_tensor,
                                                     warmup=warmup,
                                                     runs=runs))
    mxnet_operator_benchmark_results.append(
        run_mx_binary_element_wise_operators_benchmarks(
            ctx=ctx,
            dtype=dtype,
            profiler=profiler,
            int64_tensor=int64_tensor,
            warmup=warmup,
            runs=runs))

    mxnet_operator_benchmark_results.append(
        run_mx_binary_misc_operators_benchmarks(ctx=ctx,
                                                dtype=dtype,
                                                profiler=profiler,
                                                int64_tensor=int64_tensor,
                                                warmup=warmup,
                                                runs=runs))

    # Run all GEMM operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_gemm_operators_benchmarks(ctx=ctx,
                                      dtype=dtype,
                                      profiler=profiler,
                                      int64_tensor=int64_tensor,
                                      warmup=warmup,
                                      runs=runs))

    # Run all Random sampling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_random_sampling_operators_benchmarks(ctx=ctx,
                                                    dtype=dtype,
                                                    profiler=profiler,
                                                    int64_tensor=int64_tensor,
                                                    warmup=warmup,
                                                    runs=runs))

    # Run all Reduction operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_reduction_operators_benchmarks(ctx=ctx,
                                              dtype=dtype,
                                              profiler=profiler,
                                              int64_tensor=int64_tensor,
                                              warmup=warmup,
                                              runs=runs))

    # Run all Sorting and Searching operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_sorting_searching_operators_benchmarks(ctx=ctx,
                                                   dtype=dtype,
                                                   profiler=profiler,
                                                   int64_tensor=int64_tensor,
                                                   warmup=warmup,
                                                   runs=runs))

    # Run all Indexing routines benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_indexing_routines_benchmarks(ctx=ctx,
                                         dtype=dtype,
                                         profiler=profiler,
                                         int64_tensor=int64_tensor,
                                         warmup=warmup,
                                         runs=runs))

    # Run all Array Rearrange operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_rearrange_operators_benchmarks(ctx=ctx,
                                           dtype=dtype,
                                           profiler=profiler,
                                           int64_tensor=int64_tensor,
                                           warmup=warmup,
                                           runs=runs))

    # Run all Array Shape Manipulation operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_shape_operators_benchmarks(ctx=ctx,
                                       dtype=dtype,
                                       profiler=profiler,
                                       int64_tensor=int64_tensor,
                                       warmup=warmup,
                                       runs=runs))

    # Run all Array Expansion operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_expanding_operators_benchmarks(ctx=ctx,
                                           dtype=dtype,
                                           profiler=profiler,
                                           int64_tensor=int64_tensor,
                                           warmup=warmup,
                                           runs=runs))

    # Run all Array Rounding operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_rounding_operators_benchmarks(ctx=ctx,
                                          dtype=dtype,
                                          profiler=profiler,
                                          int64_tensor=int64_tensor,
                                          warmup=warmup,
                                          runs=runs))

    # Run all Array Join & Split operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_join_split_operators_benchmarks(ctx=ctx,
                                            dtype=dtype,
                                            profiler=profiler,
                                            int64_tensor=int64_tensor,
                                            warmup=warmup,
                                            runs=runs))

    # ************************ MXNET NN OPERATOR BENCHMARKS ****************************

    # Run all basic NN operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_nn_basic_operators_benchmarks(ctx=ctx,
                                          dtype=dtype,
                                          profiler=profiler,
                                          int64_tensor=int64_tensor,
                                          warmup=warmup,
                                          runs=runs))

    # Run all Activation operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_activation_operators_benchmarks(ctx=ctx,
                                            dtype=dtype,
                                            profiler=profiler,
                                            int64_tensor=int64_tensor,
                                            warmup=warmup,
                                            runs=runs))

    # Run all Pooling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_pooling_operators_benchmarks(ctx=ctx,
                                         dtype=dtype,
                                         profiler=profiler,
                                         int64_tensor=int64_tensor,
                                         warmup=warmup,
                                         runs=runs))

    # Run all Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_convolution_operators_benchmarks(ctx=ctx,
                                             dtype=dtype,
                                             profiler=profiler,
                                             int64_tensor=int64_tensor,
                                             warmup=warmup,
                                             runs=runs))

    # Run all Optimizer operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_optimizer_operators_benchmarks(ctx=ctx,
                                           dtype=dtype,
                                           profiler=profiler,
                                           int64_tensor=int64_tensor,
                                           warmup=warmup,
                                           runs=runs))

    # Run all Transpose Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_transpose_convolution_operators_benchmarks(
            ctx=ctx,
            dtype=dtype,
            profiler=profiler,
            int64_tensor=int64_tensor,
            warmup=warmup,
            runs=runs))

    # Run all NN loss operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_loss_operators_benchmarks(ctx=ctx,
                                      dtype=dtype,
                                      profiler=profiler,
                                      int64_tensor=int64_tensor,
                                      warmup=warmup,
                                      runs=runs))

    # Run all Miscellaneous operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_misc_operators_benchmarks(ctx=ctx,
                                         dtype=dtype,
                                         profiler=profiler,
                                         int64_tensor=int64_tensor,
                                         warmup=warmup,
                                         runs=runs))

    # Linear Algebra operators do not work with int64 tensor data. Issue tracked here: https://github.com/apache/incubator-mxnet/issues/17716
    if int64_tensor == 'off':
        # Run all Linear Algebra operations benchmarks with default input values
        mxnet_operator_benchmark_results.append(
            run_linalg_operators_benchmarks(ctx=ctx,
                                            dtype=dtype,
                                            profiler=profiler,
                                            int64_tensor=int64_tensor,
                                            warmup=warmup,
                                            runs=runs))

    # ****************************** PREPARE FINAL RESULTS ********************************
    final_benchmark_result_map = merge_map_list(
        mxnet_operator_benchmark_results)
    return final_benchmark_result_map
def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Benchmark the miscellaneous MXNet operators for a given context and precision (dtype).

    A handful of operators (reset_arrays, multi_all_finite, multi_sum_sq, add_n,
    UpSampling and Custom) are benchmarked with hand-written inputs; the remaining
    miscellaneous operators are fetched and benchmarked through run_op_benchmarks.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # Settings forwarded unchanged to every hand-written run_performance_test call
    shared_settings = dict(dtype=dtype, ctx=ctx, profiler=profiler, warmup=warmup, runs=runs)
    # Array shapes reused by all the ops that take positional ("args") inputs
    array_shapes = [(1024, 1024), (10000, 1), (10000, 10)]

    # Individual tests for ops with positional args
    array_ops = [getattr(MX_OP_MODULE, op_name)
                 for op_name in ("reset_arrays", "multi_all_finite", "multi_sum_sq")]
    array_ops_benchmark = run_performance_test(
        array_ops,
        run_backward=False,
        inputs=[{"args": [shape], "num_arrays": 1} for shape in array_shapes],
        **shared_settings)

    add_n_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "add_n")],
        run_backward=True,
        inputs=[{"args": [shape]} for shape in array_shapes],
        **shared_settings)

    # Only "nearest" sampling is benchmarked: there are currently issues with
    # UpSampling with bilinear interpolation.
    # track issue here: https://github.com/apache/incubator-mxnet/issues/9138
    upsampling_cases = (((32, 3, 256, 256), 2), ((32, 3, 10000, 1), 4))
    upsampling_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "UpSampling")],
        run_backward=True,
        inputs=[{"args": shape, "scale": scale, "sample_type": "nearest"}
                for shape, scale in upsampling_cases],
        **shared_settings)

    # Create and register CustomAddOne operator for use in Custom op testing
    custom_prop = CustomAddOneProp()
    custom_prop.create_operator(ctx, [(1024, 1024)], [dtype])
    custom_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "Custom")],
        run_backward=True,
        inputs=[{"args": [shape], "op_type": "CustomAddOne"} for shape in array_shapes],
        **shared_settings)

    # Fetch the remaining Miscellaneous Operators and benchmark them
    remaining_ops = get_remaining_miscellaneous_operators()
    mx_misc_op_results = run_op_benchmarks(remaining_ops, dtype, ctx, profiler, warmup, runs)

    all_results = array_ops_benchmark + add_n_benchmark + upsampling_benchmark + custom_benchmark
    return merge_map_list(all_results + [mx_misc_op_results])
Beispiel #18
0
def run_nn_basic_operators_benchmarks(ctx=mx.cpu(),
                                      dtype='float32',
                                      profiler='native',
                                      int64_tensor='off',
                                      warmup=25,
                                      runs=100):
    """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the basic neural network
    operators in MXNet.

    The RNN operator is benchmarked explicitly in four modes (rnn_relu, rnn_tanh,
    lstm, gru) for every data shape in the selected data list; the remaining basic
    NN operators are fetched and benchmarked through run_op_benchmarks.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests; when 'on' the RNN data shape is
        (2**28, 4, 4) instead of (1024, 4, 4)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """

    standard_data_list = [(1024, 4, 4)]
    int64_tensor_data_list = [(2**28, 4, 4)]

    if int64_tensor == 'on':
        data_list = int64_tensor_data_list
    else:
        data_list = standard_data_list

    # One entry per RNN mode: (mode, shape passed as "parameters", needs "state_cell" input)
    rnn_configs = [
        ("rnn_relu", (7, ), False),
        ("rnn_tanh", (7, ), False),
        ("lstm", (28, ), True),
        ("gru", (21, ), False),
    ]

    # Accumulate instead of rebinding per iteration, so results for every data
    # shape are kept and an empty data_list does not leave names undefined.
    rnn_benchmark_results = []
    for data in data_list:
        for mode, parameters_shape, needs_state_cell in rnn_configs:
            # Keys are inserted in the same order as the original literals
            rnn_inputs = {
                "data": data,
                "parameters": parameters_shape,
                "state": (1, 4, 1),
            }
            if needs_state_cell:
                rnn_inputs["state_cell"] = (1, 4, 1)
            rnn_inputs["mode"] = mode
            rnn_inputs["state_size"] = 1
            rnn_inputs["num_layers"] = 1

            rnn_benchmark_results += run_performance_test(
                [getattr(MX_OP_MODULE, "RNN")],
                run_backward=True,
                dtype=dtype,
                ctx=ctx,
                profiler=profiler,
                inputs=[rnn_inputs],
                warmup=warmup,
                runs=runs)

    # Fetch all NN Basic Operators
    mx_nn_basic_ops = get_all_nn_basic_operators()

    # Run benchmarks
    mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx,
                                               profiler, int64_tensor, warmup,
                                               runs)
    return merge_map_list(rnn_benchmark_results + [mx_nn_basic_op_results])
def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the neural network
    optimizer update operators in MXNet.

    The multi_* and preloaded_multi_* update operators are benchmarked with
    hand-written NDArray inputs; the remaining optimizer operators are fetched
    and benchmarked through run_op_benchmarks.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.

    """
    # Run independent tests for ops that need specific input data.
    # NOTE(review): these calls do not forward ctx/dtype/profiler/warmup/runs, so
    # they run with run_performance_test's own defaults - confirm this is intended.
    multi_mp_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args2": nd.random_normal(shape=(5,5)),
                 "args3": nd.random_normal(shape=(5,5)),
                 "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    multi_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args2": nd.random_normal(shape=(5,5)),
                 "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    multi_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    multi_mp_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_mp_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args2": nd.random_normal(shape=(5,5)),
                 "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    preloaded_multi_mp_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args2": nd.random_normal(shape=(5,5)),
                 "args3": nd.random_normal(shape=(1)),
                 "args4": nd.random_normal(shape=(1)),
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    preloaded_multi_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args2": nd.random_normal(shape=(5,5)),
                 "args3": nd.random_normal(shape=(1)),
                 "args4": nd.random_normal(shape=(1)),
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    preloaded_multi_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args4": nd.random_normal(shape=(1)),
                 "args5": nd.random_normal(shape=(1)),
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    preloaded_multi_mp_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5,5)),
                 "args1": nd.random_normal(shape=(5,5)),
                 "args2": nd.random_normal(shape=(5,5)),
                 "args3": nd.random_normal(shape=(5,5)),
                 "args4": nd.random_normal(shape=(1)),
                 "args5": nd.random_normal(shape=(1)),
                 "out": nd.random_normal(shape=(5,5))}],
        run_backward=False)

    # Fetch remaining optimizer operators
    mx_optimizer_ops = get_all_optimizer_operators()

    # Run benchmarks
    mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs)

    # Every explicitly benchmarked operator contributes exactly once. The merge
    # previously listed multi_sgd_mom_res and preloaded_multi_mp_sgd_res twice and
    # dropped multi_mp_sgd_mom_res and preloaded_multi_sgd_res altogether.
    return merge_map_list(multi_mp_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res +
                          preloaded_multi_mp_sgd_res + preloaded_multi_sgd_mom_res + preloaded_multi_sgd_res +
                          preloaded_multi_mp_sgd_mom_res + [mx_optimizer_op_results])
Beispiel #20
0
def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(),
                                      dtype='float32',
                                      profiler='native',
                                      warmup=25,
                                      runs=100):
    """Run every MXNet operator (NDArray) benchmark suite and merge the results.

    Each suite is invoked with the same ctx, dtype, profiler, warmup and runs
    keyword arguments, in the order listed below.

    Returns
    -------
    Dictionary of benchmark results.
    """
    # Suites are executed strictly in this order, each with default input values.
    benchmark_suites = (
        # ------------------- MXNET TENSOR OPERATOR BENCHMARKS -------------------
        # Unary operations
        run_mx_unary_operators_benchmarks,
        # Binary Broadcast, element_wise, and miscellaneous operations
        run_mx_binary_broadcast_operators_benchmarks,
        run_mx_binary_element_wise_operators_benchmarks,
        run_mx_binary_misc_operators_benchmarks,
        # GEMM operations
        run_gemm_operators_benchmarks,
        # Random sampling operations
        run_mx_random_sampling_operators_benchmarks,
        # Reduction operations
        run_mx_reduction_operators_benchmarks,
        # Sorting and Searching operations
        run_sorting_searching_operators_benchmarks,
        # Array Rearrange operations
        run_rearrange_operators_benchmarks,
        # Indexing routines
        run_indexing_routines_benchmarks,
        # --------------------- MXNET NN OPERATOR BENCHMARKS ---------------------
        # Basic NN operations
        run_nn_basic_operators_benchmarks,
        # Activation operations
        run_activation_operators_benchmarks,
        # Pooling operations
        run_pooling_operators_benchmarks,
        # Convolution operations
        run_convolution_operators_benchmarks,
        # Optimizer operations
        run_optimizer_operators_benchmarks,
        # Transpose Convolution operations
        run_transpose_convolution_operators_benchmarks,
        # NN loss operations
        run_loss_operators_benchmarks,
        # Miscellaneous operations
        run_mx_misc_operators_benchmarks,
        # Linear Algebra operations
        run_linalg_operators_benchmarks,
    )

    # Identical keyword arguments are handed to every suite
    suite_kwargs = dict(ctx=ctx,
                        dtype=dtype,
                        profiler=profiler,
                        warmup=warmup,
                        runs=runs)

    mxnet_operator_benchmark_results = []
    for run_suite in benchmark_suites:
        mxnet_operator_benchmark_results.append(run_suite(**suite_kwargs))

    # ------------------------- PREPARE FINAL RESULTS -------------------------
    return merge_map_list(mxnet_operator_benchmark_results)