# Imports assumed by the benchmark functions below (module paths follow the
# MXNet opperf utilities; adjust them to match your checkout):
import mxnet as mx
from mxnet import nd
from benchmark.opperf.utils.benchmark_utils import run_performance_test, run_op_benchmarks
from benchmark.opperf.utils.common_utils import merge_map_list
from benchmark.opperf.utils.op_registry_utils import get_all_unary_operators, \
    get_all_linalg_operators, get_all_nn_basic_operators, \
    get_all_optimizer_operators, get_remaining_miscellaneous_operators
from benchmark.opperf.rules.default_params import MX_OP_MODULE


def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
    pool_types = ['avg', 'max', 'sum']
    global_pool_types = [0, 1]

    # Run 1D and 2D Pooling performance runs
    pool1d_benchmark_res = []
    pool2d_benchmark_res = []
    for pool_type in pool_types:
        for global_pool in global_pool_types:
            for pool1d_data in [(32, 3, 256), (32, 3, 64)]:
                pool1d_benchmark_res += run_performance_test(
                    [nd.Pooling], run_backward=True, dtype=dtype, ctx=ctx,
                    inputs=[{"data": pool1d_data, "kernel": 3, "pool_type": pool_type,
                             "global_pool": global_pool, "stride": 1, "pad": 1,
                             "layout": 'NCW'}],
                    warmup=warmup, runs=runs)
            for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
                pool2d_benchmark_res += run_performance_test(
                    [nd.Pooling], run_backward=True, dtype=dtype, ctx=ctx,
                    inputs=[{"data": pool2d_data, "kernel": (3, 3), "pool_type": pool_type,
                             "global_pool": global_pool, "stride": (1, 1), "pad": (0, 0),
                             "layout": 'NCHW'}],
                    warmup=warmup, runs=runs)
    # Prepare combined results
    mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res)
    return mx_pooling_op_results
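# merge_map_list comes from the opperf common utilities; the runners above rely
# only on it folding a list of {op_name: results} maps into a single map. A
# minimal stand-in with that behavior (an illustrative assumption, not the
# library's implementation) could look like:
def merge_map_list_sketch(map_list):
    """Fold a list of dicts into one dict. Operator names are unique across
    the maps produced here, so key-clash precedence does not matter."""
    merged = {}
    for result_map in map_list:
        merged.update(result_map)
    return merged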
def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32'):
    """Run all the MXNet operators (NDArray) benchmarks.

    Returns
    -------
    Dictionary of benchmark results.
    """
    mxnet_operator_benchmark_results = []

    # *************************MXNET TENSOR OPERATOR BENCHMARKS*****************************

    # Run all Unary operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Binary Broadcast, element_wise operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, dtype=dtype))
    mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all GEMM operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Random sampling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Reduction operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Sorting and Searching operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Array Rearrange operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype))

    # ************************ MXNET NN OPERATOR BENCHMARKS ****************************

    # Run all basic NN operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Activation operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Pooling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Optimizer operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype))

    # Run all Transpose Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype))

    # ****************************** PREPARE FINAL RESULTS ********************************
    final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results)
    return final_benchmark_result_map
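# A minimal usage sketch (illustration only; assumes this module's imports,
# e.g. `import mxnet as mx`, are in scope). The returned value is a single map
# keyed by operator name, as described in the docstring above.
results = run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32')
for op_name, op_result in results.items():
    print(op_name, op_result)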
def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
    # Conv1D Benchmarks
    conv1d_benchmark_res = []
    for conv_data in [(32, 3, 256), (32, 3, 64)]:
        conv1d_benchmark_res += run_performance_test(
            [nd.Convolution], run_backward=True, dtype=dtype, ctx=ctx,
            inputs=[{"data": conv_data, "weight": (64, 3, 3), "bias": (64,),
                     "kernel": (3,), "stride": (1,), "dilate": (1,), "pad": (0,),
                     "num_filter": 64, "layout": 'NCW'}],
            warmup=warmup, runs=runs)

    # Conv2D Benchmarks
    conv2d_benchmark_res = []
    for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
        conv2d_benchmark_res += run_performance_test(
            [nd.Convolution], run_backward=True, dtype=dtype, ctx=ctx,
            inputs=[{"data": conv_data, "weight": (64, 3, 3, 3), "bias": (64,),
                     "kernel": (3, 3), "stride": (1, 1), "dilate": (1, 1), "pad": (0, 0),
                     "num_filter": 64, "layout": 'NCHW'}],
            warmup=warmup, runs=runs)

    # Prepare combined results
    mx_conv_op_results = merge_map_list(conv1d_benchmark_res + conv2d_benchmark_res)
    return mx_conv_op_results
def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    pool_types = ['avg', 'max', 'sum']
    global_pool_types = [0, 1]

    # Run 1D and 2D Pooling performance runs
    pool1d_benchmark_res = []
    pool2d_benchmark_res = []
    for pool_type in pool_types:
        for global_pool in global_pool_types:
            for pool1d_data in [(32, 3, 256), (32, 3, 64)]:
                pool1d_benchmark_res += run_performance_test(
                    [getattr(MX_OP_MODULE, "Pooling")],
                    run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
                    inputs=[{"data": pool1d_data, "kernel": 3, "pool_type": pool_type,
                             "global_pool": global_pool, "stride": 1, "pad": 1}],
                    warmup=warmup, runs=runs)
            for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
                pool2d_benchmark_res += run_performance_test(
                    [getattr(MX_OP_MODULE, "Pooling")],
                    run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
                    inputs=[{"data": pool2d_data, "kernel": (3, 3), "pool_type": pool_type,
                             "global_pool": global_pool, "stride": (1, 1), "pad": (0, 0)}],
                    warmup=warmup, runs=runs)

    # Run ROI Pooling performance runs
    roipool_benchmark_res = []
    for roipool_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
        roipool_benchmark_res += run_performance_test(
            [getattr(MX_OP_MODULE, "ROIPooling")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": roipool_data, "rois": (32, 5), "pooled_size": (2, 2),
                     "spatial_scale": .5}],
            warmup=warmup, runs=runs)

    # Prepare combined results
    mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res + roipool_benchmark_res)
    return mx_pooling_op_results
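# Note on the ROIPooling inputs above: each row of the (32, 5) "rois" array is
# [batch_index, x1, y1, x2, y2] in input-image coordinates, and spatial_scale
# maps those coordinates onto the feature map. A tiny illustrative call
# (assumes `from mxnet import nd` is in scope):
data = nd.arange(16).reshape((1, 1, 4, 4))
rois = nd.array([[0, 0, 0, 3, 3]])  # one ROI covering the whole 4x4 map
print(nd.ROIPooling(data=data, rois=rois, pooled_size=(2, 2), spatial_scale=1.0))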
def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', dtype='float32', warmup=10, runs=50):
    # Conv1DTranspose Benchmarks
    conv1d_transpose_benchmark_res = []
    for conv_data in [(32, 3, 256), (32, 3, 64)]:
        conv1d_transpose_benchmark_res += run_performance_test(
            [getattr(MX_OP_MODULE, "Deconvolution")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": conv_data, "weight": (3, 64, 3), "bias": (64,),
                     "kernel": (3,), "stride": (1,), "dilate": (1,), "pad": (0,),
                     "adj": (0,), "num_filter": 64, "no_bias": False, "layout": 'NCW'}],
            warmup=warmup, runs=runs)

    # Conv2DTranspose Benchmarks
    conv2d_transpose_benchmark_res = []
    for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
        conv2d_transpose_benchmark_res += run_performance_test(
            [getattr(MX_OP_MODULE, "Deconvolution")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": conv_data, "weight": (3, 64, 3, 3), "bias": (64,),
                     "kernel": (3, 3), "stride": (1, 1), "dilate": (1, 1), "pad": (0, 0),
                     "num_filter": 64, "no_bias": False, "layout": 'NCHW'}],
            warmup=warmup, runs=runs)

    # Prepare combined results
    mx_transpose_conv_op_results = merge_map_list(conv1d_transpose_benchmark_res + conv2d_transpose_benchmark_res)
    return mx_transpose_conv_op_results
def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks with the given context, precision (dtype), and input data size
    (int64_tensor) for all the unary operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    standard_inputs = [{"args": [(1024, 1024)], "num_outputs": 1},
                       {"args": [(10000, 1)], "num_outputs": 1}]
    int64_tensor_inputs = [{"args": [(2**32, 1)], "num_outputs": 1}]

    if int64_tensor == 'on':
        inputs = int64_tensor_inputs
    else:
        inputs = standard_inputs

    # Run amp_multicast as it needs data as positional argument
    amp_multicast_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "amp_multicast")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=inputs, warmup=warmup, runs=runs)

    # Fetch all Unary Operators
    mx_unary_broadcast_ops = get_all_unary_operators()

    # Run benchmarks
    mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
    return merge_map_list(amp_multicast_benchmark + [mx_unary_op_results])
def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the
    GEMM operators (dot, batch_dot) in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Benchmark tests for dot and batch_dot operators
    dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "dot")], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"lhs": (1024, 1024), "rhs": (1024, 1024)},
                {"lhs": (1000, 10), "rhs": (1000, 10), "transpose_b": True},
                {"lhs": (1000, 1), "rhs": (100, 1000), "transpose_a": True, "transpose_b": True}],
        warmup=warmup, runs=runs)

    batch_dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"lhs": (32, 1024, 1024), "rhs": (32, 1024, 1024)},
                {"lhs": (32, 1000, 10), "rhs": (32, 1000, 10), "transpose_b": True},
                {"lhs": (32, 1000, 1), "rhs": (32, 100, 1000), "transpose_a": True, "transpose_b": True}],
        warmup=warmup, runs=runs)

    # Prepare combined results for GEMM operators
    mx_gemm_op_results = merge_map_list(dot_benchmark_res + batch_dot_benchmark_res)
    return mx_gemm_op_results
def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the
    linear algebra operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Individual tests for ops with specific requirements on input data
    # linalg_potrf requires a positive definite matrix as input
    linalg_potrf_benchmark = run_performance_test(
        getattr(MX_OP_MODULE, "linalg_potrf"),
        run_backward=False, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"A": [[1, 0], [0, 1]]},
                {"A": [[2, -1, 0], [-1, 2, -1], [0, -1, 2]]}],
        warmup=warmup, runs=runs)

    # Fetch all Linear Algebra Operators
    mx_linalg_ops = get_all_linalg_operators()

    # Run benchmarks
    mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, warmup, runs)
    return merge_map_list(linalg_potrf_benchmark + [mx_linalg_op_results])
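# Why those two matrices: linalg_potrf computes a Cholesky factorization, which
# exists only for symmetric positive definite input. A quick sanity check of
# the inputs above (illustration only, via NumPy):
import numpy as np

for A in ([[1, 0], [0, 1]],
          [[2, -1, 0], [-1, 2, -1], [0, -1, 2]]):
    np.linalg.cholesky(np.array(A, dtype='float64'))  # raises LinAlgError if not positive definite
print("both linalg_potrf inputs are symmetric positive definite")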
def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
    """Runs benchmarks with the given context and precision (dtype) for all the
    activation operators (relu, sigmoid, softmax) in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    warmup: int, default 10
        Number of times to run for warmup
    runs: int, default 50
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # ReLU and its variations
    relu_benchmark_res = run_performance_test(
        [nd.LeakyReLU], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"data": (1024, 1024), "act_type": "leaky", "slope": 0.1},
                {"data": (10000, 1), "act_type": "leaky", "slope": 0.1},
                {"data": (10000, 100), "act_type": "leaky", "slope": 0.1},
                {"data": (1024, 1024), "act_type": "elu", "slope": 0.1},
                {"data": (10000, 1), "act_type": "elu", "slope": 0.1},
                {"data": (10000, 100), "act_type": "elu", "slope": 0.1},
                {"data": (1024, 1024), "act_type": "selu"},
                {"data": (10000, 1), "act_type": "selu"},
                {"data": (10000, 100), "act_type": "selu"},
                {"data": (1024, 1024), "act_type": "prelu", "gamma": (1, 1024)},
                {"data": (10000, 1), "act_type": "prelu", "gamma": (1, 1)},
                {"data": (10000, 100), "act_type": "prelu", "gamma": (1, 100)}],
        warmup=warmup, runs=runs)

    # Sigmoid => Covered as part of Unary ops
    # Hard Sigmoid
    hard_sigmoid_benchmark_res = run_performance_test(
        [nd.hard_sigmoid], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"data": (1024, 1024), "alpha": 0.25, "beta": 0.5},
                {"data": (10000, 1), "alpha": 0.25, "beta": 0.5},
                {"data": (10000, 100), "alpha": 0.25, "beta": 0.5}],
        warmup=warmup, runs=runs)

    # Softmax, LogSoftmax
    softmax_benchmark_res = run_performance_test(
        [nd.softmax, nd.log_softmax], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"data": (1024, 1024), "axis": -1, "temperature": 0.5},
                {"data": (10000, 1), "axis": -1, "temperature": 0.5},
                {"data": (10000, 100), "axis": -1, "temperature": 0.5}],
        warmup=warmup, runs=runs)

    # Prepare combined results
    mx_activation_op_results = merge_map_list(relu_benchmark_res + hard_sigmoid_benchmark_res + softmax_benchmark_res)
    return mx_activation_op_results
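# Quick reference for the activation variants exercised above (a tiny sketch;
# assumes `from mxnet import nd` is in scope):
x = nd.array([[-2.0, 3.0]])
print(nd.LeakyReLU(x, act_type='leaky', slope=0.1))  # negative side scaled by slope: [-0.2, 3.0]
print(nd.LeakyReLU(x, act_type='elu', slope=0.1))    # slope * (exp(x) - 1) on the negative side
print(nd.hard_sigmoid(x, alpha=0.25, beta=0.5))      # clip(0.25 * x + 0.5, 0, 1)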
def run_join_split_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the
    join & split operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # backward not supported for all 3 ops - concat, stack, split
    # concat
    concat_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "concat")],
        run_backward=False, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"args0": nd.random_normal(shape=(100, 100)),
                 "args1": nd.random_normal(shape=(100, 100)),
                 "args2": nd.random_normal(shape=(100, 100))}],
        warmup=warmup, runs=runs)

    # split
    split_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "split")],
        run_backward=False, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"data": (1024, 1024), "num_outputs": 2},
                {"data": (10000, 1), "num_outputs": 1},
                {"data": (10000, 100), "num_outputs": 10}],
        warmup=warmup, runs=runs)

    # stack
    stack_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "stack")],
        run_backward=False, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"args0": nd.random_normal(shape=(100, 100)),
                 "args1": nd.random_normal(shape=(100, 100)),
                 "args2": nd.random_normal(shape=(100, 100))}],
        warmup=warmup, runs=runs)

    mx_join_split_op_results = merge_map_list(concat_benchmark_res + split_benchmark_res + stack_benchmark_res)
    return mx_join_split_op_results
def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks with the given context, precision (dtype), and input data size
    (int64_tensor) for all the pooling operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    pool_types = ['avg', 'max', 'sum']
    global_pool_types = [0, 1]

    standard_data_list_pool1d = [(32, 3, 256), (32, 3, 64)]
    int64_tensor_data_list_pool1d = [(1, 1, 2**32)]
    standard_data_list_pool2d = [(32, 3, 256, 256), (32, 3, 64, 64)]
    int64_tensor_data_list_pool2d = [(2**28, 1, 4, 4)]
    standard_data_list_roipool = [(32, 3, 256, 256), (32, 3, 64, 64)]
    int64_tensor_data_list_roipool = [(32, 3, 2**13, 2**13)]

    if int64_tensor == 'on':
        data_list_pool1d = int64_tensor_data_list_pool1d
        data_list_pool2d = int64_tensor_data_list_pool2d
        data_list_roipool = int64_tensor_data_list_roipool
    else:
        data_list_pool1d = standard_data_list_pool1d
        data_list_pool2d = standard_data_list_pool2d
        data_list_roipool = standard_data_list_roipool

    # Run 1D and 2D Pooling performance runs
    pool1d_benchmark_res = []
    pool2d_benchmark_res = []
    for pool_type in pool_types:
        for global_pool in global_pool_types:
            for pool1d_data in data_list_pool1d:
                pool1d_benchmark_res += run_performance_test(
                    [getattr(MX_OP_MODULE, "Pooling")],
                    run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
                    inputs=[{"data": pool1d_data, "kernel": 3, "pool_type": pool_type,
                             "global_pool": global_pool, "stride": 1, "pad": 1}],
                    warmup=warmup, runs=runs)
            for pool2d_data in data_list_pool2d:
                pool2d_benchmark_res += run_performance_test(
                    [getattr(MX_OP_MODULE, "Pooling")],
                    run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
                    inputs=[{"data": pool2d_data, "kernel": (3, 3), "pool_type": pool_type,
                             "global_pool": global_pool, "stride": (1, 1), "pad": (0, 0)}],
                    warmup=warmup, runs=runs)

    # Run ROI Pooling performance runs
    roipool_benchmark_res = []
    for roipool_data in data_list_roipool:
        roipool_benchmark_res += run_performance_test(
            [getattr(MX_OP_MODULE, "ROIPooling")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": roipool_data, "rois": (32, 5), "pooled_size": (2, 2),
                     "spatial_scale": .5}],
            warmup=warmup, runs=runs)

    # Prepare combined results
    mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res + roipool_benchmark_res)
    return mx_pooling_op_results
def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', int64_tensor='off', dtype='float32', warmup=25, runs=100):
    """Runs benchmarks with the given context, precision (dtype), and input data size
    (int64_tensor) for all the transpose convolution operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    standard_data_list_conv1d_transpose = [(32, 3, 256), (32, 3, 64)]
    int64_tensor_data_list_conv1d_transpose = [(2**30, 1, 4)]
    standard_weight_conv1d_transpose = (3, 1, 3)
    int64_tensor_weight_conv1d_transpose = (1, 1, 1)
    standard_kernel_conv1d_transpose = (3,)
    int64_tensor_kernel_conv1d_transpose = (1,)
    standard_data_list_conv2d_transpose = [(32, 3, 256, 256), (32, 3, 64, 64)]
    int64_tensor_data_list_conv2d_transpose = [(2**28, 1, 4, 4)]
    standard_weight_conv2d_transpose = (3, 1, 3, 3)
    int64_tensor_weight_conv2d_transpose = (1, 1, 1, 1)
    standard_kernel_conv2d_transpose = (3, 3)
    int64_tensor_kernel_conv2d_transpose = (1, 1)

    if int64_tensor == 'on':
        data_list_conv1d_transpose = int64_tensor_data_list_conv1d_transpose
        weight_conv1d_transpose = int64_tensor_weight_conv1d_transpose
        kernel_conv1d_transpose = int64_tensor_kernel_conv1d_transpose
        data_list_conv2d_transpose = int64_tensor_data_list_conv2d_transpose
        weight_conv2d_transpose = int64_tensor_weight_conv2d_transpose
        kernel_conv2d_transpose = int64_tensor_kernel_conv2d_transpose
    else:
        data_list_conv1d_transpose = standard_data_list_conv1d_transpose
        weight_conv1d_transpose = standard_weight_conv1d_transpose
        kernel_conv1d_transpose = standard_kernel_conv1d_transpose
        data_list_conv2d_transpose = standard_data_list_conv2d_transpose
        weight_conv2d_transpose = standard_weight_conv2d_transpose
        kernel_conv2d_transpose = standard_kernel_conv2d_transpose

    # Conv1DTranspose Benchmarks
    conv1d_transpose_benchmark_res = []
    for conv_data in data_list_conv1d_transpose:
        conv1d_transpose_benchmark_res += run_performance_test(
            [getattr(MX_OP_MODULE, "Deconvolution")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": conv_data, "weight": weight_conv1d_transpose, "bias": (1,),
                     "kernel": kernel_conv1d_transpose, "stride": (1,), "dilate": (1,),
                     "pad": (0,), "num_filter": 1, "no_bias": False, "layout": 'NCW'}],
            warmup=warmup, runs=runs)

    # Conv2DTranspose Benchmarks
    conv2d_transpose_benchmark_res = []
    for conv_data in data_list_conv2d_transpose:
        conv2d_transpose_benchmark_res += run_performance_test(
            [getattr(MX_OP_MODULE, "Deconvolution")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": conv_data, "weight": weight_conv2d_transpose, "bias": (1,),
                     "kernel": kernel_conv2d_transpose, "stride": (1, 1), "pad": (0, 0),
                     "num_filter": 1, "no_bias": False, "layout": 'NCHW'}],
            warmup=warmup, runs=runs)

    # Prepare combined results
    mx_transpose_conv_op_results = merge_map_list(conv1d_transpose_benchmark_res + conv2d_transpose_benchmark_res)
    return mx_transpose_conv_op_results
def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    # FullyConnected operator benchmarks
    fc_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "FullyConnected")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"data": (32, 3, 256, 256), "num_hidden": 64,
                 "weight": (64, 3 * 256 * 256), "bias": (64,), "flatten": True},
                {"data": (32, 3, 256, 256), "num_hidden": 64,
                 "weight": (64, 256), "bias": (64,), "flatten": False}],
        warmup=warmup, runs=runs)

    # Dropout benchmarks
    dropout_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "Dropout")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"data": (32, 3, 256, 256), "p": 0.5, "mode": "always"},
                {"data": (10000, 10), "p": 0.5, "mode": "always"}],
        warmup=warmup, runs=runs)

    # BatchNorm benchmarks
    batchnorm_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "BatchNorm")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"data": (32, 3, 256, 256), "gamma": (3,), "beta": (3,),
                 "moving_mean": (3,), "moving_var": (3,)},
                {"data": (32, 3, 10000, 10), "gamma": (3,), "beta": (3,),
                 "moving_mean": (3,), "moving_var": (3,)}],
        warmup=warmup, runs=runs)

    # Prepare combined results
    mx_basic_nn_results = merge_map_list(fc_benchmark_res + dropout_benchmark_res + batchnorm_benchmark_res)
    return mx_basic_nn_results
def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the
    GEMM operators (dot, batch_dot, khatri_rao) in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Benchmark tests for dot operator
    dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "dot")], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"lhs": (1024, 1024), "rhs": (1024, 1024)},
                {"lhs": (1000, 10), "rhs": (1000, 10), "transpose_b": True},
                {"lhs": (1000, 1), "rhs": (100, 1000), "transpose_a": True, "transpose_b": True}],
        warmup=warmup, runs=runs, profiler=profiler)

    # Benchmark tests for batch_dot operator
    batch_dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=[{"lhs": (32, 1024, 1024), "rhs": (32, 1024, 1024)},
                {"lhs": (32, 1000, 10), "rhs": (32, 1000, 10), "transpose_b": True},
                {"lhs": (32, 1000, 1), "rhs": (32, 100, 1000), "transpose_a": True, "transpose_b": True}],
        warmup=warmup, runs=runs, profiler=profiler)

    # Operator khatri_rao is not yet implemented for GPU
    khatri_rao_benchmark_res = []
    if ctx != mx.gpu():
        # Benchmark tests for khatri_rao operator
        khatri_rao_benchmark_res = run_performance_test(
            [getattr(MX_OP_MODULE, "khatri_rao")], run_backward=False, dtype=dtype, ctx=ctx,
            inputs=[{"args": [(32, 32), (32, 32)]},
                    {"args": [(64, 64), (64, 64)]}],
            warmup=warmup, runs=runs, profiler=profiler)

    # Prepare combined results for GEMM operators
    mx_gemm_op_results = merge_map_list(dot_benchmark_res + batch_dot_benchmark_res + khatri_rao_benchmark_res)
    return mx_gemm_op_results
def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks with the given context, precision (dtype), and input data size
    (int64_tensor) for all the GEMM operators (dot, batch_dot, khatri_rao) in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    standard_inputs_dot = [{"lhs": (1024, 1024), "rhs": (1024, 1024)},
                           {"lhs": (1000, 10), "rhs": (1000, 10), "transpose_b": True},
                           {"lhs": (1000, 1), "rhs": (100, 1000), "transpose_a": True, "transpose_b": True}]
    int64_tensor_inputs_dot = [{"lhs": (2**16, 2**16), "rhs": (2**16, 2**16)},
                               {"lhs": (4, 2**30), "rhs": (4, 2**30), "transpose_b": True},
                               {"lhs": (2**28, 16), "rhs": (16, 2**28), "transpose_a": True, "transpose_b": True}]
    standard_inputs_batch_dot = [{"lhs": (32, 1024, 1024), "rhs": (32, 1024, 1024)},
                                 {"lhs": (32, 1000, 10), "rhs": (32, 1000, 10), "transpose_b": True},
                                 {"lhs": (32, 1000, 1), "rhs": (32, 100, 1000), "transpose_a": True, "transpose_b": True}]
    int64_tensor_inputs_batch_dot = [{"lhs": (1, 2**16, 2**16), "rhs": (1, 2**16, 2**16)},
                                     {"lhs": (1, 4, 2**30), "rhs": (1, 4, 2**30), "transpose_b": True},
                                     {"lhs": (1, 2**28, 16), "rhs": (1, 16, 2**28), "transpose_a": True, "transpose_b": True}]
    standard_inputs_khatri_rao = [{"args": [(32, 32), (32, 32)]},
                                  {"args": [(64, 64), (64, 64)]}]
    int64_tensor_inputs_khatri_rao = [{"args": [(2**32, 1), (2**32, 1)]}]

    if int64_tensor == 'on':
        inputs_dot = int64_tensor_inputs_dot
        inputs_batch_dot = int64_tensor_inputs_batch_dot
        inputs_khatri_rao = int64_tensor_inputs_khatri_rao
    else:
        inputs_dot = standard_inputs_dot
        inputs_batch_dot = standard_inputs_batch_dot
        inputs_khatri_rao = standard_inputs_khatri_rao

    # Benchmark tests for dot and batch_dot operators
    dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "dot")], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=inputs_dot, warmup=warmup, runs=runs, profiler=profiler)

    batch_dot_benchmark_res = run_performance_test(
        [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True, dtype=dtype, ctx=ctx,
        inputs=inputs_batch_dot, warmup=warmup, runs=runs, profiler=profiler)

    # Operator khatri_rao is not yet implemented for GPU
    khatri_rao_benchmark_res = []
    if ctx != mx.gpu():
        # Benchmark tests for khatri_rao operator
        khatri_rao_benchmark_res = run_performance_test(
            [getattr(MX_OP_MODULE, "khatri_rao")], run_backward=False, dtype=dtype, ctx=ctx,
            inputs=inputs_khatri_rao, warmup=warmup, runs=runs, profiler=profiler)

    # Prepare combined results for GEMM operators
    mx_gemm_op_results = merge_map_list(dot_benchmark_res + batch_dot_benchmark_res + khatri_rao_benchmark_res)
    return mx_gemm_op_results
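# Scale check for the int64-tensor inputs above (a worked example, not part of
# the benchmark): a (2**16, 2**16) operand already holds 2**32 elements, the
# smallest element count that overflows a 32-bit index and therefore needs int64.
elements = 2**16 * 2**16             # 4,294,967,296 elements = 2**32
bytes_fp32 = elements * 4            # float32 takes 4 bytes per element
print(elements, bytes_fp32 / 2**30)  # -> 4294967296 16.0 (GiB per operand)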
def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Run all the MXNet operators (NDArray) benchmarks.

    Returns
    -------
    Dictionary of benchmark results.
    """
    mxnet_operator_benchmark_results = []

    # *************************MXNET TENSOR OPERATOR BENCHMARKS*****************************

    # Run all Unary operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                          int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks
    # with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                     int64_tensor=int64_tensor, warmup=warmup, runs=runs))
    mxnet_operator_benchmark_results.append(
        run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                        int64_tensor=int64_tensor, warmup=warmup, runs=runs))
    mxnet_operator_benchmark_results.append(
        run_mx_binary_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all GEMM operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_gemm_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                      int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Random sampling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                    int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Reduction operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                              int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Sorting and Searching operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                   int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Indexing routines benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                         int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Array Rearrange operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                           int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Array Shape Manipulation operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_shape_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                       int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Array Expansion operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_expanding_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                           int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Array Rounding operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_rounding_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                          int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Array Join & Split operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_join_split_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                            int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # ************************ MXNET NN OPERATOR BENCHMARKS ****************************

    # Run all basic NN operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                          int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Activation operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                            int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Pooling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                         int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                             int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Optimizer operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                           int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Transpose Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                       int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all NN loss operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                      int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Run all Miscellaneous operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                         int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # Linear Algebra operators do not work with int64 tensor data.
    # Issue tracked here: https://github.com/apache/incubator-mxnet/issues/17716
    if int64_tensor == 'off':
        # Run all Linear Algebra operations benchmarks with default input values
        mxnet_operator_benchmark_results.append(
            run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                            int64_tensor=int64_tensor, warmup=warmup, runs=runs))

    # ****************************** PREPARE FINAL RESULTS ********************************
    final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results)
    return final_benchmark_result_map
def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the
    miscellaneous operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Individual tests for ops with positional args
    array_ops_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "reset_arrays"),
         getattr(MX_OP_MODULE, "multi_all_finite"),
         getattr(MX_OP_MODULE, "multi_sum_sq")],
        run_backward=False, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"args": [(1024, 1024)], "num_arrays": 1},
                {"args": [(10000, 1)], "num_arrays": 1},
                {"args": [(10000, 10)], "num_arrays": 1}],
        warmup=warmup, runs=runs)

    add_n_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "add_n")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"args": [(1024, 1024)]},
                {"args": [(10000, 1)]},
                {"args": [(10000, 10)]}],
        warmup=warmup, runs=runs)

    # There are currently issues with UpSampling with bilinear interpolation.
    # Track the issue here: https://github.com/apache/incubator-mxnet/issues/9138
    upsampling_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "UpSampling")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"args": (32, 3, 256, 256), "scale": 2, "sample_type": "nearest"},
                {"args": (32, 3, 10000, 1), "scale": 4, "sample_type": "nearest"}],
        warmup=warmup, runs=runs)

    # Create and register CustomAddOne operator for use in Custom op testing
    c = CustomAddOneProp()
    c.create_operator(ctx, [(1024, 1024)], [dtype])
    custom_benchmark = run_performance_test(
        [getattr(MX_OP_MODULE, "Custom")],
        run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
        inputs=[{"args": [(1024, 1024)], "op_type": "CustomAddOne"},
                {"args": [(10000, 1)], "op_type": "CustomAddOne"},
                {"args": [(10000, 10)], "op_type": "CustomAddOne"}],
        warmup=warmup, runs=runs)

    # Fetch remaining Miscellaneous Operators
    mx_misc_ops = get_remaining_miscellaneous_operators()
    # Run benchmarks
    mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, warmup, runs)
    return merge_map_list(array_ops_benchmark + add_n_benchmark + upsampling_benchmark +
                          custom_benchmark + [mx_misc_op_results])
def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
    """Runs benchmarks with the given context, precision (dtype), and data size
    (int64_tensor) for all the basic neural network operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    int64_tensor: str, default 'off'
        Input tensor size to use for tests (if on, dimensions >= 2**32)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    standard_data_list = [(1024, 4, 4)]
    int64_tensor_data_list = [(2**28, 4, 4)]

    if int64_tensor == 'on':
        data_list = int64_tensor_data_list
    else:
        data_list = standard_data_list

    for data in data_list:
        rnn_relu_benchmark = run_performance_test(
            [getattr(MX_OP_MODULE, "RNN")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": data, "parameters": (7,), "state": (1, 4, 1),
                     "mode": "rnn_relu", "state_size": 1, "num_layers": 1}],
            warmup=warmup, runs=runs)
        rnn_tanh_benchmark = run_performance_test(
            [getattr(MX_OP_MODULE, "RNN")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": data, "parameters": (7,), "state": (1, 4, 1),
                     "mode": "rnn_tanh", "state_size": 1, "num_layers": 1}],
            warmup=warmup, runs=runs)
        rnn_lstm_benchmark = run_performance_test(
            [getattr(MX_OP_MODULE, "RNN")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": data, "parameters": (28,), "state": (1, 4, 1),
                     "state_cell": (1, 4, 1), "mode": "lstm", "state_size": 1,
                     "num_layers": 1}],
            warmup=warmup, runs=runs)
        rnn_gru_benchmark = run_performance_test(
            [getattr(MX_OP_MODULE, "RNN")],
            run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler,
            inputs=[{"data": data, "parameters": (21,), "state": (1, 4, 1),
                     "mode": "gru", "state_size": 1, "num_layers": 1}],
            warmup=warmup, runs=runs)

    # Fetch all NN Basic Operators
    mx_nn_basic_ops = get_all_nn_basic_operators()

    # Run benchmarks
    mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
    return merge_map_list(rnn_relu_benchmark + rnn_tanh_benchmark + rnn_lstm_benchmark +
                          rnn_gru_benchmark + [mx_nn_basic_op_results])
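# Where the "parameters" lengths above come from (a worked check, not part of
# the benchmark): with input size 4 and state_size 1, one vanilla RNN layer
# needs W_ih (1x4), W_hh (1x1), b_ih (1), and b_hh (1), i.e. 7 weights; LSTM
# repeats that block for 4 gates and GRU for 3 gates.
input_size, state_size = 4, 1
base = input_size * state_size + state_size * state_size + 2 * state_size
print(base)      # 7  -> "parameters": (7,) for rnn_relu / rnn_tanh
print(4 * base)  # 28 -> "parameters": (28,) for lstm
print(3 * base)  # 21 -> "parameters": (21,) for gru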
def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Runs benchmarks with the given context and precision (dtype) for all the
    neural network optimizer update operators in MXNet.

    Parameters
    ----------
    ctx: mx.ctx
        Context to run benchmarks
    dtype: str, default 'float32'
        Precision to use for benchmarks
    profiler: str, default 'native'
        Type of Profiler to use (native/python)
    warmup: int, default 25
        Number of times to run for warmup
    runs: int, default 100
        Number of runs to capture benchmark results

    Returns
    -------
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Run independent tests for ops that need specific input data
    multi_mp_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "args3": nd.random_normal(shape=(5, 5)),
                 "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    multi_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    multi_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    multi_mp_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_mp_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    preloaded_multi_mp_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "args3": nd.random_normal(shape=(1)),
                 "args4": nd.random_normal(shape=(1)), "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    preloaded_multi_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "args3": nd.random_normal(shape=(1)),
                 "args4": nd.random_normal(shape=(1)), "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    preloaded_multi_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)),
                 "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)
    preloaded_multi_mp_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "args3": nd.random_normal(shape=(5, 5)),
                 "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)),
                 "out": nd.random_normal(shape=(5, 5))}],
        run_backward=False)

    # Fetch remaining optimizer operators
    mx_optimizer_ops = get_all_optimizer_operators()

    # Run benchmarks
    mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs)
    return merge_map_list(multi_mp_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res +
                          preloaded_multi_mp_sgd_res + preloaded_multi_sgd_mom_res + preloaded_multi_sgd_res +
                          preloaded_multi_mp_sgd_mom_res + [mx_optimizer_op_results])
def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
    """Run all the MXNet operators (NDArray) benchmarks.

    Returns
    -------
    Dictionary of benchmark results.
    """
    mxnet_operator_benchmark_results = []

    # *************************MXNET TENSOR OPERATOR BENCHMARKS*****************************

    # Run all Unary operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                          warmup=warmup, runs=runs))

    # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks
    # with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                     warmup=warmup, runs=runs))
    mxnet_operator_benchmark_results.append(
        run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                        warmup=warmup, runs=runs))
    mxnet_operator_benchmark_results.append(
        run_mx_binary_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                warmup=warmup, runs=runs))

    # Run all GEMM operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_gemm_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                      warmup=warmup, runs=runs))

    # Run all Random sampling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                    warmup=warmup, runs=runs))

    # Run all Reduction operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                              warmup=warmup, runs=runs))

    # Run all Sorting and Searching operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                   warmup=warmup, runs=runs))

    # Run all Array Rearrange operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                           warmup=warmup, runs=runs))

    # Run all Indexing routines benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                         warmup=warmup, runs=runs))

    # ************************ MXNET NN OPERATOR BENCHMARKS ****************************

    # Run all basic NN operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                          warmup=warmup, runs=runs))

    # Run all Activation operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                            warmup=warmup, runs=runs))

    # Run all Pooling operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                         warmup=warmup, runs=runs))

    # Run all Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                             warmup=warmup, runs=runs))

    # Run all Optimizer operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                           warmup=warmup, runs=runs))

    # Run all Transpose Convolution operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                                       warmup=warmup, runs=runs))

    # Run all NN loss operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                      warmup=warmup, runs=runs))

    # Run all Miscellaneous operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                         warmup=warmup, runs=runs))

    # Run all Linear Algebra operations benchmarks with default input values
    mxnet_operator_benchmark_results.append(
        run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler,
                                        warmup=warmup, runs=runs))

    # ****************************** PREPARE FINAL RESULTS ********************************
    final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results)
    return final_benchmark_result_map
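# End-to-end usage sketch (illustration only; warmup/runs are reduced here to
# keep a smoke run fast, and the large result map is summarized, not printed):
all_results = run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                profiler='native', warmup=1, runs=5)
print("benchmarked", len(all_results), "operators")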