def run_cast_storage_synthetic():
    """Benchmark ``mx.nd.cast_storage`` from dense to sparse storage.

    Two benchmarks are run, dense -> csr and dense -> row_sparse.  For each
    one a table is printed with one row per (shape, context, density)
    combination.  The ``time(ms)`` column is the value returned by
    ``measure_cost`` multiplied by 1000.

    The OpenMP thread count is read from the module-level ``args`` object
    (``args.num_omp_threads``) before any benchmark runs.
    """

    def dense_to_sparse(m, n, density, ctx, repeat, stype):
        """Time the cast of one dense (m, n) array to ``stype``; print a row."""
        set_default_context(ctx)
        # The array is generated with storage type ``stype`` at the requested
        # density and converted to default storage, so the benchmarked cast
        # starts from a dense array.
        dns_data = rand_ndarray((m, n), stype, density).tostype('default')
        dns_data.wait_to_read()

        # do one warm up run, verify correctness
        assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(),
                    dns_data.asnumpy())

        # start benchmarking
        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype)
        print('{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(
            density * 100, str(ctx), m, n, cost * 1000))

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))

    # params
    # m           number of rows
    # n           number of columns
    # density     density of the matrix
    # num_repeat  number of benchmark runs to average over
    # contexts    mx.cpu(), mx.gpu()
    #             note: benchmark different contexts separately;
    #             to benchmark cpu, compile without CUDA
    # benchmarks  dns_to_csr, dns_to_rsp
    m = [512, 512]
    n = [50000, 100000]
    density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
    num_repeat = 10
    contexts = [mx.gpu()]
    benchmarks = ["dns_to_csr", "dns_to_rsp"]

    # run benchmark
    for b in benchmarks:
        print("==================================================")
        # Compare by value: ``is`` tests object identity and is not a valid
        # way to compare strings.
        if b == "dns_to_csr":
            stype = 'csr'
            print(" cast_storage benchmark: dense to csr, size m x n ")
        elif b == "dns_to_rsp":
            stype = 'row_sparse'
            print(" cast_storage benchmark: dense to rsp, size m x n ")
        else:
            print("invalid benchmark: %s" % b)
            continue
        print("==================================================")
        print('{:>10} {:>10} {:>8} {:>8} {:>10}'.format(
            'density(%)', 'context', 'm', 'n', 'time(ms)'))
        # m and n are parallel lists: the i-th entries form one matrix shape.
        for rows, cols in zip(m, n):
            for ctx in contexts:
                for den in density:
                    dense_to_sparse(rows, cols, den, ctx, num_repeat, stype)
            print("")
        print("")
def main():
    """Parse the command line and run a single ``bench_dot`` measurement.

    Dimensions and densities from the parsed arguments are converted to
    ``int`` / ``float``.  When no rhs density is given, the lhs density is
    reused.  ``mx.nd.sparse.dot`` is benchmarked when the lhs storage type is
    "csr", ``mx.nd.dot`` otherwise.
    """
    opts = parse_args()

    num_lhs_rows = int(opts.lhs_row_dim)
    num_lhs_cols = int(opts.lhs_col_dim)
    num_rhs_cols = int(opts.rhs_col_dim)
    lhs_density = float(opts.density)
    lhs_storage = opts.lhs_stype
    rhs_storage = opts.rhs_stype

    # Fall back to the lhs density when the rhs density is unset/falsy.
    rhs_density = float(opts.rhs_density) if opts.rhs_density else lhs_density

    if lhs_storage == "csr":
        dot_impl = mx.nd.sparse.dot
    else:
        dot_impl = mx.nd.dot

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(opts.num_omp_threads)))

    # NOTE(review): the literal False is the seventh positional argument of
    # bench_dot; its meaning is defined by bench_dot, which is not visible
    # from this function.
    bench_dot(
        num_lhs_rows,
        num_lhs_cols,
        num_rhs_cols,
        lhs_density,
        rhs_density,
        dot_impl,
        False,
        lhs_storage,
        rhs_storage,
        opts.only_storage,
    )
def test_dot_synthetic(data_dict):
    """Benchmark the sparse MXNet dot and the SciPy dot on synthetic matrices.

    `t_sparse` is the runtime of the invoked sparse dot operator in ms, while
    `t_dense` is the runtime of dot(dns, dns), with the same matrices except
    that they are in default storage type.

    Parameters
    ----------
    data_dict : dict
        Benchmark configuration.  The keys read here are 'batch_size', 'm',
        'feature_dim' and 'density' (lists of values to sweep),
        'default_index' (a dict with the keys 'batch_size', 'feature_dim',
        'output_dim' and 'density', each an index into a sweep list) and
        'num_repeat'.

    The OpenMP thread count and the device (GPU or CPU) are taken from the
    module-level ``ARGS`` object.
    """

    # Benchmark MXNet and Scipys dot operator
    def bench_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype,
                  lhs_den, rhs_den, trans_lhs, ctx, num_repeat=10,
                  fw="mxnet", distribution="uniform"):
        """Time one sparse dot against its dense counterpart; print a row."""
        set_default_context(ctx)
        if fw not in ("mxnet", "scipy"):
            raise ValueError("fw has to be 'mxnet' or 'scipy', got %r" % (fw,))
        use_mxnet = fw == "mxnet"

        # Set funcs
        dot_func_sparse = mx.nd.sparse.dot if use_mxnet else sp.spmatrix.dot
        dot_func_dense = mx.nd.dot if use_mxnet else np.dot

        # Create matrix instances
        lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den,
                              distribution=distribution)
        # only uniform distribution supported for rhs (unless it is csr)
        rhs_distribution = distribution if rhs_stype == 'csr' else "uniform"
        rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den,
                              distribution=rhs_distribution)

        if use_mxnet:
            lhs_dns = (lhs_nd if lhs_stype == 'default'
                       else lhs_nd.tostype('default'))
            rhs_dns = (rhs_nd if rhs_stype == 'default'
                       else rhs_nd.tostype('default'))
            # One warm up run, verify correctness.  The warm-up is given the
            # dense rhs, while the timed sparse run below receives rhs_nd.
            out = dot_func_sparse(lhs_nd, rhs_dns, trans_lhs)
            out_expected = dot_func_dense(lhs_dns, rhs_dns, trans_lhs)
            assert_almost_equal(out.asnumpy(), out_expected.asnumpy(),
                                rtol=1e-1, atol=1e-1)
            # NOTE(review): the 2nd and 3rd positional arguments of
            # measure_cost are defined outside this block; here they are
            # passed as False for the MXNet path.
            sparse_cost = measure_cost(num_repeat, False, False,
                                       dot_func_sparse, lhs_nd, rhs_nd,
                                       trans_lhs)
            dense_cost = measure_cost(num_repeat, False, False,
                                      dot_func_dense, lhs_dns, rhs_dns,
                                      trans_lhs)
        else:
            lhs_dns = lhs_nd.asnumpy()
            rhs_dns = rhs_nd.asnumpy()
            lhs_nd = sp.csr_matrix(lhs_nd.asnumpy())
            rhs_nd = rhs_nd.asnumpy()
            # One warm up run; the result is discarded (no check is made on
            # this path).
            lhs_nd_copy = (sp.spmatrix.transpose(lhs_nd) if trans_lhs
                           else lhs_nd)
            dot_func_sparse(lhs_nd_copy, rhs_dns)
            sparse_cost = measure_cost(num_repeat, trans_lhs, False,
                                       dot_func_sparse, lhs_nd, rhs_nd)
            dense_cost = measure_cost(num_repeat, trans_lhs, True,
                                      dot_func_dense, lhs_dns, rhs_dns)

        speedup = dense_cost / sparse_cost

        # Print results
        m = lhs_shape[0]
        k = lhs_shape[1]
        n = rhs_shape[1]
        result_pattern = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}'
        print(result_pattern.format(lhs_den * 100, rhs_den * 100, str(ctx),
                                    m, k, n, sparse_cost * 1000,
                                    dense_cost * 1000, speedup))

    def print_benchmark_info(lhs, rhs, lhs_trans, fw):
        """Print the banner and column headline for one benchmark table."""
        trans_str = "^T" if lhs_trans else ""
        print("========================================================")
        # The formatting has to happen inside the call: print() returns None,
        # so applying % to its result raises TypeError.
        print(" %s sparse dot benchmark: dot(%s, %s) = %s " % (fw, lhs, rhs, rhs))
        print(" (matrix multiplication: (m x k)%s * (k x n) = m x n) " % trans_str)
        print("========================================================")
        headline_pattern = '{:>15} {:>15} {:>10} {:>8} {:>8} {:>8} {:>13} {:>13} {:>8}'
        print(headline_pattern.format('lhs_density(%)', 'rhs_density(%)',
                                      'context', 'm', 'k', 'n',
                                      't_sparse(ms)', 't_dense(ms)',
                                      'speedup'))

    def run_benchmark(ctx=None, lhs="csr", lhs_trans=False, rhs="dns",
                      fw="mxnet", rhs_density=1, distribution="uniform"):
        """Print one benchmark table, sweeping each dimension in turn.

        ``lhs`` is only used for the printed banner; the lhs storage type is
        derived from ``rhs`` ("default" when rhs is "csr", "csr" otherwise).
        """
        if rhs_density > 1 or rhs_density < 0:
            raise ValueError("rhs_density has to be between 0 and 1")

        print_benchmark_info(lhs, rhs, lhs_trans, fw)

        default_index = data_dict['default_index']
        if rhs == "csr":
            # Only dot(default, csr) supported
            lhs_stype = "default"
            rhs_stype = "csr"
            # Arrange dimensions according to use case.
            # For below csr will have num_rows << num_cols
            feature_dim_list = data_dict['batch_size']
            batch_size_list = data_dict['m']
            output_dim_list = data_dict['feature_dim']
            density_list = data_dict['density']
            default_output_index = default_index['feature_dim']
            default_density_index = default_index['density']
            default_feature_index = default_index['batch_size']
            # NOTE(review): batch_size_list comes from data_dict['m'] but is
            # indexed with the 'output_dim' default index -- confirm this
            # pairing is intended.
            default_batch_size_index = default_index['output_dim']
        else:
            lhs_stype = "csr"
            rhs_stype = "row_sparse" if rhs == "rsp" else "default"
            feature_dim_list = data_dict['feature_dim']
            output_dim_list = data_dict['m']
            batch_size_list = data_dict['batch_size']
            density_list = data_dict['density']
            default_output_index = default_index['output_dim']
            default_batch_size_index = default_index['batch_size']
            default_feature_index = default_index['feature_dim']
            default_density_index = default_index['density']
        num_repeat = data_dict['num_repeat']

        default_batch_size = batch_size_list[default_batch_size_index]
        default_feature_dim = feature_dim_list[default_feature_index]
        default_output_dim = output_dim_list[default_output_index]
        default_density = density_list[default_density_index]

        def run_case(batch_size, feature_dim, output_dim, lhs_den, rhs_den):
            """Run bench_dot for lhs (batch_size, feature_dim) and rhs (*, output_dim)."""
            # The rhs row count has to match the contracted lhs dimension:
            # the lhs rows when the lhs is transposed, its columns otherwise.
            rhs_row_dim = batch_size if lhs_trans else feature_dim
            bench_dot((batch_size, feature_dim), (rhs_row_dim, output_dim),
                      lhs_stype, rhs_stype, lhs_den, rhs_den, lhs_trans, ctx,
                      num_repeat=num_repeat, fw=fw, distribution=distribution)

        # Sweep one parameter at a time, holding the others at their defaults.
        for output_dim in output_dim_list:
            run_case(default_batch_size, default_feature_dim, output_dim,
                     default_density, rhs_density)

        for feature_dim in feature_dim_list:
            run_case(default_batch_size, feature_dim, default_output_dim,
                     default_density, rhs_density)

        for batch_size in batch_size_list:
            run_case(batch_size, default_feature_dim, default_output_dim,
                     default_density, rhs_density)

        # In the density sweep the swept value is used for both operands;
        # ``rhs_density`` is not used here.
        for density in density_list:
            run_case(default_batch_size, default_feature_dim,
                     default_output_dim, density, density)

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(ARGS.num_omp_threads)))
    context = mx.gpu() if ARGS.gpu else mx.cpu()
    # TODO(anirudh): make the data dicts to config which can be passed at runtime
    distributions = ["uniform", "powerlaw"]
    for distribution in distributions:
        run_benchmark(context, lhs="csr", rhs="default", lhs_trans=False,
                      fw="mxnet", rhs_density=1, distribution=distribution)
        run_benchmark(context, lhs="csr", rhs="default", lhs_trans=True,
                      fw="mxnet", rhs_density=1, distribution=distribution)
        run_benchmark(context, lhs="csr", rhs="rsp", lhs_trans=False,
                      fw="mxnet", rhs_density=0.05, distribution=distribution)
        run_benchmark(context, lhs="default", rhs="csr", lhs_trans=False,
                      fw="mxnet", rhs_density=0.001,
                      distribution=distribution)
        # The SciPy baseline is only run when benchmarking on CPU.
        if not ARGS.gpu:
            run_benchmark(context, lhs="csr", rhs="default", lhs_trans=False,
                          fw="scipy", rhs_density=1,
                          distribution=distribution)
            run_benchmark(context, lhs="csr", rhs="default", lhs_trans=True,
                          fw="scipy", rhs_density=1,
                          distribution=distribution)
def test_dot_synthetic():
    """Benchmark mx.nd.dot(sparse_ndarray, dense_ndarray) with given density.

    `t_sparse` is the time cost of dot(csr, dns), while `t_dense` is the time
    cost of dot(dns, dns), with the same matrix except that it is in default
    storage type.  The same pair of measurements is taken with NumPy / SciPy
    as a baseline.  Both a forward pass, dot(csr, dns), and a backward pass,
    dot(csr.T, dns), are benchmarked.

    The OpenMP thread count is read from the module-level ``args`` object
    (``args.num_omp_threads``).
    """
    row_fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f"
    header = ('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse'
              '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse')

    def measure_cost_forward_baseline(repeat, dot, lhs, rhs):
        """Return the mean wall-clock time of ``dot(lhs, rhs)`` over ``repeat`` runs."""
        start = time.time()
        for _ in range(repeat):
            dot(lhs, rhs)
        return (time.time() - start) / repeat

    def measure_cost_backward_baseline(repeat, dot, transpose, lhs, rhs):
        """Return the mean wall-clock time of ``dot(transpose(lhs), rhs)``."""
        start = time.time()
        for _ in range(repeat):
            dot(transpose(lhs), rhs)
        return (time.time() - start) / repeat

    def bench_dot(m, k, n, density, ctx, repeat, backward):
        """Benchmark one (m, k) csr lhs against a dense rhs; print a row.

        With ``backward`` set the lhs is transposed, so the dense rhs has
        shape (m, n) instead of (k, n).
        """
        set_default_device(ctx)
        rhs_rows = m if backward else k
        dns = mx.nd.random.uniform(shape=(rhs_rows, n)).copyto(ctx)
        csr_data = rand_ndarray((m, k), 'csr', density)
        dns_data = csr_data.tostype('default')
        rhs_dns_np = dns.asnumpy()
        lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy())  # csr in scipy
        # tostype() is an MXNet NDArray method, not a SciPy one; toarray()
        # yields the dense ndarray used for the NumPy baseline.
        lhs_dns_np = lhs_csr_sp.toarray()

        # MXNet: dense lhs first, then csr lhs.
        dot_kwargs = {'transpose_a': True} if backward else {}
        costs = []
        for lhs in (dns_data, csr_data):
            dns.wait_to_read()
            lhs.wait_to_read()
            costs.append(
                measure_cost(repeat, mx.nd.dot, lhs, dns, **dot_kwargs))
        ratio = costs[0] / costs[1]

        # NumPy / SciPy baseline: dense first, then sparse.
        if backward:
            costs_baseline = [
                measure_cost_backward_baseline(
                    repeat, np.dot, np.transpose, lhs_dns_np, rhs_dns_np),
                measure_cost_backward_baseline(
                    repeat, sp.spmatrix.dot, sp.spmatrix.transpose,
                    lhs_csr_sp, rhs_dns_np),
            ]
        else:
            costs_baseline = [
                measure_cost_forward_baseline(
                    repeat, np.dot, lhs_dns_np, rhs_dns_np),
                measure_cost_forward_baseline(
                    repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np),
            ]
        ratio_baseline = costs_baseline[0] / costs_baseline[1]

        print(row_fmt % (density * 100, str(ctx), n, m, k, ratio,
                         costs[0], costs[1], ratio_baseline,
                         costs_baseline[0], costs_baseline[1]))

    def bench_dot_forward(m, k, n, density, ctx, repeat):
        """Benchmark dot(csr, dns)."""
        bench_dot(m, k, n, density, ctx, repeat, backward=False)

    def bench_dot_backward(m, k, n, density, ctx, repeat):
        """Benchmark dot(csr.T, dns)."""
        bench_dot(m, k, n, density, ctx, repeat, backward=True)

    print("A = sparse NDArray of shape(m, k)")
    print("B = dense NDArray of shape(k, n)")
    print("dot_forward\tdot(csr, dns)")
    print(header)

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))

    # TODO(haibin) make these runtime options
    m = 512
    k = [50000, 100000]
    n = [64, 128]
    density = [
        1.00, 0.90, 0.70, 0.50, 0.30, 0.20, 0.10, 0.07, 0.05, 0.02, 0.01,
        0.005, 0.001
    ]
    num_repeat = 10
    # Only the CPU is benchmarked; mx.gpu(0) can be added to this list.
    contexts = [mx.cpu()]

    # k and n are parallel lists: the i-th entries are benchmarked together.
    for k_dim, n_dim in zip(k, n):
        for ctx in contexts:
            for den in density:
                bench_dot_forward(m, k_dim, n_dim, den, ctx, num_repeat)

    print("dot_backward\tdot(csr.T, dns)")
    print(header)
    for k_dim, n_dim in zip(k, n):
        for ctx in contexts:
            for den in density:
                bench_dot_backward(m, k_dim, n_dim, den, ctx, num_repeat)
logger.info('Running model %s for inference', symbol_file) acc_m = mx.metric.create('acc') mod = mx.mod.Module(symbol=sym, context=ctx, data_names=['csr_data', 'dns_data'], label_names=[ label_name, ]) mod.bind(for_training=False, data_shapes=data.provide_data, label_shapes=data.provide_label) mod.set_params(arg_params, aux_params) check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) batch_data = [] nbatch = 0 while nbatch < args.num_batches: for batch in data: batch_data.append(batch) nbatch += 1 if nbatch < args.num_batches: continue else: break data.hard_reset() #for data warmup wi = args.num_warmup i = 0 for batch in batch_data: