def benchmarkLuOp(self):
    """Benchmark `linalg_ops.lu` for every shape in `self.shapes`.

    Each shape is measured on "/cpu:0" and, when
    `test.is_gpu_available(True)` is truthy, on "/device:GPU:0" as well.
    """

    def _time_lu(dims, device_name, label_template):
        # Every measurement builds its own graph and session.
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    operand = variables.Variable(self._GenerateMatrix(dims))
                    factors, permutation = linalg_ops.lu(operand)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(factors, permutation),
                        min_iters=25,
                        name=label_template.format(shape=dims))

    for shape in self.shapes:
        _time_lu(shape, "/cpu:0", "lu_cpu_{shape}")
        if test.is_gpu_available(True):
            _time_lu(shape, "/device:GPU:0", "lu_gpu_{shape}")
def benchmarkMatrixInverseOp(self):
    """Time `linalg_ops.matrix_inverse` with `adjoint` off and on.

    For each `adjoint` in (False, True) and each shape in `self.shapes` the
    op is measured on "/cpu:0"; the "/gpu:0" measurement only happens when
    `test.is_gpu_available(True)` is truthy.
    """

    def _placements():
        # Lazy generator: the GPU probe runs only after the CPU measurement
        # for the current configuration has completed.
        yield "/cpu:0", "matrix_inverse_cpu_{shape}_adjoint_{adjoint}"
        if test.is_gpu_available(True):
            yield "/gpu:0", "matrix_inverse_gpu_{shape}_adjoint_{adjoint}"

    for adjoint in (False, True):
        for shape in self.shapes:
            for device_name, label_template in _placements():
                with ops.Graph().as_default(), \
                        session.Session(
                            config=benchmark.benchmark_config()) as sess, \
                        ops.device(device_name):
                    operand = self._GenerateMatrix(shape)
                    inverse = linalg_ops.matrix_inverse(
                        operand, adjoint=adjoint)
                    self.evaluate(variables.global_variables_initializer())
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(inverse),
                        min_iters=25,
                        name=label_template.format(
                            shape=shape, adjoint=adjoint))
def benchmarkMatrixExponentialOp(self):
    """Benchmark `linalg_impl.matrix_exponential` over `self.shapes`.

    The CPU is always measured; "/gpu:0" is measured in addition when
    `test.is_gpu_available(True)` is truthy.
    """

    def _measure(dims, device_name, label_template):
        # One graph + session per (shape, device) pair.
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    operand = self._GenerateMatrix(dims)
                    exponential = linalg_impl.matrix_exponential(operand)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(exponential),
                        min_iters=25,
                        name=label_template.format(shape=dims))

    for shape in self.shapes:
        _measure(shape, "/cpu:0", "matrix_exponential_cpu_{shape}")
        if test.is_gpu_available(True):
            _measure(shape, "/gpu:0", "matrix_exponential_gpu_{shape}")
def benchmarkQROp(self):
    """Benchmark `linalg_ops.qr` on uniform random float32 matrices.

    For each entry of `self.shapes` a matrix drawn from U(-1, 1) is stored
    in a variable and factorised on the CPU, then on "/device:GPU:0" when
    `test.is_gpu_available(True)` is truthy.
    """

    def _measure(dims, device_name, label_template):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    samples = np.random.uniform(
                        low=-1.0, high=1.0, size=dims)
                    operand = variables.Variable(samples.astype(np.float32))
                    q_factor, r_factor = linalg_ops.qr(operand)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(q_factor, r_factor),
                        min_iters=25,
                        name=label_template.format(shape=dims))

    for shape_ in self.shapes:
        _measure(shape_, "/cpu:0", "QR_cpu_{shape}")
        if test.is_gpu_available(True):
            _measure(shape_, "/device:GPU:0", "QR_gpu_{shape}")
def benchmarkMatrixBandPartOp(self):
    """Benchmark `array_ops.matrix_band_part` on all-ones matrices.

    Every shape in `self.shapes` is combined with the band limits
    (-1, -1), (-1, 0), (0, -1) and (2, 2). Each combination is measured on
    the CPU and, if `test_lib.is_gpu_available(True)` is truthy, on the GPU.
    """

    def _measure(dims, limits, device_name, label_template):
        num_lower, num_upper = limits
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    operand = variables.Variable(array_ops.ones(dims))
                    banded = array_ops.matrix_band_part(
                        operand, num_lower, num_upper)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(banded),
                        min_iters=10,
                        name=label_template.format(
                            shape=dims, limits=limits))

    for shape_ in self.shapes:
        for limits in ((-1, -1), (-1, 0), (0, -1), (2, 2)):
            _measure(shape_, limits, "/cpu:0",
                     "matrix_band_part_cpu_{shape}_{limits}")
            if test_lib.is_gpu_available(True):
                _measure(shape_, limits, "/gpu:0",
                         "matrix_band_part_gpu_{shape}_{limits}")
def benchmarkMatrixDeterminantOp(self):
    """Benchmark `linalg_ops.matrix_determinant` for each of `self.shapes`.

    Runs on "/cpu:0" and, when `test.is_gpu_available(True)` is truthy,
    on "/gpu:0".
    """

    def _placements():
        # The GPU availability check is deferred until the CPU measurement
        # for the current shape is done.
        yield "/cpu:0", "matrix_determinant_cpu_{shape}"
        if test.is_gpu_available(True):
            yield "/gpu:0", "matrix_determinant_gpu_{shape}"

    for shape in self.shapes:
        for device_name, label_template in _placements():
            with ops.Graph().as_default(), \
                    session.Session(
                        config=benchmark.benchmark_config()) as sess, \
                    ops.device(device_name):
                operand = self._GenerateMatrix(shape)
                determinant = linalg_ops.matrix_determinant(operand)
                variables.global_variables_initializer().run()
                self.run_op_benchmark(
                    sess,
                    control_flow_ops.group(determinant),
                    min_iters=25,
                    name=label_template.format(shape=shape))
def benchmarkMatrixSolveLsOp(self):
    """Benchmark `linalg_ops.matrix_solve_ls` with a regularizer of 1.0.

    For each shape in `self.matrix_shapes`, the number of right-hand sides
    takes the values 1, 2 and the last dimension of the shape. The CPU is
    always measured. The GPU is measured only when a GPU was detected up
    front and the shape either has fewer than three dimensions or a leading
    dimension below 513.
    """
    gpu_present = test_lib.is_gpu_available(True)
    regularizer = 1.0

    def _measure(dims, rhs_count, device_name, label_template):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    matrix, rhs = _GenerateTestData(dims, rhs_count)
                    solution = linalg_ops.matrix_solve_ls(
                        matrix, rhs, regularizer)
                    self.evaluate(variables.global_variables_initializer())
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(solution),
                        min_iters=25,
                        store_memory_usage=False,
                        name=label_template.format(
                            matrix_shape=dims, num_rhs=rhs_count))

    for matrix_shape in self.matrix_shapes:
        for num_rhs in (1, 2, matrix_shape[-1]):
            _measure(
                matrix_shape, num_rhs, "/cpu:0",
                "matrix_solve_ls_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}")
            if gpu_present and (
                    len(matrix_shape) < 3 or matrix_shape[0] < 513):
                _measure(
                    matrix_shape, num_rhs, "/gpu:0",
                    ("matrix_solve_ls_gpu_shape_{matrix_shape}_num_rhs_"
                     "{num_rhs}"))
def benchmarkMatrixDeterminantOp(self):
    """Measure `linalg_ops.matrix_determinant` on each shape in `self.shapes`.

    A CPU run is always performed; a "/gpu:0" run follows whenever
    `test.is_gpu_available(True)` is truthy.
    """

    def _measure(dims, device_name, label_template):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    operand = self._GenerateMatrix(dims)
                    determinant = linalg_ops.matrix_determinant(operand)
                    self.evaluate(variables.global_variables_initializer())
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(determinant),
                        min_iters=25,
                        name=label_template.format(shape=dims))

    for shape in self.shapes:
        _measure(shape, "/cpu:0", "matrix_determinant_cpu_{shape}")
        if test.is_gpu_available(True):
            _measure(shape, "/gpu:0", "matrix_determinant_gpu_{shape}")
def benchmarkMatrixSolveLsOp(self):
    """Benchmark regularised least squares via `linalg_ops.matrix_solve_ls`.

    Shapes come from `self.matrix_shapes`; the right-hand-side count takes
    the values 1, 2 and the shape's last dimension. GPU measurements are
    skipped when no GPU was detected, or when the shape has at least three
    dimensions and a leading dimension of 513 or more.
    """
    gpu_present = test_lib.is_gpu_available(True)
    regularizer = 1.0

    def _placements(dims):
        yield ("/cpu:0",
               "matrix_solve_ls_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}")
        # Same predicate as `len(dims) < 3 or dims[0] < 513`, written via
        # De Morgan; short-circuiting still guards the dims[0] access.
        too_large = len(dims) >= 3 and dims[0] >= 513
        if gpu_present and not too_large:
            yield ("/gpu:0",
                   ("matrix_solve_ls_gpu_shape_{matrix_shape}_num_rhs_"
                    "{num_rhs}"))

    for matrix_shape in self.matrix_shapes:
        for num_rhs in (1, 2, matrix_shape[-1]):
            for device_name, label_template in _placements(matrix_shape):
                with ops.Graph().as_default(), \
                        session.Session(
                            config=benchmark.benchmark_config()) as sess, \
                        ops.device(device_name):
                    matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
                    solution = linalg_ops.matrix_solve_ls(
                        matrix, rhs, regularizer)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(solution),
                        min_iters=25,
                        store_memory_usage=False,
                        name=label_template.format(
                            matrix_shape=matrix_shape, num_rhs=num_rhs))
def benchmarkCholeskyOp(self):
    """Benchmark `linalg_ops.cholesky` for every shape in `self.shapes`.

    The input matrix is held in a variable. The CPU is always measured and
    "/device:GPU:0" is added when `test.is_gpu_available(True)` is truthy.
    """

    def _measure(dims, device_name, label_template):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    operand = variables.Variable(self._GenerateMatrix(dims))
                    factor = linalg_ops.cholesky(operand)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(factor),
                        min_iters=25,
                        name=label_template.format(shape=dims))

    for shape in self.shapes:
        _measure(shape, "/cpu:0", "cholesky_cpu_{shape}")
        if test.is_gpu_available(True):
            _measure(shape, "/device:GPU:0", "cholesky_gpu_{shape}")
def benchmarkMatrixInverseOp(self):
    """Benchmark `linalg_ops.matrix_inverse` for both `adjoint` settings.

    Loops `adjoint` over (False, True) and then over `self.shapes`. Each
    configuration is measured on the CPU and, when
    `test.is_gpu_available(True)` is truthy, on "/gpu:0".
    """

    def _measure(dims, use_adjoint, device_name, label_template):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    operand = self._GenerateMatrix(dims)
                    inverse = linalg_ops.matrix_inverse(
                        operand, adjoint=use_adjoint)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(inverse),
                        min_iters=25,
                        name=label_template.format(
                            shape=dims, adjoint=use_adjoint))

    for adjoint in (False, True):
        for shape in self.shapes:
            _measure(shape, adjoint, "/cpu:0",
                     "matrix_inverse_cpu_{shape}_adjoint_{adjoint}")
            if test.is_gpu_available(True):
                _measure(shape, adjoint, "/gpu:0",
                         "matrix_inverse_gpu_{shape}_adjoint_{adjoint}")
def benchmarkMatrixBandPartOp(self):
    """Time `array_ops.matrix_band_part` across shapes and band limits.

    For each shape in `self.shapes` and each limits pair among (-1, -1),
    (-1, 0), (0, -1), (2, 2), an all-ones variable is band-extracted on the
    CPU, and on the GPU when `test_lib.is_gpu_available(True)` is truthy.
    """

    def _placements():
        # Generator so the GPU probe happens after the CPU measurement.
        yield "/cpu:0", "matrix_band_part_cpu_{shape}_{limits}"
        if test_lib.is_gpu_available(True):
            yield "/gpu:0", "matrix_band_part_gpu_{shape}_{limits}"

    band_limits = ((-1, -1), (-1, 0), (0, -1), (2, 2))
    for shape_ in self.shapes:
        for limits in band_limits:
            for device_name, label_template in _placements():
                with ops.Graph().as_default(), \
                        session.Session(
                            config=benchmark.benchmark_config()) as sess, \
                        ops.device(device_name):
                    operand = variables.Variable(array_ops.ones(shape_))
                    banded = array_ops.matrix_band_part(
                        operand, limits[0], limits[1])
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(banded),
                        min_iters=10,
                        name=label_template.format(
                            shape=shape_, limits=limits))
def benchmarkMatrixSolveOp(self):
    """Benchmark `linalg_ops.matrix_solve` over adjoint, shape and rhs count.

    `adjoint` runs over (False, True), shapes over `self.matrix_shapes`, and
    the number of right-hand sides over 1, 2 and the shape's last dimension.
    GPU availability is probed once up front; the GPU is measured for every
    configuration when it is present.
    """
    gpu_present = test.is_gpu_available(True)

    def _measure(dims, rhs_count, use_adjoint, device_name, label_template):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_name):
                    matrix, rhs = self._GenerateTestData(dims, rhs_count)
                    solution = linalg_ops.matrix_solve(
                        matrix, rhs, adjoint=use_adjoint)
                    self.evaluate(variables.global_variables_initializer())
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(solution),
                        min_iters=25,
                        store_memory_usage=False,
                        name=label_template.format(
                            matrix_shape=dims,
                            num_rhs=rhs_count,
                            adjoint=use_adjoint))

    for adjoint in (False, True):
        for matrix_shape in self.matrix_shapes:
            for num_rhs in (1, 2, matrix_shape[-1]):
                _measure(
                    matrix_shape, num_rhs, adjoint, "/cpu:0",
                    ("matrix_solve_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}_"
                     "adjoint_{adjoint}"))
                if gpu_present:
                    _measure(
                        matrix_shape, num_rhs, adjoint, "/gpu:0",
                        ("matrix_solve_gpu_shape_{matrix_shape}_num_rhs_"
                         "{num_rhs}_adjoint_{adjoint}"))
def benchmarkTridiagonalMulOp(self):
    """Compare `self.baseline` against `linalg_impl.tridiagonal_matmul`.

    Iterates the product of the available devices (CPU, plus GPU when
    `test.is_gpu_available(cuda_only=True)` is truthy) and `self.sizes`,
    whose entries unpack as (m, batch_size, n). Both implementations are
    built in the same graph and benchmarked one after the other.
    """
    targets = [('/cpu:0', 'cpu')]
    if test.is_gpu_available(cuda_only=True):
        targets.append(('/gpu:0', 'gpu'))

    for (device_id, device_name), (m, batch_size, n) in itertools.product(
            targets, self.sizes):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_id):
                    upper, diag, lower, vec = self._generateData(
                        batch_size, m, n)
                    baseline_out = self.baseline(upper, diag, lower, vec)
                    op_out = linalg_impl.tridiagonal_matmul(
                        (upper, diag, lower), vec,
                        diagonals_format='sequence')
                    variables.global_variables_initializer().run()
                    label_args = (device_name, batch_size, m, n)
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(baseline_out),
                        min_iters=10,
                        store_memory_usage=False,
                        name=('tridiagonal_matmul_baseline_%s'
                              '_batch_size_%d_m_%d_n_%d' % label_args))
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(op_out),
                        min_iters=10,
                        store_memory_usage=False,
                        name=('tridiagonal_matmul_%s_batch_size_%d_m_%d_n_%d'
                              % label_args))
def benchmarkTridiagonalSolveOp(self):
    """Benchmark `linalg_impl.tridiagonal_solve` over device/pivoting/size.

    Devices are the CPU plus the GPU when
    `test.is_gpu_available(cuda_only=True)` is truthy. Entries of
    `self.pivoting_options` unpack as (pivoting, pivoting_name) and entries
    of `self.sizes` as (matrix_size, batch_size, num_rhs).
    """
    targets = [("/cpu:0", "cpu")]
    if test.is_gpu_available(cuda_only=True):
        targets.append(("/gpu:0", "gpu"))

    configurations = itertools.product(
        targets, self.pivoting_options, self.sizes)
    for (device_id, device_name), (pivoting, pivoting_name), size in \
            configurations:
        matrix_size, batch_size, num_rhs = size
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_id):
                    diags, rhs = self._generateData(
                        matrix_size, batch_size, num_rhs)
                    solution = linalg_impl.tridiagonal_solve(
                        diags, rhs, partial_pivoting=pivoting)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(solution),
                        min_iters=10,
                        store_memory_usage=False,
                        name=("tridiagonal_solve_{}_matrix_size_{}_batch_size_{}_"
                              "num_rhs_{}_{}").format(
                                  device_name, matrix_size, batch_size,
                                  num_rhs, pivoting_name))
def benchmarkBatchMatMulBroadcast(self):
    """Compare matmul's own broadcasting with explicit `broadcast_to`.

    For each (a_shape, b_shape) in `self.shape_pairs`, inside a
    `compat.forward_compatibility_horizon(2019, 4, 26)` scope and on the CPU,
    two float32 random-normal variables are multiplied twice: once directly
    and once after broadcasting both operands to a common batch shape.
    """
    for a_shape, b_shape in self.shape_pairs:
        with compat.forward_compatibility_horizon(2019, 4, 26):
            with ops.Graph().as_default():
                with session.Session(
                        config=benchmark.benchmark_config()) as sess:
                    with ops.device("/cpu:0"):
                        lhs = variables.Variable(
                            GetRandomNormalInput(a_shape, np.float32))
                        rhs = variables.Variable(
                            GetRandomNormalInput(b_shape, np.float32))
                        variables.global_variables_initializer().run()

                        # Use batch matmul op's internal broadcasting.
                        implicit_product = math_ops.matmul(lhs, rhs)
                        self.run_op_benchmark(
                            sess,
                            implicit_product,
                            min_iters=50,
                            name="batch_matmul_cpu_{}_{}".format(
                                a_shape, b_shape))

                        # Manually broadcast the input matrices using the
                        # broadcast_to op.
                        batch_dims = array_ops.broadcast_static_shape(
                            lhs.shape[:-2], rhs.shape[:-2])
                        lhs_full_shape = batch_dims.concatenate(
                            lhs.shape[-2:])
                        rhs_full_shape = batch_dims.concatenate(
                            rhs.shape[-2:])
                        explicit_product = math_ops.matmul(
                            array_ops.broadcast_to(lhs, lhs_full_shape),
                            array_ops.broadcast_to(rhs, rhs_full_shape))
                        self.run_op_benchmark(
                            sess,
                            explicit_product,
                            min_iters=50,
                            name="batch_matmul_manual_broadcast_cpu_{}_{}"
                            .format(a_shape, b_shape))
def benchmark_einsum(self):
    """Benchmark `special_math_ops.einsum` on the CPU for `self.cases`.

    Each case is an (equation, dim) pair. One float32 variable is created
    per input subscript, with every axis of length `dim`, filled from a
    `RandomState(0)` generator. Equations with at most two inputs are timed
    once; longer ones are timed for both the 'greedy' and 'auto' `optimize`
    settings.
    """
    for equation, dim in self.cases:
        with ops.Graph().as_default(), \
                session.Session(
                    config=benchmark.benchmark_config()) as sess, \
                ops.device('/cpu:0'):
            rng = np.random.RandomState(0)
            subscripts = equation.split('->')[0].split(',')
            operands = [
                variables.Variable(
                    np.array(rng.randn(*((dim,) * len(term))), np.float32))
                for term in subscripts
            ]
            variables.global_variables_initializer().run()

            if len(operands) > 2:
                for optimize in ['greedy', 'auto']:
                    self.run_op_benchmark(
                        sess,
                        special_math_ops.einsum(
                            equation, *operands, optimize=optimize),
                        min_iters=50,
                        name='einsum_cpu_({})_{}_{}'.format(
                            equation, optimize, dim))
            else:
                self.run_op_benchmark(
                    sess,
                    special_math_ops.einsum(equation, *operands),
                    min_iters=50,
                    name='einsum_cpu_({})_{}'.format(equation, dim))
def benchmarkBatchMatMulBroadcast(self):
    """Benchmark broadcasting matmul against manual `broadcast_to` + matmul.

    Runs on the CPU for every (a_shape, b_shape) in `self.shape_pairs`,
    using float32 random-normal variables as operands.
    """

    def _label(template, a_shape, b_shape):
        return template.format(a_shape, b_shape)

    for a_shape, b_shape in self.shape_pairs:
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device("/cpu:0"):
                    lhs = variables.Variable(
                        GetRandomNormalInput(a_shape, np.float32))
                    rhs = variables.Variable(
                        GetRandomNormalInput(b_shape, np.float32))
                    variables.global_variables_initializer().run()

                    # Use batch matmul op's internal broadcasting.
                    self.run_op_benchmark(
                        sess,
                        math_ops.matmul(lhs, rhs),
                        min_iters=50,
                        name=_label(
                            "batch_matmul_cpu_{}_{}", a_shape, b_shape))

                    # Manually broadcast the input matrices using the
                    # broadcast_to op.
                    batch_dims = array_ops.broadcast_static_shape(
                        lhs.shape[:-2], rhs.shape[:-2])
                    widened_lhs = array_ops.broadcast_to(
                        lhs, batch_dims.concatenate(lhs.shape[-2:]))
                    widened_rhs = array_ops.broadcast_to(
                        rhs, batch_dims.concatenate(rhs.shape[-2:]))
                    self.run_op_benchmark(
                        sess,
                        math_ops.matmul(widened_lhs, widened_rhs),
                        min_iters=50,
                        name=_label(
                            "batch_matmul_manual_broadcast_cpu_{}_{}",
                            a_shape, b_shape))
def benchmarkBatchSelect(self):
    """Benchmark three-argument `array_ops.where` and print its throughput.

    Sweeps m over (1000, 10000, 100000), n over (10, 100, 1000) and
    `use_gpu` over (False, True). `x` and `y` are [m, n] float32 resource
    variables and the condition `c` is an [m] boolean resource variable, all
    initialised from uniform random tensors.
    """
    row_counts = [1000, 10000, 100000]
    col_counts = [10, 100, 1000]
    for m, n, use_gpu in itertools.product(
            row_counts, col_counts, [False, True]):
        name = "m_%d_n_%d_use_gpu_%s" % (m, n, use_gpu)
        if use_gpu:
            device_kind = "gpu"
        else:
            device_kind = "cpu"
        device = "/%s:0" % device_kind

        with ops.Graph().as_default():
            # Graph construction happens under the device scope; the session
            # is opened afterwards, outside of it.
            with ops.device(device):
                x = resource_variable_ops.ResourceVariable(
                    random_ops.random_uniform([m, n], dtype=dtypes.float32))
                y = resource_variable_ops.ResourceVariable(
                    random_ops.random_uniform([m, n], dtype=dtypes.float32))
                c = resource_variable_ops.ResourceVariable(
                    random_ops.random_uniform(
                        [m], dtype=dtypes.float32) <= 0.5)
                op = array_ops.where(c, x, y)

            with session.Session(
                    config=benchmark.benchmark_config()) as sess:
                for variable in (x, y, c):
                    self.evaluate(variable.initializer)
                result = self.run_op_benchmark(
                    sess, op, min_iters=100, name=name)
                wall_time = result["wall_time"]
                # approximate size of output: m*n*2 floats for each axis.
                gb_processed = m * n * 8 / 1.0e9
                throughput = gb_processed / wall_time
                print("Benchmark: %s \t wall_time: %0.03g s \t "
                      "Throughput: %0.03g GB/s"
                      % (name, wall_time, throughput))
                sys.stdout.flush()
def _benchmark(self, generate_data_fn, test_name_format_string):
    """Benchmark `linalg_impl.tridiagonal_solve` over device/pivoting/size.

    Args:
      generate_data_fn: callable invoked as
        `generate_data_fn(matrix_size, batch_size, num_rhs)`; must return a
        `(diags, rhs)` pair that is passed to `tridiagonal_solve`.
      test_name_format_string: `str.format` template receiving, in order,
        device name, matrix size, batch size, number of right-hand sides
        and pivoting name.

    Devices are the CPU plus the GPU when
    `test.is_gpu_available(cuda_only=True)` is truthy. Entries of
    `self.pivoting_options` unpack as (pivoting, pivoting_name) and entries
    of `self.sizes` as (matrix_size, batch_size, num_rhs).
    """
    devices = [("/cpu:0", "cpu")]
    if test.is_gpu_available(cuda_only=True):
        devices.append(("/gpu:0", "gpu"))

    for device_option, pivoting_option, size_option in itertools.product(
            devices, self.pivoting_options, self.sizes):
        device_id, device_name = device_option
        pivoting, pivoting_name = pivoting_option
        matrix_size, batch_size, num_rhs = size_option

        # Pivoting is not supported by XLA backends. Skip only this
        # configuration: a `return` here would also drop every remaining
        # non-pivoting configuration. Checking before the graph is built
        # avoids constructing a graph/session that is never measured.
        if pivoting and test.is_xla_enabled():
            continue

        with ops.Graph().as_default(), \
                session.Session(
                    config=benchmark.benchmark_config()) as sess, \
                ops.device(device_id):
            diags, rhs = generate_data_fn(matrix_size, batch_size, num_rhs)
            x = linalg_impl.tridiagonal_solve(
                diags, rhs, partial_pivoting=pivoting)
            self.evaluate(variables.global_variables_initializer())
            self.run_op_benchmark(
                sess,
                control_flow_ops.group(x),
                min_iters=10,
                store_memory_usage=False,
                name=test_name_format_string.format(
                    device_name, matrix_size, batch_size, num_rhs,
                    pivoting_name))
def benchmarkWhere(self):
    """Benchmark single-argument `array_ops.where` and print its throughput.

    Sweeps m over (10,), n over powers of ten from 10 to 1000000, the
    threshold p over (0.01, 0.5, 0.99) and `use_gpu` over (False, True).
    The input is an (m, n) boolean resource variable produced by comparing
    a uniform random tensor against p.
    """
    row_counts = [10]
    col_counts = [10, 100, 1000, 10000, 100000, 1000000]
    thresholds = [0.01, 0.5, 0.99]
    for m, n, p, use_gpu in itertools.product(
            row_counts, col_counts, thresholds, [False, True]):
        name = "m_%d_n_%d_p_%g_use_gpu_%s" % (m, n, p, use_gpu)
        if use_gpu:
            device_kind = "gpu"
        else:
            device_kind = "cpu"
        device = "/%s:0" % device_kind

        with ops.Graph().as_default():
            # The op is built under the device scope; the session is opened
            # afterwards, outside of it.
            with ops.device(device):
                mask = random_ops.random_uniform(
                    (m, n), dtype=dtypes.float32) <= p
                v = resource_variable_ops.ResourceVariable(mask)
                op = array_ops.where(v)

            with session.Session(
                    config=benchmark.benchmark_config()) as sess:
                self.evaluate(v.initializer)
                result = self.run_op_benchmark(
                    sess, op, min_iters=100, name=name)
                wall_time = result["wall_time"]
                gb_processed_input = m * n / 1.0e9
                # approximate size of output: m*n*p int64s for each axis.
                gb_processed_output = 2 * 8 * m * n * p / 1.0e9
                gb_processed = gb_processed_input + gb_processed_output
                throughput = gb_processed / wall_time
                print("Benchmark: %s \t wall_time: %0.03g s \t "
                      "Throughput: %0.03g GB/s"
                      % (name, wall_time, throughput))
                sys.stdout.flush()
def benchmarkEinsum(self):
    """Benchmark two einsum entry points on the CPU for `self.cases`.

    Each case is an (equation, dim) pair. One float32 variable is created
    per input subscript, with every axis of length `dim`, filled from a
    `RandomState(0)` generator. `special_math_ops.einsum` is reported as
    "v1" and `gen_linalg_ops.einsum` as "v2".
    """
    for equation, dim in self.cases:
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device('/cpu:0'):
                    rng = np.random.RandomState(0)
                    subscripts = equation.split('->')[0].split(',')
                    operands = [
                        variables.Variable(
                            np.array(rng.randn(*((dim,) * len(term))),
                                     np.float32))
                        for term in subscripts
                    ]
                    self.evaluate(variables.global_variables_initializer())

                    # Call einsum_v1.
                    v1_result = special_math_ops.einsum(equation, *operands)
                    self.run_op_benchmark(
                        sess,
                        v1_result,
                        min_iters=50,
                        name='einsum_v1_cpu_({})_{}'.format(equation, dim))

                    # Call gen_linalg_ops.einsum.
                    v2_result = gen_linalg_ops.einsum(operands, equation)
                    self.run_op_benchmark(
                        sess,
                        v2_result,
                        min_iters=50,
                        name='einsum_v2_cpu_({})_{}'.format(equation, dim))
def benchmarkTridiagonalMulOp(self):
    """Compare `self.baseline` with `linalg_impl.tridiagonal_matmul` on CPU.

    Entries of `self.sizes` unpack as (batch_size, matrix_size). Both
    implementations are built in one graph and benchmarked back to back.
    """

    def _measure(device_id, device_name, batch_size, matrix_size):
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_id):
                    upper, diag, lower, vec = self._generateData(
                        batch_size, matrix_size)
                    baseline_out = self.baseline(upper, diag, lower, vec)
                    op_out = linalg_impl.tridiagonal_matmul(
                        (upper, diag, lower), vec)
                    variables.global_variables_initializer().run()
                    label_args = (device_name, batch_size, matrix_size)
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(baseline_out),
                        min_iters=10,
                        store_memory_usage=False,
                        name=('tridiagonal_matmul_baseline_%s'
                              '_batch_size_%d_matrix_size_%d' % label_args))
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(op_out),
                        min_iters=10,
                        store_memory_usage=False,
                        name=('tridiagonal_matmul_%s_batch_size_%d_matrix_size_%d'
                              % label_args))

    devices = [('/cpu:0', 'cpu')]
    for device_id, device_name in devices:
        for batch_size, matrix_size in self.sizes:
            _measure(device_id, device_name, batch_size, matrix_size)
def benchmarkBatchSelect(self):
    """Time `array_ops.where(c, x, y)` and print the resulting throughput.

    Sweeps m over (1000, 10000, 100000), n over (10, 100, 1000) and
    `use_gpu` over (False, True). `x`, `y` ([m, n] float32) and `c` ([m]
    boolean) are resource variables initialised from uniform random tensors.
    """

    def _uniform_variable(dims, threshold=None):
        # Builds a resource variable from a uniform random tensor; with a
        # threshold the tensor is first turned into a `<=` boolean mask.
        initial = random_ops.random_uniform(dims, dtype=dtypes.float32)
        if threshold is not None:
            initial = initial <= threshold
        return resource_variable_ops.ResourceVariable(initial)

    for m, n, use_gpu in itertools.product(
            [1000, 10000, 100000], [10, 100, 1000], [False, True]):
        name = "m_%d_n_%d_use_gpu_%s" % (m, n, use_gpu)
        device = "/%s:0" % ("gpu" if use_gpu else "cpu")
        with ops.Graph().as_default():
            with ops.device(device):
                x = _uniform_variable([m, n])
                y = _uniform_variable([m, n])
                c = _uniform_variable([m], threshold=0.5)
                op = array_ops.where(c, x, y)
            with session.Session(
                    config=benchmark.benchmark_config()) as sess:
                x.initializer.run()
                y.initializer.run()
                c.initializer.run()
                report = self.run_op_benchmark(
                    sess, op, min_iters=100, name=name)
                elapsed = report["wall_time"]
                # approximate size of output: m*n*2 floats for each axis.
                gb_processed = m * n * 8 / 1.0e9
                throughput = gb_processed / elapsed
                print("Benchmark: %s \t wall_time: %0.03g s \t "
                      "Throughput: %0.03g GB/s" % (name, elapsed, throughput))
                sys.stdout.flush()
def benchmark_times_an_op(self):
    """Benchmark adding a scalar constant to itself.

    Returns:
      Whatever `self.run_op_benchmark` returns for the addition, run with
      `min_iters=1000`, tracing enabled and the name "op_benchmark".
    """
    config = benchmark.benchmark_config()
    with session.Session(config=config) as sess:
        zero = constant_op.constant(0.0)
        doubled = zero + zero
        report = self.run_op_benchmark(
            sess,
            doubled,
            min_iters=1000,
            store_trace=True,
            name="op_benchmark")
        return report
def benchmark_times_an_op(self):
    """Benchmark adding a fed float32 vector of length 5 to itself.

    Returns:
      Whatever `self.run_op_benchmark` returns for the addition, run with
      `min_iters=1000`, tracing enabled and the name "op_benchmark".
    """
    input_size = 5
    with session.Session(config=benchmark.benchmark_config()) as sess:
        # `(input_size)` is just a parenthesised int; spell the rank-1 shape
        # as a real one-element tuple so the intent is explicit.
        a = array_ops.placeholder(dtype=dtypes.float32, shape=(input_size,))
        a_plus_a = a + a
        return self.run_op_benchmark(
            sess,
            a_plus_a,
            feed_dict={a: np.arange(input_size)},
            min_iters=1000,
            store_trace=True,
            name="op_benchmark")
def benchmark_times_an_op(self):
    """Benchmark `a + a` for a placeholder fed with `np.arange(5)`.

    Returns:
      The result of `self.run_op_benchmark` for the addition, run with
      `min_iters=1000`, `store_trace=True` and the name "op_benchmark".
    """
    input_size = 5
    with session.Session(config=benchmark.benchmark_config()) as sess:
        # The shape must be a one-element tuple; `(input_size)` without the
        # trailing comma is a plain int and only reads like a tuple.
        a = array_ops.placeholder(dtype=dtypes.float32, shape=(input_size,))
        a_plus_a = a + a
        feed = {a: np.arange(input_size)}
        return self.run_op_benchmark(
            sess,
            a_plus_a,
            feed_dict=feed,
            min_iters=1000,
            store_trace=True,
            name="op_benchmark")
def _run_and_report_graphmode(self, fn, iters, burn_iters, benchmark_name,
                              xprof_enabled, **kwargs):
    """Runs and reports benchmarks in graph mode.

    Args:
      fn: callable invoked as `fn(data, **kwargs)` inside a
        `def_function.function` wrapper.
      iters: number of timed runs; also the divisor for the reported mean.
      burn_iters: number of untimed warm-up runs executed first.
      benchmark_name: reported name; `'_graph'` is appended to it.
      xprof_enabled: when truthy, `self._run_with_xprof` is called with the
        timing routine and its result is merged into the reported extras.
      **kwargs: forwarded verbatim to `fn`.

    Raises:
      ValueError: if `self.input_data` is None.
    """
    if self.input_data is None:
        raise ValueError('Input data is missing for {} benchmark'.format(
            benchmark_name))

    # Uses the benchmark config to disable the static graph optimizations
    with session.Session(config=benchmark.benchmark_config()) as sess:
        if hasattr(self, 'iterator'):
            sess.run(self.iterator.initializer)

        sess.run(lookup_ops.tables_initializer())
        sess.run(variables_lib.global_variables_initializer())

        inputs = sess.run(self.input_data)

        @def_function.function
        def benchmark_op(data):
            return fn(data, **kwargs)

        def run_benchmark():
            for _ in range(burn_iters):
                sess.run(benchmark_op(inputs))
            total_time = 0
            for _ in range(iters):
                # perf_counter is monotonic, so intervals are not distorted
                # by system clock adjustments the way time.time() can be.
                start_time = time.perf_counter()
                sess.run(benchmark_op(inputs))
                total_time += time.perf_counter() - start_time
            return total_time

        total_time = run_benchmark()
        mean_time = total_time / iters
        extras = {'sec_per_batch': mean_time}

        metrics = []
        if hasattr(self, 'batch_number'):
            batches_per_sec = self.batch_number / mean_time
            extras.update({'batches_per_sec': batches_per_sec})
            metrics.append({
                'name': 'batches_per_sec',
                'value': batches_per_sec
            })

        if xprof_enabled:
            extras.update(self._run_with_xprof(run_benchmark))

        self.report_benchmark(
            wall_time=mean_time,
            name=benchmark_name + '_graph',
            extras=extras,
            metrics=metrics)
def benchmarkMatrixLogarithmOp(self):
    """Benchmark `gen_linalg_ops.matrix_logarithm` on the CPU.

    One measurement (at least 25 iterations) is taken for every shape in
    `self.shapes`, each in its own graph and session.
    """
    for shape in self.shapes:
        label = "matrix_logarithm_cpu_{shape}".format(shape=shape)
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device("/cpu:0"):
                    operand = self._GenerateMatrix(shape)
                    logarithm = gen_linalg_ops.matrix_logarithm(operand)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(logarithm),
                        min_iters=25,
                        name=label)
def benchmarkMatrixSolveOp(self):
    """Time `linalg_ops.matrix_solve` across adjoint, shape and rhs count.

    `adjoint` runs over (False, True), shapes over `self.matrix_shapes`, and
    the right-hand-side count over 1, 2 and the shape's last dimension. GPU
    availability is probed once; when present, every configuration is also
    measured on "/gpu:0".
    """
    gpu_present = test.is_gpu_available(True)

    # (device, name template) pairs measured for every configuration.
    placements = [
        ("/cpu:0",
         ("matrix_solve_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}_"
          "adjoint_{adjoint}")),
    ]
    if gpu_present:
        placements.append(
            ("/gpu:0",
             ("matrix_solve_gpu_shape_{matrix_shape}_num_rhs_"
              "{num_rhs}_adjoint_{adjoint}")))

    for adjoint in (False, True):
        for matrix_shape in self.matrix_shapes:
            for num_rhs in (1, 2, matrix_shape[-1]):
                for device_name, label_template in placements:
                    with ops.Graph().as_default(), \
                            session.Session(
                                config=benchmark.benchmark_config()
                            ) as sess, \
                            ops.device(device_name):
                        matrix, rhs = self._GenerateTestData(
                            matrix_shape, num_rhs)
                        solution = linalg_ops.matrix_solve(
                            matrix, rhs, adjoint=adjoint)
                        variables.global_variables_initializer().run()
                        self.run_op_benchmark(
                            sess,
                            control_flow_ops.group(solution),
                            min_iters=25,
                            store_memory_usage=False,
                            name=label_template.format(
                                matrix_shape=matrix_shape,
                                num_rhs=num_rhs,
                                adjoint=adjoint))
def benchmarkMatrixLogarithmOp(self):
    """Measure `gen_linalg_ops.matrix_logarithm` on "/cpu:0".

    Iterates `self.shapes`; every shape gets a fresh graph and session and
    is benchmarked for at least 25 iterations.
    """

    def _measure(dims):
        with ops.Graph().as_default(), \
                session.Session(
                    config=benchmark.benchmark_config()) as sess, \
                ops.device("/cpu:0"):
            source = self._GenerateMatrix(dims)
            result = gen_linalg_ops.matrix_logarithm(source)
            variables.global_variables_initializer().run()
            grouped = control_flow_ops.group(result)
            self.run_op_benchmark(
                sess,
                grouped,
                min_iters=25,
                name="matrix_logarithm_cpu_{shape}".format(shape=dims))

    for shape in self.shapes:
        _measure(shape)
def benchmarkVeryLarge2DFloatSparseTensor(self):
    """Compare a tensors-map round trip with serialize/deserialize.

    Builds a [64, 10000] sparse tensor with 10000 entries whose batch
    indices are drawn with a fixed NumPy seed. The tensor goes through
    `add_many_sparse_to_tensors_map` / `take_many_sparse_from_tensors_map`
    and through `serialize_many_sparse` / `deserialize_many_sparse`. Both
    results are asserted equal (values, indices, dense shape) before each
    path is benchmarked for at least 2000 iterations.
    """
    np.random.seed(127)
    num_elements = 10000
    batch_size = 64
    batch_ids = np.random.randint(
        batch_size, size=num_elements, dtype=np.int64)
    value_ids = np.arange(num_elements, dtype=np.int64)
    index_pairs = np.asarray(
        sorted(zip(batch_ids, value_ids)), dtype=np.int64)
    payload = ["feature_value_for_embedding_lookup"] * num_elements
    dense_dims = np.asarray([batch_size, num_elements], dtype=np.int64)

    with session.Session(config=benchmark.benchmark_config()) as sess, \
            ops.device("/cpu:0"):
        indices_var = variables.Variable(index_pairs)
        values_var = variables.Variable(payload)
        shape_var = variables.Variable(dense_dims)
        st = sparse_tensor_lib.SparseTensor(
            indices_var, values_var, shape_var)

        # Path 1: store in, then fetch back from, the tensors map.
        st_handles = add_many_sparse_to_tensors_map(st)
        st_roundtrip = take_many_sparse_from_tensors_map(
            sparse_map_op=st_handles.op, sparse_handles=st_handles)
        st_roundtrip_op = st_roundtrip.values.op

        # Path 2: serialize, then deserialize.
        st_serialized = sparse_ops.serialize_many_sparse(st)
        st_deserialized = sparse_ops.deserialize_many_sparse(
            st_serialized, dtype=values_var.dtype)
        st_deserialized_op = st_deserialized.values.op

        variables.global_variables_initializer().run()

        roundtrip_result = self.evaluate(st_roundtrip)
        deserialized_result = self.evaluate(st_deserialized)
        np.testing.assert_equal(
            roundtrip_result.values, deserialized_result.values)
        np.testing.assert_equal(
            roundtrip_result.indices, deserialized_result.indices)
        np.testing.assert_equal(
            roundtrip_result.dense_shape, deserialized_result.dense_shape)

        self.run_op_benchmark(
            sess,
            st_roundtrip_op,
            min_iters=2000,
            name="benchmark_very_large_2d_float_st_tensor_maps")
        self.run_op_benchmark(
            sess,
            st_deserialized_op,
            min_iters=2000,
            name="benchmark_very_large_2d_float_st_serialization")
def benchmarkVeryLarge2DFloatSparseTensor(self):
    """Benchmark tensors-map storage against sparse serialization.

    A sparse tensor with dense shape [64, 10000] and 10000 entries is built
    from seeded NumPy data and processed two ways: via the tensors map
    (add, then take) and via serialize/deserialize. The two results are
    checked for equal values, indices and dense shape, then each path is
    benchmarked for at least 2000 iterations.
    """
    np.random.seed(127)
    num_elements = 10000
    batch_size = 64
    rows = np.random.randint(batch_size, size=num_elements, dtype=np.int64)
    cols = np.arange(num_elements, dtype=np.int64)
    sorted_pairs = sorted(zip(rows, cols))
    raw_indices = np.asarray(sorted_pairs, dtype=np.int64)
    raw_values = ["feature_value_for_embedding_lookup"] * num_elements
    raw_shape = np.asarray([batch_size, num_elements], dtype=np.int64)

    with session.Session(config=benchmark.benchmark_config()) as sess:
        with ops.device("/cpu:0"):
            values = variables.Variable(raw_values)
            st = sparse_tensor_lib.SparseTensor(
                variables.Variable(raw_indices),
                values,
                variables.Variable(raw_shape))

            st_handles = add_many_sparse_to_tensors_map(st)
            st_roundtrip = take_many_sparse_from_tensors_map(
                sparse_map_op=st_handles.op, sparse_handles=st_handles)

            st_deserialized = sparse_ops.deserialize_many_sparse(
                sparse_ops.serialize_many_sparse(st), dtype=values.dtype)

            variables.global_variables_initializer().run()

            via_map = sess.run(st_roundtrip)
            via_serialization = sess.run(st_deserialized)
            # The two paths must agree component by component.
            np.testing.assert_equal(
                via_map.values, via_serialization.values)
            np.testing.assert_equal(
                via_map.indices, via_serialization.indices)
            np.testing.assert_equal(
                via_map.dense_shape, via_serialization.dense_shape)

            self.run_op_benchmark(
                sess,
                st_roundtrip.values.op,
                min_iters=2000,
                name="benchmark_very_large_2d_float_st_tensor_maps")
            self.run_op_benchmark(
                sess,
                st_deserialized.values.op,
                min_iters=2000,
                name="benchmark_very_large_2d_float_st_serialization")
def _BenchmarkGrad(grad_fn, name, device):
    """Benchmark `grad_fn` on `device` for every shape in `self.shapes`.

    `self` is not a parameter here; it is read from the enclosing scope.

    Args:
      grad_fn: Callable invoked as `grad_fn(l, grad_matrix)`, where `l` is a
        variable holding the NumPy Cholesky factor of the generated matrix
        and `grad_matrix` is a variable of float32 random normal values
        with the same shape. Its result is the benchmarked op.
      name: Prefix used in each benchmark name.
      device: Device specification passed to `ops.device`.
    """
    for shape in self.shapes:
        matrix = self._GenerateMatrix(shape)
        # A fresh graph and session per shape keeps the runs independent.
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device):
                    factor = variables.Variable(np.linalg.cholesky(matrix))
                    upstream = np.random.randn(*matrix.shape).astype(
                        np.float32)
                    grad_matrix = variables.Variable(upstream)
                    grad = grad_fn(factor, grad_matrix)
                    variables.global_variables_initializer().run()
                    # The name records the device the gradient op reports.
                    bench_name = "{name}_{dev}_{shape}".format(
                        name=name, dev=grad.device, shape=shape)
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(grad),
                        min_iters=25,
                        name=bench_name)
def benchmarkTridiagonalSolveOp(self):
    """Benchmark `tridiagonal_solve` on the CPU for each entry of `self.sizes`.

    Each entry is unpacked as `(matrix_size, batch_size, num_rhs)` and
    forwarded to `self._generateData`, whose output feeds the solver with
    `transpose_rhs=True`.
    """
    name_template = ("tridiagonal_solve_matrix_size_{}_batch_size_{}_"
                     "num_rhs_{}")
    for size in self.sizes:
        matrix_size, batch_size, num_rhs = size
        # Fresh graph and session per configuration.
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device("/cpu:0"):
                    diags, rhs = self._generateData(
                        matrix_size, batch_size, num_rhs)
                    solution = linalg_impl.tridiagonal_solve(
                        diags, rhs, transpose_rhs=True)
                    variables.global_variables_initializer().run()
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(solution),
                        min_iters=10,
                        store_memory_usage=False,
                        name=name_template.format(
                            matrix_size, batch_size, num_rhs))
def benchmarkTridiagonalMulOp(self):
    """Benchmark `tridiagonal_matmul` against `self.baseline`.

    Runs on the CPU, and additionally on the GPU when a CUDA GPU is
    reported as available, for every entry of `self.sizes`. Each entry is
    unpacked as `(m, batch_size, n)`. For each configuration the baseline
    op is timed first, then the `tridiagonal_matmul` op.
    """
    devices = [('/cpu:0', 'cpu')]
    if test.is_gpu_available(cuda_only=True):
        devices.append(('/gpu:0', 'gpu'))

    configurations = itertools.product(devices, self.sizes)
    for (device_id, device_name), (m, batch_size, n) in configurations:
        with ops.Graph().as_default():
            with session.Session(config=benchmark.benchmark_config()) as sess:
                with ops.device(device_id):
                    upper, diag, lower, vec = self._generateData(
                        batch_size, m, n)
                    baseline_out = self.baseline(upper, diag, lower, vec)
                    matmul_out = linalg_impl.tridiagonal_matmul(
                        (upper, diag, lower), vec,
                        diagonals_format='sequence')
                    self.evaluate(variables.global_variables_initializer())

                    # Both benchmark names share the same formatting arguments.
                    name_args = (device_name, batch_size, m, n)
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(baseline_out),
                        min_iters=10,
                        store_memory_usage=False,
                        name=('tridiagonal_matmul_baseline_%s'
                              '_batch_size_%d_m_%d_n_%d' % name_args))
                    self.run_op_benchmark(
                        sess,
                        control_flow_ops.group(matmul_out),
                        min_iters=10,
                        store_memory_usage=False,
                        name=('tridiagonal_matmul_%s_batch_size_%d_m_%d_n_%d'
                              % name_args))
def benchmarkWhere(self):
    """Benchmark `array_ops.where` on random boolean matrices.

    For each combination of shape `(m, n)`, threshold `p` and device, a
    boolean tensor `random_uniform((m, n)) <= p` is stored in a resource
    variable and `where` on that variable is timed. After each run an
    approximate throughput figure is printed to stdout.

    GPU configurations are only run when a CUDA GPU is reported as
    available, matching the other benchmarks in this file; without that
    guard the ops would be pinned to a `/gpu:0` device that may not exist.
    """
    use_gpu_options = [False]
    if test.is_gpu_available(cuda_only=True):
        use_gpu_options.append(True)

    for (m, n, p, use_gpu) in itertools.product(
            [10],
            [10, 100, 1000, 10000, 100000, 1000000],
            [0.01, 0.5, 0.99],
            use_gpu_options):
        name = "m_%d_n_%d_p_%g_use_gpu_%s" % (m, n, p, use_gpu)
        device = "/%s:0" % ("gpu" if use_gpu else "cpu")
        with ops.Graph().as_default():
            with ops.device(device):
                x = random_ops.random_uniform(
                    (m, n), dtype=dtypes.float32) <= p
                v = resource_variable_ops.ResourceVariable(x)
                op = array_ops.where(v)
            with session.Session(config=benchmark.benchmark_config()) as sess:
                v.initializer.run()
                r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
                # Input volume: counts one unit per input element
                # (presumably one byte per boolean -- confirm).
                gb_processed_input = m * n / 1.0e9
                # approximate size of output: m*n*p int64s for each axis.
                gb_processed_output = 2 * 8 * m * n * p / 1.0e9
                gb_processed = gb_processed_input + gb_processed_output
                throughput = gb_processed / r["wall_time"]
                print("Benchmark: %s \t wall_time: %0.03g s \t "
                      "Throughput: %0.03g GB/s" % (name, r["wall_time"],
                                                   throughput))
                sys.stdout.flush()
def benchmark_unicode_script(self):
    """Benchmark `string_ops.unicode_script` on generated input.

    The input comes from `self._generateBenchmarkInput(1000000)`; the op
    behind the resulting tensor is timed for at least 100 iterations.
    """
    with session.Session(config=benchmark.benchmark_config()) as sess:
        benchmark_input = self._generateBenchmarkInput(1000000)
        script_op = string_ops.unicode_script(benchmark_input).op
        self.run_op_benchmark(sess, script_op, min_iters=100)
def run_benchmark(self,
                  shape=(100, 100),
                  ragged_rank=None,
                  dtype=dtypes.float32,
                  fill=None,
                  default_shape=(),
                  output_shape=None,
                  min_iters=1000):
    """Run the ragged-to-dense benchmarks for one configuration.

    Four conversions are timed, in this order: `ragged_to_dense` and
    `to_tensor` on a placeholder fed with the evaluated ragged tensor
    ("with_splits"), then the same two on the tensor returned by
    `rebuild_ragged_tensor_with_value_rowids` ("with_rowids").

    Args:
      shape: Bounding box for the input ragged tensor.
      ragged_rank: Ragged rank for the input ragged tensor. Defaults to
        `len(shape) - 1`.
      dtype: Data type for the input ragged tensor.
      fill: How full each dimension should be (0-1). Corresponds 1:1 with
        `shape`. Defaults to 0.8 for each dimension.
      default_shape: Shape for the default (padding) value.
      output_shape: Intended output shape. NOTE: this argument is accepted
        but not read anywhere in the body.
      min_iters: Minimum iterations for each benchmark; the burn-in count
        is `max(5, min_iters // 10)`.
    """
    if ragged_rank is None:
        ragged_rank = len(shape) - 1
    if fill is None:
        fill = [0.8] * len(shape)

    # Build the inputs for the op.
    rt_input = self._generateRaggedTensor(shape, ragged_rank, dtype, fill)
    padding_source = self._generateRaggedTensor(default_shape, 0, dtype)
    default_value = constant_op.constant(padding_source, dtype=dtype)

    # Number of bounding-box elements, in units of 2**20.
    mbs = np.prod(shape) / (2**20)

    with session.Session(config=benchmark.benchmark_config()) as sess:
        extras = {
            'shape': shape,
            'ragged_rank': ragged_rank,
            'dtype': dtype,
            'fill': fill,
            'default_shape': default_shape
        }
        rt = ragged_factory_ops.constant(
            rt_input, dtype, ragged_rank=ragged_rank)

        # Inputs for the "with_splits" benchmarks.
        splits_rt_placeholder = ragged_factory_ops.placeholder(
            dtype, ragged_rank, shape[ragged_rank + 1:])
        splits_feed_dict = {splits_rt_placeholder: sess.run(rt)}

        # Inputs for the "with_rowids" benchmarks; the helper receives the
        # (initially empty) feed dict together with the session.
        rowids_feed_dict = {}
        rowids_rt_placeholder = rebuild_ragged_tensor_with_value_rowids(
            rt, rowids_feed_dict, sess)

        # Arguments shared by every benchmark run below.
        common_kwargs = {
            'sess': sess,
            'store_memory_usage': True,
            'min_iters': min_iters,
            'burn_iters': max(5, min_iters // 10),
            'mbs': mbs,
            'extras': extras,
        }

        def _bench(label, dense_result, feeds):
            # Time the op that produces `dense_result` under name `label`.
            self.run_op_benchmark(
                op_or_tensor=dense_result.op,
                name=label,
                feed_dict=feeds,
                **common_kwargs)

        # Each op is created immediately before it is timed.
        _bench(
            'ragged_to_dense_with_splits',
            ragged_conversion_ops.ragged_to_dense(
                splits_rt_placeholder, default_value=default_value),
            splits_feed_dict)
        _bench(
            'ragged_to_tensor_with_splits',
            splits_rt_placeholder.to_tensor(default_value=default_value),
            splits_feed_dict)
        _bench(
            'ragged_to_dense_with_rowids',
            ragged_conversion_ops.ragged_to_dense(
                rowids_rt_placeholder, default_value=default_value),
            rowids_feed_dict)
        _bench(
            'ragged_to_tensor_with_rowids',
            rowids_rt_placeholder.to_tensor(default_value=default_value),
            rowids_feed_dict)
def benchmark_times_an_op(self):
    """Benchmark a scalar constant added to itself and return the result.

    Returns:
      Whatever `run_op_benchmark` returns for the addition, run with
      `store_trace=True` under the name "op_benchmark".
    """
    with session.Session(config=benchmark.benchmark_config()) as sess:
        zero = constant_op.constant(0.0)
        doubled = zero + zero
        report = self.run_op_benchmark(
            sess,
            doubled,
            min_iters=1000,
            store_trace=True,
            name="op_benchmark")
    return report