def testSelectEverthingDetail(self):
  """Profiles a small model with every column selected and checks the dump.

  The profiler output is written to a temp file. The header row is compared
  verbatim, then the rows containing 'Conv2D ' and 'DW (3x3x3x6' are parsed
  to verify that time, device, float_ops, op count, input shapes and
  parameter counts were recorded.
  """

  def _metrics_of(row):
    # Comma-separated fields between the first '(' and the first ')'.
    return row[row.find('(') + 1:row.find(')')].split(',')

  def _assert_positive_pair(pattern, text):
    # Both regex groups must parse as floats strictly greater than zero.
    found = re.search(pattern, text)
    self.assertGreater(float(found.group(1)), 0.0)
    self.assertGreater(float(found.group(2)), 0.0)

  ops.reset_default_graph()
  dev = '/gpu:0' if test.is_gpu_available() else '/cpu:0'
  outfile = os.path.join(test.get_temp_dir(), 'dump')
  columns = ['micros', 'bytes', 'params', 'float_ops', 'occurrence',
             'device', 'op_types', 'input_shapes']
  option_builder = builder(builder.trainable_variables_parameter())
  opts = (option_builder
          .with_file_output(outfile)
          .with_accounted_types(['.*'])
          .select(columns)
          .build())

  config = config_pb2.ConfigProto()
  with session.Session(config=config) as sess, ops.device(dev):
    x = lib.BuildSmallModel()

    sess.run(variables.global_variables_initializer())
    run_meta = config_pb2.RunMetadata()
    trace_options = config_pb2.RunOptions(
        trace_level=config_pb2.RunOptions.FULL_TRACE)
    _ = sess.run(x, options=trace_options, run_metadata=run_meta)

    model_analyzer.profile(sess.graph, run_meta, options=opts)

    with gfile.Open(outfile, 'r') as f:
      # pylint: disable=line-too-long
      outputs = f.read().split('\n')

      self.assertEqual(
          outputs[0],
          'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes')
      for row in outputs[1:]:
        if row.find('Conv2D ') > 0:
          metrics = _metrics_of(row)
          # Make sure time is profiled. Without a GPU the stride of 2 skips
          # column 4 and checks only columns 3 and 5.
          gap = 1 if test.is_gpu_available() else 2
          for column in range(3, 6, gap):
            _assert_positive_pair('(.*)[um]s/(.*)[um]s', metrics[column])
          # Make sure device is profiled.
          mentions_gpu = metrics[6].find('gpu') > 0
          mentions_cpu = metrics[6].find('cpu') > 0
          if test.is_gpu_available():
            self.assertTrue(mentions_gpu)
            self.assertFalse(mentions_cpu)
          else:
            self.assertFalse(mentions_gpu)
            self.assertTrue(mentions_cpu)
          # Make sure float_ops is profiled.
          _assert_positive_pair('(.*)k/(.*)k flops', metrics[1].strip())
          # Make sure op_count is profiled.
          self.assertEqual(metrics[8].strip(), '1/1|1/1')
          # Make sure input_shapes is profiled.
          self.assertEqual(metrics[9].strip(), '0:2x6x6x3|1:3x3x3x6')

        if row.find('DW (3x3x3x6') > 0:
          metrics = _metrics_of(row)
          _assert_positive_pair('(.*)/(.*) params', metrics[1].strip())
      # pylint: enable=line-too-long
def testTraining(self):
  """Runs `_test_training` over several input shapes, dtypes and layouts.

  GPU variants are executed only when `test.is_gpu_available(cuda_only=True)`
  reports a CUDA GPU; the CPU variants use the 'NHWC' data format.
  """
  half_and_single = [np.float16, np.float32]

  def _train(shape, dtype, param_shape, on_gpu, layout):
    # Thin forwarder; np.float32 is always passed as the fourth argument.
    self._test_training(
        shape, dtype, param_shape, np.float32,
        use_gpu=on_gpu, data_format=layout)

  x_shape = [1, 1, 6, 1]
  for dtype in half_and_single:
    if test.is_gpu_available(cuda_only=True):
      _train(x_shape, dtype, [1], True, 'NHWC')
      _train(x_shape, dtype, [1], True, 'NCHW')
    _train(x_shape, dtype, [1], False, 'NHWC')

  x_shape = [1, 1, 6, 2]
  for dtype in half_and_single:
    if test.is_gpu_available(cuda_only=True):
      _train(x_shape, dtype, [2], True, 'NHWC')
    _train(x_shape, dtype, [2], False, 'NHWC')

  # This shape is exercised on GPU only, in NCHW.
  x_shape = [1, 2, 1, 6]
  if test.is_gpu_available(cuda_only=True):
    for dtype in half_and_single:
      _train(x_shape, dtype, [2], True, 'NCHW')

  x_shape = [27, 131, 127, 6]
  for dtype in half_and_single:
    if test.is_gpu_available(cuda_only=True):
      _train(x_shape, dtype, [131], True, 'NCHW')
      _train(x_shape, dtype, [6], True, 'NHWC')
    _train(x_shape, dtype, [6], False, 'NHWC')
def testBatchNormGrad(self):
  """Runs `_test_gradient` for both `is_training` values over several shapes.

  GPU variants are executed only when `test.is_gpu_available(cuda_only=True)`
  reports a CUDA GPU; the CPU variants use the 'NHWC' data format.
  """

  def _grad(shape, param_shape, on_gpu, layout, training):
    # Thin forwarder to the shared gradient checker.
    self._test_gradient(
        shape, param_shape,
        use_gpu=on_gpu, data_format=layout, is_training=training)

  for is_training in [True, False]:
    x_shape = [1, 1, 6, 1]
    if test.is_gpu_available(cuda_only=True):
      _grad(x_shape, [1], True, 'NHWC', is_training)
      _grad(x_shape, [1], True, 'NCHW', is_training)
    _grad(x_shape, [1], False, 'NHWC', is_training)

    x_shape = [1, 1, 6, 2]
    if test.is_gpu_available(cuda_only=True):
      _grad(x_shape, [2], True, 'NHWC', is_training)
    _grad(x_shape, [2], False, 'NHWC', is_training)

    # This shape is exercised on GPU only, in NCHW.
    x_shape = [1, 2, 1, 6]
    if test.is_gpu_available(cuda_only=True):
      _grad(x_shape, [2], True, 'NCHW', is_training)

    x_shape = [7, 9, 13, 6]
    if test.is_gpu_available(cuda_only=True):
      _grad(x_shape, [9], True, 'NCHW', is_training)
      _grad(x_shape, [6], True, 'NHWC', is_training)
    _grad(x_shape, [6], False, 'NHWC', is_training)
def testMaxPoolV2(self):
  """Compares a `_max_pool_v2` graph run with and without `_get_config()`.

  The test is a no-op unless a CUDA GPU is available. It checks the number
  of transpose nodes reported in the cost graph, the presence of specific
  node names, and that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  conv = _two_layer_model(x)
  ksize = constant_op.constant([1, 2, 3, 1], shape=[4])
  strides = array_ops.placeholder(dtype='int32', shape=[4])
  max_pool = gen_nn_ops._max_pool_v2(conv, ksize, strides, 'VALID')
  output = array_ops.identity(max_pool)

  # Strides are fed at run time rather than baked into the graph.
  strides_val = [1, 3, 2, 1]
  with session.Session() as sess:
    output_val_ref = sess.run(output, feed_dict={strides: strides_val})

  with session.Session(config=_get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(
        output, run_metadata=metadata, feed_dict={strides: strides_val})

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(1 for name in nodes if _is_transpose(name))

  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
  self._assert_trans_nchw_to_nhwc('MaxPoolV2-0-0', nodes)
  self._assert_vec_nhwc_to_nchw('MaxPoolV2-2', nodes)
  self.assertIn('MaxPoolV2-1-LayoutOptimizer', nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def benchmarkMatrixExponentialOp(self):
  """Benchmarks `matrix_exponential` on CPU, and on GPU when one is available.

  One benchmark is reported per entry of `self.shapes` and per device.
  """

  def _bench(device, name_template, shape):
    # Fresh graph and session per measurement, pinned to `device`.
    with ops.Graph().as_default(), \
        session.Session() as sess, \
        ops.device(device):
      matrix = self._GenerateMatrix(shape)
      expm = linalg_impl.matrix_exponential(matrix)
      variables.global_variables_initializer().run()
      self.run_op_benchmark(
          sess,
          control_flow_ops.group(expm),
          min_iters=25,
          name=name_template.format(shape=shape))

  for shape in self.shapes:
    _bench("/cpu:0", "matrix_exponential_cpu_{shape}", shape)
    if test.is_gpu_available(True):
      _bench("/gpu:0", "matrix_exponential_gpu_{shape}", shape)
def testStridedSliceWithMask1011(self):
  """Compares a strided-slice graph run with and without `_get_config()`.

  The test is a no-op unless a CUDA GPU is available. It checks the number
  of transpose nodes reported in the cost graph, the presence of specific
  node names, and that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  conv = _two_layer_model(x)
  # This will generate a StridedSlice op with begin mask and
  # end mask 11(1011).
  sliced = conv[:, :, 1:-1, :]
  output = array_ops.identity(sliced)

  with session.Session() as sess:
    output_val_ref = sess.run(output)

  with session.Session(config=_get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(output, run_metadata=metadata)

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(1 for name in nodes if _is_transpose(name))

  # Four transposes were initially added in the Expand phase of
  # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
  self._assert_trans_nchw_to_nhwc('strided_slice-0-0', nodes)
  for expected_name in ('strided_slice-1-LayoutOptimizer',
                        'strided_slice-2-LayoutOptimizer',
                        'strided_slice-3-LayoutOptimizer'):
    self.assertIn(expected_name, nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testLoopGPU(self):
  """Checks that profiler timings for a looped MatMul match the run metadata.

  Returns early when no GPU is available. The CPU-side and accelerator-side
  `op_end_rel_micros` values from the run metadata are summed and compared
  with the corresponding fields on the tfprof node.
  """
  if not test.is_gpu_available():
    return

  ops.reset_default_graph()
  with ops.device('/gpu:0'):
    tfprof_node, run_meta = _run_loop_model()
    # The while-loop caused a node to appear 4 times in scheduling.
    ret = _extract_node(run_meta,
                        'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
    cpu_side = ret['/job:localhost/replica:0/task:0/gpu:0']
    self.assertEqual(len(cpu_side), 4)
    total_cpu_execs = sum(node.op_end_rel_micros for node in cpu_side)

    ret = _extract_node(
        run_meta,
        'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul')
    stream_side = ret['/gpu:0/stream:all']
    self.assertGreaterEqual(len(stream_side), 4)
    total_accelerator_execs = sum(
        node.op_end_rel_micros for node in stream_side)

    mm_node = lib.SearchTFProfNode(
        tfprof_node,
        'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
    self.assertEqual(mm_node.run_count, 4)
    self.assertEqual(mm_node.accelerator_exec_micros, total_accelerator_execs)
    self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
    self.assertEqual(mm_node.exec_micros,
                     total_cpu_execs + total_accelerator_execs)
def testConcatLargeNumberOfTensors(self):
  """Concatenates many [7, 13] placeholders and checks every slice.

  For each of the two concat dimensions, 5000 tensors (500 when no GPU is
  available) of random float32 data are fed and concatenated. Each input is
  then compared with the matching window of the result.
  """
  with self.session(use_gpu=True):
    for concat_dim in range(2):
      params = {}
      p = []
      shape = np.array([7, 13])
      num_tensors = 5000 if test.is_gpu_available() else 500
      for _ in range(num_tensors):
        placeholder = array_ops.placeholder(dtypes.float32, shape=shape)
        p.append(placeholder)
        params[placeholder] = np.random.rand(*shape).astype(np.float32)

      c = array_ops.concat(p, concat_dim)
      result = c.eval(feed_dict=params)

      self.assertEqual(result.shape, c.get_shape())
      cur_offset = 0

      for placeholder in p:
        expected = params[placeholder]
        # The index into the result is the ':' along all dimensions
        # except the concat_dim. slice(0, size) is used for ':'.
        index = [slice(0, extent) for extent in expected.shape]
        extent = expected.shape[concat_dim]
        index[concat_dim] = slice(cur_offset, cur_offset + extent)
        cur_offset += extent
        # NumPy requires a tuple for multi-dimensional indexing; indexing
        # with a list of slices is an error in recent NumPy releases.
        self.assertAllEqual(result[tuple(index)], expected)
def testGradientDilatedConv(self):
  """Gradient-checks `conv2d_backprop_filter` with dilations [1, 2, 2, 1].

  The test is a no-op unless a CUDA GPU is available. It covers "SAME" and
  "VALID" padding with strides 1 and 2 and requires the gradient error to
  stay below 2e-3.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  with self.test_session(use_gpu=True):
    for padding in ["SAME", "VALID"]:
      for stride in [1, 2]:
        np.random.seed(1)
        in_shape = [5, 8, 6, 4]
        filter_shape = [3, 3, 4, 6]
        in_val = constant_op.constant(
            2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32)
        # Make a convolution op with the current settings,
        # just to easily get the shape of the output.
        conv_out = nn_ops.conv2d(
            in_val,
            array_ops.zeros(filter_shape),
            dilations=[1, 2, 2, 1],
            strides=[1, stride, stride, 1],
            padding=padding)
        out_backprop_shape = conv_out.get_shape().as_list()
        out_backprop_val = constant_op.constant(
            2 * np.random.random_sample(out_backprop_shape) - 1,
            dtype=dtypes.float32)
        output = nn_ops.conv2d_backprop_filter(
            in_val,
            filter_shape,
            out_backprop_val,
            dilations=[1, 2, 2, 1],
            strides=[1, stride, stride, 1],
            padding=padding)
        err = gradient_checker.compute_gradient_error(
            [in_val, out_backprop_val], [in_shape, out_backprop_shape],
            output, filter_shape)
        print("conv2d_backprop_filter gradient err = %g " % err)
        err_tolerance = 2e-3
        self.assertLess(err, err_tolerance)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
  """Checks that a GRUCell wrapped for "/gpu:0" runs its nodes on the GPU.

  The dynamic RNN is built under a "/cpu:0" device scope while the cell is
  wrapped in a `DeviceWrapper` targeting "/gpu:0". After a fully traced run,
  no "gru_cell" node may appear in the CPU stats and at least one must
  appear in the GPU stats.
  """
  if not test.is_gpu_available():
    # Can't perform this test w/o a GPU
    return

  with self.test_session(use_gpu=True) as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      x = array_ops.zeros([1, 1, 3])
      cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), "/gpu:0")
      with ops.device("/cpu:0"):
        outputs, _ = rnn.dynamic_rnn(
            cell=cell, inputs=x, dtype=dtypes.float32)
      run_metadata = config_pb2.RunMetadata()
      opts = config_pb2.RunOptions(
          trace_level=config_pb2.RunOptions.FULL_TRACE)

      sess.run([variables_lib.global_variables_initializer()])
      _ = sess.run(outputs, options=opts, run_metadata=run_metadata)

    step_stats = run_metadata.step_stats
    # Device names may be spelled ".../gpu:0" or ".../device:GPU:0", so the
    # match is case-insensitive.
    first_device = step_stats.dev_stats[0].device.lower()
    ix = 0 if "gpu" in first_device else 1
    gpu_stats = step_stats.dev_stats[ix].node_stats
    cpu_stats = step_stats.dev_stats[1 - ix].node_stats
    self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
    self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
def benchmarkCholeskyOp(self):
  """Benchmarks `cholesky` on CPU, and on GPU when one is available.

  One benchmark is reported per entry of `self.shapes` and per device.
  """

  def _bench(device, name_template, shape):
    # Fresh graph and session per measurement, pinned to `device`.
    with ops.Graph().as_default(), \
        session.Session() as sess, \
        ops.device(device):
      matrix = variables.Variable(self._GenerateMatrix(shape))
      factor = linalg_ops.cholesky(matrix)
      variables.global_variables_initializer().run()
      self.run_op_benchmark(
          sess,
          control_flow_ops.group(factor,),
          min_iters=25,
          name=name_template.format(shape=shape))

  for shape in self.shapes:
    _bench("/cpu:0", "cholesky_cpu_{shape}", shape)
    if test.is_gpu_available(True):
      _bench("/device:GPU:0", "cholesky_gpu_{shape}", shape)
def setUp(self):
  """Creates a temp dump directory and sets the expected partition count.

  Two partition graphs are expected when a GPU is available, one otherwise.
  """
  self._dump_root = tempfile.mkdtemp()
  self._expected_partition_graph_count = (
      2 if test.is_gpu_available() else 1)
def _testBatchNormGradGrad(self, config):
  """Runs `_test_grad_grad` for one configuration on the available devices.

  Args:
    config: mapping with the keys 'shape', 'err_tolerance' and 'dtype'.
  """
  shape = config['shape']
  err_tolerance = config['err_tolerance']
  dtype = config['dtype']

  def _check(axis, on_gpu, layout, training):
    # The second list argument is built from shape[axis].
    self._test_grad_grad(
        shape,
        dtype, [shape[axis]],
        np.float32,
        use_gpu=on_gpu,
        data_format=layout,
        is_training=training,
        err_tolerance=err_tolerance)

  for is_training in [True, False]:
    if test.is_gpu_available(cuda_only=True):
      _check(3, True, 'NHWC', is_training)
      _check(1, True, 'NCHW', is_training)
    _check(3, False, 'NHWC', is_training)
def test_convolution_2d(self):
  """Runs `layer_test` on a channels_first Conv2D when a CUDA GPU exists.

  Covers 'valid' padding with strides (1, 1) and (2, 2), and 'same' padding
  with strides (1, 1) only.
  """
  num_samples = 2
  filters = 2
  stack_size = 3
  kernel_size = (3, 2)
  num_row = 7
  num_col = 6

  for padding in ['valid', 'same']:
    for strides in [(1, 1), (2, 2)]:
      if padding == 'same' and strides != (1, 1):
        continue

      with self.test_session(use_gpu=True):
        # Only runs on GPU with CUDA, channels_first is not supported on CPU.
        # TODO(b/62340061): Support channels_first on CPU.
        if not test.is_gpu_available(cuda_only=True):
          continue
        layer_kwargs = {
            'filters': filters,
            'kernel_size': kernel_size,
            'padding': padding,
            'strides': strides,
            'data_format': 'channels_first'
        }
        testing_utils.layer_test(
            keras.layers.Conv2D,
            kwargs=layer_kwargs,
            input_shape=(num_samples, stack_size, num_row, num_col))
def testAllocationHistory(self):
  """Checks the allocation records of MatMul and RandomStandardNormal.

  Returns early when no CUDA GPU is available. The MatMul node must show one
  allocation followed by a matching deallocation, and the random-normal
  node's records must bracket the MatMul start time.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  gpu_dev = test.gpu_device_name()
  ops.reset_default_graph()
  with ops.device(gpu_dev):
    _, run_meta = _run_model()

  mm = _extract_node(run_meta, 'MatMul')['gpu:0'][0]
  mm_allocs = mm.memory[0].allocation_records
  # has allocation and deallocation.
  self.assertEqual(len(mm_allocs), 2)
  first_record, second_record = mm_allocs[0], mm_allocs[1]
  # first allocated.
  self.assertGreater(second_record.alloc_micros, first_record.alloc_micros)
  self.assertGreater(first_record.alloc_bytes, 0)
  # Then deallocated.
  self.assertLess(second_record.alloc_bytes, 0)
  # All memory deallocated.
  self.assertEqual(first_record.alloc_bytes + second_record.alloc_bytes, 0)

  rand = _extract_node(
      run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0]
  random_allocs = rand.memory[0].allocation_records
  # random normal must be allocated first since matmul depends on it.
  self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
  # deallocates the memory after matmul started.
  self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)
def testSliceWithNonConstAxis(self):
  """Compares a `slice` graph with a fed size, with and without `_get_config()`.

  The test is a no-op unless a CUDA GPU is available. It checks the number
  of transpose nodes reported in the cost graph, the presence of specific
  node names, and that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  conv = _two_layer_model(x)
  size = array_ops.placeholder(dtype='int32')
  sliced = array_ops.slice(conv, [0, 0, 0, 0], size)
  output = array_ops.identity(sliced)

  # The slice size is fed at run time rather than baked into the graph.
  size_val = [1, 2, 3, 4]
  with session.Session() as sess:
    output_val_ref = sess.run(output, feed_dict={size: size_val})

  with session.Session(config=_get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(
        output, run_metadata=metadata, feed_dict={size: size_val})

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(1 for name in nodes if _is_transpose(name))

  # Four transposes were initially added in the Expand phase of
  # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
  self._assert_trans_nchw_to_nhwc('Slice-0-0', nodes)
  self._assert_vec_nhwc_to_nchw('Slice-2', nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testSkipEagerIteratorGetNextAsOptional(self, np_value, tf_value_fn,
                                           works_on_gpu):
  """Exercises `get_next_as_optional` before, during and after iteration.

  Args:
    np_value: element value used to build a dataset repeated three times.
    tf_value_fn: callable whose result defines the expected value structure.
    works_on_gpu: when false, the test is skipped if a GPU is available.
  """
  if not works_on_gpu:
    if test.is_gpu_available():
      self.skipTest("Test case not yet supported on GPU.")

  dataset = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
  iterator = dataset.make_initializable_iterator()
  next_elem = iterator_ops.get_next_as_optional(iterator)
  self.assertIsInstance(next_elem, optional_ops.Optional)
  expected_structure = structure.Structure.from_value(tf_value_fn())
  self.assertTrue(
      next_elem.value_structure.is_compatible_with(expected_structure))
  elem_has_value_t = next_elem.has_value()
  elem_value_t = next_elem.get_value()

  with self.cached_session() as sess:
    # Before initializing the iterator, evaluating the optional fails with
    # a FailedPreconditionError.
    for uninitialized_fetch in (elem_has_value_t, elem_value_t):
      with self.assertRaises(errors.FailedPreconditionError):
        sess.run(uninitialized_fetch)

    # For each element of the dataset, assert that the optional evaluates to
    # the expected value.
    sess.run(iterator.initializer)
    for _ in range(3):
      elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
      self.assertTrue(elem_has_value)
      self._assertElementValueEqual(np_value, elem_value)

    # After exhausting the iterator, `next_elem.has_value()` will evaluate to
    # false, and attempting to get the value will fail.
    for _ in range(2):
      self.assertFalse(sess.run(elem_has_value_t))
      with self.assertRaises(errors.InvalidArgumentError):
        sess.run(elem_value_t)
def benchmarkSamplingMVNDiag(self):
  """Benchmarks sampling from a Mixture of MultivariateNormalDiag components.

  Sweeps component count, batch size, feature count and sample size on CPU,
  and on GPU when one is available.
  """
  logging.vlog(
      2, "mvn_diag\tuse_gpu\tcomponents\tbatch\tfeatures\tsample\twall_time")

  def create_distribution(batch_size, num_components, num_features):
    # Random categorical logits plus one (loc, scale_diag) pair per component.
    cat = ds.Categorical(
        logits=np.random.randn(batch_size, num_components))
    mus = [
        variables.Variable(np.random.randn(batch_size, num_features))
        for _ in range(num_components)
    ]
    sigmas = [
        variables.Variable(np.random.rand(batch_size, num_features))
        for _ in range(num_components)
    ]
    components = [
        ds.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        for mu, sigma in zip(mus, sigmas)
    ]
    return ds.Mixture(cat, components,
                      use_static_graph=self.use_static_graph)

  for use_gpu in (False, True):
    if use_gpu and not test.is_gpu_available():
      continue
    for num_components in (1, 8, 16):
      for batch_size in (1, 32):
        for num_features in (1, 64, 512):
          for sample_size in (1, 32, 128):
            self._runSamplingBenchmark(
                "mvn_diag",
                create_distribution=create_distribution,
                use_gpu=use_gpu,
                num_components=num_components,
                batch_size=batch_size,
                num_features=num_features,
                sample_size=sample_size)
def testGradient(self):
  """Checks that optimizing a two-conv training graph yields NCHW conv ops.

  Skipped unless a CUDA GPU is available. A meta graph with a 'train_op'
  collection is passed to `tf_optimizer.OptimizeGraph` with
  `optimize_tensor_layout=True`. Every Conv2D, Conv2DBackpropFilter and
  Conv2DBackpropInput node in the result must report the NCHW data format,
  and exactly five such nodes are expected.
  """
  if not test.is_gpu_available(cuda_only=True):
    self.skipTest('GPU required')

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 200, 200, 3], seed=0)
  y = conv_layers.conv2d(x, 32, [3, 3])
  z = conv_layers.conv2d(y, 32, [3, 3])
  optimizer = gradient_descent.GradientDescentOptimizer(1e-4)
  loss = math_ops.reduce_mean(z)
  train_op = optimizer.minimize(loss)
  graph = ops.get_default_graph()
  graph.add_to_collection('train_op', train_op)
  meta_graph = saver_lib.export_meta_graph(graph_def=graph.as_graph_def())

  rewrite_options = rewriter_config_pb2.RewriterConfig(
      optimize_tensor_layout=True)
  optimized_graph = tf_optimizer.OptimizeGraph(rewrite_options, meta_graph)

  found = 0
  for node in optimized_graph.node:
    if node.op in ['Conv2D', 'Conv2DBackpropFilter', 'Conv2DBackpropInput']:
      found += 1
      # The `s` attribute is a protobuf bytes field, so compare against a
      # bytes literal; this holds on both Python 2 and Python 3.
      self.assertEqual(node.attr['data_format'].s, b'NCHW')
  self.assertEqual(found, 5)
def testSendingLargeGraphDefsWorks(self):
  """Sends a graph of 50,000 chained identities through the gRPC debugger.

  Checks the fetched value, the watched tensor value received by the debug
  server, the number of partition graphs, and that the largest serialized
  partition graph exceeds 4 MiB.
  """
  with self.test_session(
      use_gpu=True,
      config=session_debug_testlib.no_rewrite_session_config()) as sess:
    u = variables.Variable(42.0, name="original_u")
    # A long chain of Identity ops inflates the GraphDef.
    for _ in xrange(50 * 1000):
      u = array_ops.identity(u)
    sess.run(variables.global_variables_initializer())

    def watch_fn(fetches, feeds):
      del fetches, feeds
      return framework.WatchOptions(
          debug_ops=["DebugIdentity"],
          node_name_regex_whitelist=r"original_u")

    debug_address = "localhost:%d" % self.debug_server_port
    sess = grpc_wrapper.GrpcDebugWrapperSession(
        sess, debug_address, watch_fn=watch_fn)
    self.assertAllClose(42.0, sess.run(u))

    self.assertAllClose(
        [42.0],
        self.debug_server.debug_tensor_values["original_u:0:DebugIdentity"])
    partition_graph_defs = self.debug_server.partition_graph_defs
    self.assertEqual(2 if test.is_gpu_available() else 1,
                     len(partition_graph_defs))
    max_graph_def_size = max(
        len(graph_def.SerializeToString())
        for graph_def in partition_graph_defs)
    self.assertGreater(max_graph_def_size, 4 * 1024 * 1024)
def test_specify_initial_state_keras_tensor(self, layer_class):
  """Builds and fits a model whose initial state comes from Keras inputs.

  The test is a no-op unless a CUDA GPU is available.

  Args:
    layer_class: recurrent layer class under test; `keras.layers.CuDNNLSTM`
      gets two state inputs, any other class gets one.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  with self.test_session(use_gpu=True):
    input_size = 10
    timesteps = 6
    units = 2
    num_samples = 32
    num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1

    inputs = keras.Input((timesteps, input_size))
    initial_state = [keras.Input((units,)) for _ in range(num_states)]
    layer = layer_class(units)
    # A single state is passed as a bare tensor rather than a list.
    state_arg = (initial_state[0]
                 if len(initial_state) == 1 else initial_state)
    output = layer(inputs, initial_state=state_arg)
    self.assertIn(initial_state[0], layer._inbound_nodes[0].input_tensors)

    model = keras.models.Model([inputs] + initial_state, output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    input_data = np.random.random((num_samples, timesteps, input_size))
    state_data = [
        np.random.random((num_samples, units)) for _ in range(num_states)
    ]
    targets = np.random.random((num_samples, units))
    model.fit([input_data] + state_data, targets)
def benchmarkMatrixBandPartOp(self):
  """Benchmarks `matrix_band_part` on CPU, and on GPU when one is available.

  One benchmark is reported per shape in `self.shapes`, per band-limit pair
  and per device.
  """

  def _bench(device, name_template, shape_, limits):
    # Fresh graph and session per measurement, pinned to `device`.
    with ops.Graph().as_default(), \
        session.Session() as sess, \
        ops.device(device):
      matrix = variables.Variable(array_ops.ones(shape_))
      band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
      variables.global_variables_initializer().run()
      self.run_op_benchmark(
          sess,
          control_flow_ops.group(band),
          min_iters=10,
          name=name_template.format(shape=shape_, limits=limits))

  for shape_ in self.shapes:
    for limits in ((-1, -1), (-1, 0), (0, -1), (2, 2)):
      _bench("/cpu:0", "matrix_band_part_cpu_{shape}_{limits}", shape_,
             limits)
      if test_lib.is_gpu_available(True):
        _bench("/gpu:0", "matrix_band_part_gpu_{shape}_{limits}", shape_,
               limits)
def test_cudnn_rnn_basics(self):
  """Runs `layer_test` on CuDNNGRU and CuDNNLSTM with basic options.

  The test is a no-op unless a CUDA GPU is available. Each layer class is
  tested with both values of `return_sequences` and of `go_backwards`.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  def _cudnn_scope():
    # A fresh scope per layer_test call, registering both CuDNN layers.
    return keras.utils.CustomObjectScope(
        {'keras.layers.CuDNNGRU': keras.layers.CuDNNGRU,
         'keras.layers.CuDNNLSTM': keras.layers.CuDNNLSTM})

  with self.test_session(use_gpu=True):
    input_size = 10
    timesteps = 6
    units = 2
    num_samples = 32
    input_shape = (num_samples, timesteps, input_size)
    for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
      for return_sequences in [True, False]:
        with _cudnn_scope():
          testing_utils.layer_test(
              layer_class,
              kwargs={'units': units,
                      'return_sequences': return_sequences},
              input_shape=input_shape)
      for go_backwards in [True, False]:
        with _cudnn_scope():
          testing_utils.layer_test(
              layer_class,
              kwargs={'units': units,
                      'go_backwards': go_backwards},
              input_shape=input_shape)
def test_regularizer(self, layer_class):
  """Checks the losses produced by weight and activity regularizers.

  The test is a no-op unless a CUDA GPU is available.

  Args:
    layer_class: recurrent layer class under test.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  with self.test_session(use_gpu=True):
    input_size = 10
    timesteps = 6
    units = 2
    num_samples = 32

    # Kernel, recurrent and bias regularizers: three losses after build().
    weight_regularized = layer_class(
        units,
        return_sequences=False,
        input_shape=(timesteps, input_size),
        kernel_regularizer=keras.regularizers.l1(0.01),
        recurrent_regularizer=keras.regularizers.l1(0.01),
        bias_regularizer='l2')
    weight_regularized.build((None, None, input_size))
    self.assertEqual(len(weight_regularized.losses), 3)

    # Activity regularizer: one input-conditional loss after a call.
    activity_regularized = layer_class(
        units,
        return_sequences=False,
        input_shape=(timesteps, input_size),
        activity_regularizer='l2')
    self.assertTrue(activity_regularized.activity_regularizer)
    x = keras.backend.variable(
        np.ones((num_samples, timesteps, input_size)))
    activity_regularized(x)
    self.assertEqual(len(activity_regularized.get_losses_for(x)), 1)
def testTimelineGpu(self):
  """Validates Chrome traces generated from a fully traced GPU run.

  Returns early when no CUDA GPU is available. The step stats must list both
  the GPU device and its 'stream:all' entry. Traces are then generated and
  validated with every combination of `show_memory` and `show_dataflow`
  that the loop below enumerates.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  run_options = config_pb2.RunOptions(
      trace_level=config_pb2.RunOptions.FULL_TRACE)
  run_metadata = config_pb2.RunMetadata()

  with self.session(force_gpu=True) as sess:
    const1 = constant_op.constant(1.0, name='const1')
    const2 = constant_op.constant(2.0, name='const2')
    result = math_ops.add(const1, const2) + const1 * const2
    sess.run(result, options=run_options, run_metadata=run_metadata)

  self.assertTrue(run_metadata.HasField('step_stats'))
  step_stats = run_metadata.step_stats
  devices = [dev_stat.device for dev_stat in step_stats.dev_stats]
  self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
  self.assertTrue('/device:GPU:0/stream:all' in devices)

  # An empty dict means generate_chrome_trace_format() is called with its
  # defaults.
  trace_variants = (
      {},
      {'show_dataflow': False},
      {'show_memory': False},
      {'show_memory': False, 'show_dataflow': False},
  )
  for trace_kwargs in trace_variants:
    tl = timeline.Timeline(step_stats)
    ctf = tl.generate_chrome_trace_format(**trace_kwargs)
    self._validateTrace(ctf)
def benchmarkMatrixSolveLsOp(self):
  """Benchmarks `matrix_solve_ls` on CPU and, for some shapes, on GPU.

  For every shape in `self.matrix_shapes` the solver is timed with 1, 2 and
  `matrix_shape[-1]` right-hand sides. The GPU run is skipped when no GPU is
  available, or when the shape has three or more dimensions and a leading
  dimension of 513 or more.
  """
  run_gpu_test = test_lib.is_gpu_available(True)
  regularizer = 1.0

  def _bench(device, name_template, matrix_shape, num_rhs):
    # Fresh graph and session per measurement, pinned to `device`.
    with ops.Graph().as_default(), \
        session.Session(config=benchmark.benchmark_config()) as sess, \
        ops.device(device):
      matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
      x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
      variables.global_variables_initializer().run()
      self.run_op_benchmark(
          sess,
          control_flow_ops.group(x),
          min_iters=25,
          store_memory_usage=False,
          name=name_template.format(
              matrix_shape=matrix_shape, num_rhs=num_rhs))

  for matrix_shape in self.matrix_shapes:
    for num_rhs in (1, 2, matrix_shape[-1]):
      _bench("/cpu:0",
             "matrix_solve_ls_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}",
             matrix_shape, num_rhs)

      if run_gpu_test and (len(matrix_shape) < 3 or matrix_shape[0] < 513):
        _bench("/gpu:0",
               "matrix_solve_ls_gpu_shape_{matrix_shape}_num_rhs_{num_rhs}",
               matrix_shape, num_rhs)
def testSelectOpScalarCondition(self):
  """Compares a scalar-condition `_select` graph with and without `_get_config()`.

  The test is a no-op unless a CUDA GPU is available. It checks the number
  of transpose nodes reported in the cost graph, the presence of specific
  node names, and that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  conv = _two_layer_model(x)
  add = math_ops.add(conv, conv)
  condition = constant_op.constant(True)
  select = gen_math_ops._select(condition, conv, add)
  output = array_ops.identity(select)

  with session.Session() as sess:
    output_val_ref = sess.run(output)

  with session.Session(config=_get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(output, run_metadata=metadata)

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(1 for name in nodes if _is_transpose(name))

  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
  self._assert_trans_nchw_to_nhwc('Select-0-0', nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testReverseWithConstDims(self):
  """Compares a constant-dims `reverse` graph with and without `_get_config()`.

  The test is a no-op unless a CUDA GPU is available. It checks the number
  of transpose nodes reported in the cost graph, the presence of specific
  node names, and that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  conv = _two_layer_model(x)
  dims = constant_op.constant([3, 1], name='DimsConst')
  reverse = array_ops.reverse(conv, dims)
  output = array_ops.identity(reverse)

  with session.Session() as sess:
    output_val_ref = sess.run(output)

  with session.Session(config=_get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(output, run_metadata=metadata)

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(1 for name in nodes if _is_transpose(name))

  # Four transposes were initially added in the Expand phase of
  # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
  self._assert_trans_nchw_to_nhwc('ReverseV2-0-0', nodes)
  self.assertIn('ReverseV2-1-LayoutOptimizer', nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testConcatWithControlDependency(self):
  """Compares a control-dependent `concat` graph with and without `_get_config()`.

  The test is a no-op unless a CUDA GPU is available. It checks the number
  of transpose nodes reported in the cost graph, the presence of specific
  node names, and that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  conv = _two_layer_model(x)
  axis = constant_op.constant(3)
  var = variables.Variable(3)
  assign = state_ops.assign(var, 6)
  # The concat is created under a control dependency on the assignment.
  with ops.control_dependencies([assign]):
    concat = array_ops.concat([conv, conv], axis)
  output = array_ops.identity(concat)

  with session.Session() as sess:
    output_val_ref = sess.run(output)

  with session.Session(config=_get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(output, run_metadata=metadata)

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(1 for name in nodes if _is_transpose(name))

  # Four transposes were initially added in the Expand phase of
  # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
  self._assert_trans_nchw_to_nhwc('concat-0-0', nodes)
  self.assertIn('concat-2-LayoutOptimizer', nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testTwoConvLayers(self):
  """Compares the two-layer model run with and without `get_config()`.

  The test is a no-op unless a CUDA GPU is available. It counts cost-graph
  nodes whose name starts with 'LayoutOptimizerTranspose', checks two
  specific node names, and verifies that both runs produce close outputs.
  """
  if not test.is_gpu_available(cuda_only=True):
    return

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 784], seed=0)
  output = two_layer_model(x)

  with session.Session() as sess:
    output_val_ref = sess.run(output)

  with session.Session(config=get_config()) as sess:
    metadata = config_pb2.RunMetadata()
    output_val = sess.run(output, run_metadata=metadata)

  nodes = [node.name for node in metadata.cost_graph.node]
  num_transposes = sum(
      1 for name in nodes if name.startswith('LayoutOptimizerTranspose'))

  # Four transposes were initially added in the Expand phase of
  # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
  expected_num_transposes = 2
  self.assertEqual(expected_num_transposes, num_transposes)
  self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Reshape-0', nodes)
  self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Relu_1-MaxPool_1', nodes)
  self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testTraceLoopBytes(self):
  """Checks that profiling a traced while-loop gradient reports enough bytes.

  Returns early when no GPU is available. A 100-step while loop squaring a
  100x100 tensor is differentiated and run with full tracing. The 'scope'
  profile must report more than 1,000,000 requested bytes in total.
  """
  if not test.is_gpu_available():
    return

  ops.reset_default_graph()
  steps = 100

  with ops.device('/gpu:0'):
    x = array_ops.ones((100, 100), dtype=dtypes.float32)
    n = array_ops.constant(steps, dtype=dtypes.int32)
    x1 = array_ops.ones((100, 100))

    x *= x1

    def loop_body(i, x):
      x *= x
      return i + 1, x

    def loop_cond(i, x):
      return i < n

    _, y = control_flow_ops.while_loop(
        loop_cond, loop_body, [array_ops.constant(0), x])

  grad = gradients.gradients(y, [x1])

  with session.Session(config=self._no_rewrite_session_config()) as sess:
    run_options = config_pb2.RunOptions(
        trace_level=config_pb2.RunOptions.FULL_TRACE)
    run_metadata = config_pb2.RunMetadata()
    sess.run(grad, options=run_options, run_metadata=run_metadata)

  options = option_builder.ProfileOptionBuilder.time_and_memory()
  # Zero thresholds so that no node is filtered out of the profile.
  options['min_bytes'] = 0
  options['min_micros'] = 0
  options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
                       'residual_bytes')
  options['output'] = 'none'
  ret_pb = model_analyzer.profile(
      sess.graph, run_meta=run_metadata, cmd='scope', options=options)
  self.assertGreater(ret_pb.total_requested_bytes, 1000000)
def testDepthwiseConv2DFilterGradExplicit(self):
    """Run the filter-gradient check for every explicit-padding config.

    Each config from `CheckGradConfigsToTestExplicit()` is exercised for every
    supported dtype and data format, with `test_input=False` and
    `use_gpu=True`.
    """
    for index, config in enumerate(CheckGradConfigsToTestExplicit()):
        (input_size, filter_size, output_size, stride, padding,
         dilations) = config
        tf_logging.info(
            "Testing DepthwiseConv2DFilterGradExplicit, %dth config: %r * %r, "
            "stride: %d, padding: %s", index, input_size, filter_size, stride,
            padding)

        # double datatype is currently not supported for convolution ops
        # on the ROCm platform
        data_types = [dtypes.float16, dtypes.float32]
        if not test.is_built_with_rocm():
            data_types = data_types + [dtypes.float64]

        if test.is_gpu_available():
            data_formats = ["NHWC", "NCHW"]
        else:
            data_formats = ["NHWC"]

        for data_type in data_types:
            for data_format in data_formats:
                self._ConstructAndTestGradient(
                    input_size,
                    filter_size,
                    output_size,
                    stride,
                    padding,
                    data_type,
                    test_input=False,
                    use_gpu=True,
                    data_format=data_format,
                    dilations=dilations)
def testMaxPoolGradV2(self):
    """Check layout-optimizer node names for MaxPoolGradV2 with fed strides.

    Does nothing unless a CUDA GPU is available.
    """
    if not test.is_gpu_available(cuda_only=True):
        return

    random_seed.set_random_seed(0)
    x = random_ops.truncated_normal([1, 784], seed=0)
    conv = _two_layer_model(x)
    ksize = constant_op.constant([1, 2, 3, 1], shape=[4])
    strides = array_ops.placeholder(dtype='int32', shape=[4])
    max_pool_grad = gen_nn_ops.max_pool_grad_v2(
        conv, conv, conv, ksize, strides, 'VALID')
    output = array_ops.identity(max_pool_grad)

    strides_val = [1, 3, 2, 1]
    feed = {strides: strides_val}

    with session.Session() as ref_sess:
        output_val_ref = ref_sess.run(output, feed_dict=feed)

    with session.Session(config=_get_config()) as opt_sess:
        metadata = config_pb2.RunMetadata()
        output_val = opt_sess.run(
            output, run_metadata=metadata, feed_dict=feed)

    nodes = [node.name for node in metadata.cost_graph.node]
    num_transposes = sum(1 for name in nodes if _is_transpose(name))

    expected_num_transposes = 2
    self.assertEqual(expected_num_transposes, num_transposes)
    self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
    self._assert_trans_nchw_to_nhwc('MaxPoolGradV2-0-0', nodes)
    self._assert_vec_nhwc_to_nchw('MaxPoolGradV2-4', nodes)
    self.assertIn('MaxPoolGradV2-3-LayoutOptimizer', nodes)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def benchmarkTridiagonalMulOp(self):
    """Benchmark `tridiagonal_matmul` against `self.baseline`.

    Every combination of device (CPU, plus GPU when a CUDA GPU is available)
    and entry of `self.sizes` is benchmarked.
    """
    devices = [('/cpu:0', 'cpu')]
    if test.is_gpu_available(cuda_only=True):
        devices.append(('/gpu:0', 'gpu'))

    for (device_id, device_name), (m, batch_size, n) in itertools.product(
            devices, self.sizes):
        with ops.Graph().as_default(), \
                session.Session(config=benchmark.benchmark_config()) as sess, \
                ops.device(device_id):
            upper, diag, lower, vec = self._generateData(batch_size, m, n)
            x1 = self.baseline(upper, diag, lower, vec)
            x2 = linalg_impl.tridiagonal_matmul(
                (upper, diag, lower), vec, diagonals_format='sequence')
            variables.global_variables_initializer().run()

            name_args = (device_name, batch_size, m, n)
            self.run_op_benchmark(
                sess,
                control_flow_ops.group(x1),
                min_iters=10,
                store_memory_usage=False,
                name=('tridiagonal_matmul_baseline_%s'
                      '_batch_size_%d_m_%d_n_%d' % name_args))
            self.run_op_benchmark(
                sess,
                control_flow_ops.group(x2),
                min_iters=10,
                store_memory_usage=False,
                name=('tridiagonal_matmul_%s_batch_size_%d_m_%d_n_%d' %
                      name_args))
def _test_runtime_with_model(self, model):
    """Train `model`, then check the runtime value its second output reports.

    Args:
      model: a model with two outputs; it is compiled here with a loss on the
        first output only, and the second output of `predict` is compared with
        the GPU or CPU runtime constant.
    """
    (x_train, y_train), _ = testing_utils.get_test_data(
        train_samples=self.batch,
        test_samples=0,
        input_shape=(self.timestep, self.input_shape),
        num_classes=self.output_shape)
    y_train = keras.utils.to_categorical(y_train, self.output_shape)

    model.compile(optimizer='sgd', loss=['categorical_crossentropy', None])

    # The loss must differ from the previous epoch's loss on every epoch.
    previous_loss = 0
    for _ in range(self.epoch):
        current_loss = model.fit(x_train, y_train).history['loss'][0]
        self.assertNotEqual(previous_loss, current_loss)
        previous_loss = current_loss

    _, runtime_value = model.predict(x_train)
    if test.is_gpu_available():
        expected_runtime = rnn._RUNTIME_GPU
    else:
        expected_runtime = rnn._RUNTIME_CPU
    self.assertEqual(runtime_value[0], expected_runtime)
def test_loop_with_vars_intertwined(self):
    """Test graph with intertwined while loops."""
    if not test.is_gpu_available(cuda_only=True):
        return

    random_seed.set_random_seed(0)
    x = _input([8, 8])
    _, _, k, l = _loop_vars_intertwined(
        array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(k, [x])
    output = (k, l, g)

    output_val_ref, output_val, cost_graph = self._run(output)
    node_map = _build_node_map(cost_graph.node)

    for node_name in ('while/MatMul', 'while/Relu', 'while/MatMul_1',
                      'while/Relu_1'):
        self._assert_output_fp16(node_map, node_name)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
def testDepthwiseConv2DFormat(self):
    """Verify depthwise conv values in NCHW format for every config.

    Returns immediately when no GPU is available.
    """
    if not test.is_gpu_available():
        return

    for index, config in enumerate(ConfigsToTest()):
        input_size, filter_size, _, stride, padding, dilations = config
        tf_logging.info(
            "Testing DepthwiseConv2DFormat, %dth config: %r * %r, stride: %d, "
            "padding: %s", index, input_size, filter_size, stride, padding)

        # double datatype is currently not supported for convolution ops
        # on the ROCm platform
        typed_tolerances = [(dtypes.float32, 1e-4)]
        if not test.is_built_with_rocm():
            typed_tolerances.append((dtypes.float64, 1e-12))

        for data_type, tolerance in typed_tolerances:
            self._VerifyValues(
                input_size,
                filter_size,
                stride,
                padding,
                data_type,
                use_gpu=True,
                data_format="NCHW",
                dilations=dilations,
                tolerance=tolerance)
def testBinaryOpSecondPort(self):
    """Check transposes for a model built by `_model_with_second_port()`.

    Does nothing unless a CUDA GPU is available.
    """
    if not test.is_gpu_available(cuda_only=True):
        return

    output = _model_with_second_port()

    with session.Session() as ref_sess:
        output_val_ref = ref_sess.run(output)

    with session.Session(config=_get_config()) as opt_sess:
        metadata = config_pb2.RunMetadata()
        output_val = opt_sess.run(output, run_metadata=metadata)

    nodes = [node.name for node in metadata.cost_graph.node]
    num_transposes = sum(
        1 for name in nodes if name.startswith('LayoutOptimizerTranspose'))

    expected_num_transposes = 2
    self.assertEqual(expected_num_transposes, num_transposes)
    self.assertIn(
        'LayoutOptimizerTransposeNHWCToNCHW-FusedBatchNorm-0', nodes)
    self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Add-0-0', nodes)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def test3DGPU(self):
    """Compare GPU transposes of large 3-D tensors with `_np_transpose`."""
    # If no GPU available, skip the test
    if not test.is_gpu_available(cuda_only=True):
        return

    datatypes = [
        np.int8, np.float16, np.float32, np.float64, np.complex128
    ]
    base_shapes = [[4, 1000, 3], [4, 1000, 8], [4, 1000, 13],
                   [4, 3, 1000], [4, 8, 1000], [4, 13, 1000]]
    large_shapes = base_shapes * 3
    perms = ([[0, 2, 1]] * 6 + [[2, 1, 0]] * 6 + [[1, 2, 0]] * 3 +
             [[2, 0, 1]] * 3)

    for datatype in datatypes:
        for input_shape, perm in zip(large_shapes, perms):
            total_size = np.prod(input_shape)
            inp = np.arange(
                1, total_size + 1, dtype=datatype).reshape(input_shape)
            np_ans = self._np_transpose(inp, perm)
            with self.test_session(use_gpu=True):
                y = array_ops.transpose(ops.convert_to_tensor(inp), perm)
                tf_ans = y.eval()
            self.assertAllEqual(np_ans, tf_ans)
            self.assertShapeEqual(np_ans, y)
def testStridedSliceWithMask(self):
    """Check layout-optimizer node names for a masked strided slice.

    Does nothing unless a CUDA GPU is available.
    """
    if not test.is_gpu_available(cuda_only=True):
        return

    random_seed.set_random_seed(0)
    x = random_ops.truncated_normal([1, 784], seed=0)
    conv = _two_layer_model(x)
    # This will generate a StridedSlice op with begin mask and end mask.
    s = conv[:, :, 1:-1, :]
    output = array_ops.identity(s)

    with session.Session() as ref_sess:
        output_val_ref = ref_sess.run(output)

    with session.Session(config=_get_config()) as opt_sess:
        metadata = config_pb2.RunMetadata()
        output_val = opt_sess.run(output, run_metadata=metadata)

    nodes = [node.name for node in metadata.cost_graph.node]
    num_transposes = sum(
        1 for name in nodes if name.startswith('LayoutOptimizerTranspose'))

    # Four transposes were initially added in the Expand phase of
    # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
    expected_num_transposes = 2
    self.assertEqual(expected_num_transposes, num_transposes)
    for expected_name in (
            'LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0',
            'LayoutOptimizerTransposeNCHWToNHWC-strided_slice-0-0',
            'LayoutOptimizer-strided_slice-strided_slice/stack',
            'LayoutOptimizer-strided_slice-strided_slice/stack_1',
            'LayoutOptimizer-strided_slice-strided_slice/stack_2'):
        self.assertIn(expected_name, nodes)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3)
def testSamplingAtRandnSwitchover(self):
    """Validate moments just below and just above the sampler switchover."""
    # The randn sampler is used as the bounds are moved farther from the mean,
    # and the probability of accepting a sample increases the farther the
    # bounds are from the mean.
    # This test asserts that at the point of switchover, both samplers are
    # working (not raising an error or returning nan) and returning the
    # expected moments.
    use_gpu = test.is_gpu_available()
    switchover = _get_stddev_inside_bounds_before_using_randn(use_gpu)

    epsilon = 0.001
    for offset in (-epsilon, epsilon):
        self.validateMoments(
            shape=[10**6],
            mean=0.,
            stddev=1.0,
            minval=-epsilon,
            maxval=switchover + offset)
def test_conv_bn_dropout(self):
    """Test dropout precision of convolution batch norm graph."""
    with compat.forward_compatibility_horizon(2019, 6, 7):
        if not test.is_gpu_available(cuda_only=True):
            return

        random_seed.set_random_seed(0)
        x = _input([2, 8, 8, 1])
        y = _conv_bn(x)
        y = nn.dropout(y, rate=0.5)
        y = _conv_bn(y)
        y = array_ops.identity(y)
        optimizer = gradient_descent.GradientDescentOptimizer(
            learning_rate=0.01)
        g = optimizer.compute_gradients(y, [x])
        output = (y, g)

        output_val_ref, output_val, cost_graph = self._run(output)
        node_map = _build_node_map(cost_graph.node)
        for node_name in ('Conv2D', 'FusedBatchNormV3', 'dropout/mul',
                          'Conv2D_1'):
            self._assert_output_fp16(node_map, node_name)

        # The graph is run a second time; the values compared below come
        # from this second run.
        output_val_ref, output_val, cost_graph = self._run(output)
        self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
def testSparseDevicePlacement(self, use_resource):
    """Run one LazyAdamGS minimize step on a gathered (sparse) loss.

    Args:
      use_resource: when true, variables are `ResourceVariable`s; otherwise
        they are `variables.Variable`s.
    """
    if use_resource:
        make_variable = resource_variable_ops.ResourceVariable
    else:
        make_variable = variables.Variable

    for index_dtype in [dtypes.int32, dtypes.int64]:
        with self.cached_session(force_gpu=test.is_gpu_available()):
            # If a GPU is available, tests that all optimizer ops can be
            # placed on it (i.e. they have GPU kernels).
            global_step = make_variable(array_ops.zeros([], dtypes.int64))
            var = make_variable([[1.0], [2.0]])

            indices = constant_op.constant([0, 1], dtype=index_dtype)
            gathered_sum = math_ops.reduce_sum(
                array_ops.gather(var, indices))
            optimizer = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
                global_step=global_step, learning_rate=3.0)
            minimize_op = optimizer.minimize(
                gathered_sum, global_step=global_step)
            variables.global_variables_initializer().run()
            minimize_op.run()
def testConv2DKernelSmallerThanStrideSame(self, gpu_only=True):
    """Verify SAME-padded convs whose kernel is smaller than the stride.

    Args:
      gpu_only: when true, the test logs a message and returns if no GPU is
        available.
    """
    if gpu_only and not test.is_gpu_available():
        tf_logging.info("Skipping Conv2DKernelSmallerThanStrideSame test.")
        return

    cases = (
        # expected = [0, 0, 2, 4]
        dict(
            tensor_in_sizes=[1, 3, 3, 1],
            filter_in_sizes=[1, 1, 1, 1],
            bias=[-5.0],
            strides=[2, 2],
            padding="SAME"),
        # expected = [0, 0, 4, 6]
        dict(
            tensor_in_sizes=[1, 4, 4, 1],
            filter_in_sizes=[1, 1, 1, 1],
            bias=[-5.0],
            strides=[2, 2],
            padding="SAME"),
        # expected = [4, 0, 1, 0]
        dict(
            tensor_in_sizes=[1, 4, 4, 1],
            filter_in_sizes=[2, 2, 1, 1],
            bias=[-40.0],
            strides=[3, 3],
            padding="SAME"),
    )
    for case in cases:
        self._VerifyValues(**case)
def test_noninlined_funcdef(self):
    """Test graph with non-inlined function subgraph.

    This requires the grappler pass to handle an OpDef that only appears in the
    graph's function registry instead of the global op registry.
    """
    if not test.is_gpu_available(cuda_only=True):
        return

    random_seed.set_random_seed(0)
    x = _input([8, 8])
    y = _example_noninlined_funcdef(_matmul_act(x))
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (g, y)

    output_val_ref, output_val, cost_graph = self._run(output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_fp16(node_map, 'MatMul')
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
def testLoopGPU(self):
    """Check per-device timing accounting for a MatMul inside a while loop.

    Returns immediately when no GPU is available.
    """
    if not test.is_gpu_available():
        return

    matmul_name = 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul'

    ops.reset_default_graph()
    with ops.device('/gpu:0'):
        tfprof_node, run_meta = _run_loop_model()

        # The while-loop caused a node to appear 4 times in scheduling.
        ret = _extract_node(run_meta, matmul_name)
        scheduled = ret['/job:localhost/replica:0/task:0/gpu:0']
        self.assertEqual(len(scheduled), 4)
        total_cpu_execs = sum(node.op_end_rel_micros for node in scheduled)

        ret = _extract_node(run_meta, matmul_name + ':MatMul')
        on_stream = ret['/gpu:0/stream:all']
        self.assertGreaterEqual(len(on_stream), 4)
        total_accelerator_execs = sum(
            node.op_end_rel_micros for node in on_stream)

        mm_node = lib.SearchTFProfNode(tfprof_node, matmul_name)
        self.assertEqual(mm_node.run_count, 4)
        self.assertEqual(
            mm_node.accelerator_exec_micros, total_accelerator_execs)
        self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
        self.assertEqual(
            mm_node.exec_micros, total_cpu_execs + total_accelerator_execs)
def test_multi_paths_2(self):
    """Test graph with multiple paths."""
    if not test.is_gpu_available(cuda_only=True):
        return

    random_seed.set_random_seed(0)
    x = _input([8, 8])
    y1 = _matmul_act(x)
    y2 = _matmul_act(x)
    y = y1 + y2 + x
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (g, y)

    output_val_ref, output_val, cost_graph = self._run(output)
    node_map = _build_node_map(cost_graph.node)

    for node_name in ('MatMul', 'Relu', 'MatMul_1', 'Relu_1'):
        self._assert_output_fp16(node_map, node_name)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
    """Check that a DeviceWrapper-ed GRU cell's nodes are reported on the GPU.

    The dynamic RNN is built under a CPU device scope while the cell itself is
    wrapped with the GPU device name; the traced step stats must contain
    "gru_cell" nodes for the GPU and none for the CPU.
    """
    if not test.is_gpu_available():
        # Can't perform this test w/o a GPU
        return

    gpu_dev = test.gpu_device_name()
    with self.test_session(use_gpu=True) as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            x = array_ops.zeros([1, 1, 3])
            cell = rnn_cell_impl.DeviceWrapper(
                rnn_cell_impl.GRUCell(3), gpu_dev)
            with ops.device("/cpu:0"):
                outputs, _ = rnn.dynamic_rnn(
                    cell=cell, inputs=x, dtype=dtypes.float32)
            run_metadata = config_pb2.RunMetadata()
            opts = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)

            sess.run([variables_lib.global_variables_initializer()])
            _ = sess.run(outputs, options=opts, run_metadata=run_metadata)

        cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
        gru_on_cpu = [s for s in cpu_stats if "gru_cell" in s.node_name]
        gru_on_gpu = [s for s in gpu_stats if "gru_cell" in s.node_name]
        self.assertFalse(gru_on_cpu)
        self.assertTrue(gru_on_gpu)
def test_multi_paths_2(self):
    """Test graph with multiple paths.

    Builds two parallel `_matmul_act` branches that are summed with the input,
    checks that the MatMul/Relu nodes produce fp16 outputs, and compares the
    two result sets returned by `self._run`. Does nothing unless a CUDA GPU is
    available.
    """
    if test.is_gpu_available(cuda_only=True):
        random_seed.set_random_seed(0)
        x = _input([8, 8])
        y1 = _matmul_act(x)
        y2 = _matmul_act(x)
        y = y1 + y2 + x
        optimizer = gradient_descent.GradientDescentOptimizer(
            learning_rate=0.01)
        g = optimizer.compute_gradients(y, [x])
        output = (g, y)

        output_val_ref, output_val, cost_graph = self._run(output)
        node_map = _build_node_map(cost_graph.node)

        self._assert_output_fp16(node_map, 'MatMul')
        self._assert_output_fp16(node_map, 'Relu')
        self._assert_output_fp16(node_map, 'MatMul_1')
        self._assert_output_fp16(node_map, 'Relu_1')
        # Bump up the tolerance for the ROCm platform
        # The default tolerance (1e-3) results in a tiny fraction (<1%) of
        # miscompares on ROCm platform, and hence the tolerance bump
        #
        # `is_built_with_rocm` must be *called*: the bare function object is
        # always truthy, which silently applied the looser tolerance on every
        # platform.
        tol = 2e-3 if test.is_built_with_rocm() else 1e-3
        self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
def testSelectEverythingDetail(self):
    """Profile a small model with every column selected and parse the dump.

    The profile is written to a file, the header row and the 'Conv2D' / 'DW'
    rows are checked, and then a profiler restored from the dumped profile
    file must produce the same text.
    """
    ops.reset_default_graph()
    dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
    outfile = os.path.join(test.get_temp_dir(), 'dump')
    opts = (builder(
        builder.trainable_variables_parameter()).with_file_output(
            outfile).with_accounted_types(['.*']).select([
                'micros', 'bytes', 'params', 'float_ops', 'occurrence',
                'device', 'op_types', 'input_shapes'
            ]).build())

    with profile_context.ProfileContext(test.get_temp_dir(),
                                        trace_steps=[],
                                        dump_steps=[]) as pctx:
        with session.Session() as sess, ops.device(dev):
            x = lib.BuildSmallModel()

            sess.run(variables.global_variables_initializer())
            # No steps were pre-registered above; tracing and dumping are
            # requested explicitly for the next run only.
            pctx.trace_next_step()
            pctx.dump_next_step()
            _ = sess.run(x)

            pctx.profiler.profile_name_scope(options=opts)

            with gfile.Open(outfile, 'r') as f:
                # pylint: disable=line-too-long
                dump_str = lib.CheckAndRemoveDoc(f.read())
                outputs = dump_str.split('\n')

                self.assertEqual(
                    outputs[0],
                    'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes'
                )
                for o in outputs[1:]:
                    if o.find('Conv2D ') > 0:
                        # Columns are the comma-separated text between the
                        # first '(' and the first ')' of the row.
                        metrics = o[o.find('(') + 1:o.find(')')].split(',')
                        # Make sure time is profiled.
                        # Without a GPU the step of 2 skips index 4.
                        gap = 1 if test.is_gpu_available() else 2
                        for i in range(3, 6, gap):
                            mat = re.search('(.*)[um]s/(.*)[um]s', metrics[i])
                            self.assertGreater(float(mat.group(1)), 0.0)
                            self.assertGreater(float(mat.group(2)), 0.0)
                        # Make sure device is profiled.
                        if test.is_gpu_available():
                            self.assertTrue(metrics[6].find('gpu') > 0)
                            self.assertFalse(metrics[6].find('cpu') > 0)
                        else:
                            self.assertFalse(metrics[6].find('gpu') > 0)
                            self.assertTrue(metrics[6].find('cpu') > 0)
                        # Make sure float_ops is profiled.
                        mat = re.search('(.*)k/(.*)k flops', metrics[1].strip())
                        self.assertGreater(float(mat.group(1)), 0.0)
                        self.assertGreater(float(mat.group(2)), 0.0)
                        # Make sure op_count is profiled.
                        self.assertEqual(metrics[8].strip(), '1/1|1/1')
                        # Make sure input_shapes is profiled.
                        self.assertEqual(metrics[9].strip(), '0:2x6x6x3|1:3x3x3x6')

                    if o.find('DW (3x3x3x6') > 0:
                        metrics = o[o.find('(') + 1:o.find(')')].split(',')
                        mat = re.search('(.*)/(.*) params', metrics[1].strip())
                        self.assertGreater(float(mat.group(1)), 0.0)
                        self.assertGreater(float(mat.group(2)), 0.0)
                # pylint: enable=line-too-long

    # Test that profiler restored from profile file gives the same result.
    # The dump file is removed first so that the text read back below can only
    # come from the restored profiler.
    gfile.Remove(outfile)
    profile_file = os.path.join(test.get_temp_dir(), 'profile_1')
    with lib.ProfilerFromFile(profile_file) as profiler:
        profiler.profile_name_scope(options=opts)
        with gfile.Open(outfile, 'r') as f:
            self.assertEqual(dump_str, lib.CheckAndRemoveDoc(f.read()))
def testNumbersGPU(self):
    """Run the Relu check on a fixed 2x5 array for three float dtypes.

    The test is skipped when no GPU is available.
    """
    if not test.is_gpu_available():
        self.skipTest("No GPU available")

    values = [[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]
    for float_type in (np.float16, np.float32, np.float64):
        self._testRelu(np.array(values).astype(float_type))
def test_conv3d(self, kwargs, expected_output_shape=None):
    """Run the conv3d layer test with 2 filters and a 3x3x3 kernel.

    Args:
      kwargs: layer keyword arguments; 'filters' and 'kernel_size' are set on
        it in place.
      expected_output_shape: forwarded to `self._run_test`.
    """
    kwargs['filters'] = 2
    kwargs['kernel_size'] = (3, 3, 3)

    # A case that specifies 'data_format' only runs when a CUDA GPU exists.
    if 'data_format' in kwargs and not test.is_gpu_available(cuda_only=True):
        return
    self._run_test(kwargs, expected_output_shape)
def test_model_with_crossentropy_losses_channels_first(self):
    """Tests use of all crossentropy losses with `channels_first`.

    Tests `sparse_categorical_crossentropy`, `categorical_crossentropy`,
    and `binary_crossentropy`.
    Verifies that evaluate gives the same result with either `channels_first`
    or `channels_last` image_data_format.

    The global Keras image data format is changed during the test and is
    restored in a `finally` clause, so a failing evaluation cannot leak the
    modified format to whatever runs afterwards. Nothing is checked unless a
    CUDA GPU is available.
    """

    def prepare_simple_model(input_tensor, loss_name, target):
        """Build and compile a one-layer 1x1 conv model for `loss_name`."""
        axis = 1 if K.image_data_format() == 'channels_first' else -1
        loss = None
        num_channels = None
        activation = None
        if loss_name == 'sparse_categorical_crossentropy':
            loss = lambda y_true, y_pred: K.sparse_categorical_crossentropy(  # pylint: disable=g-long-lambda
                y_true, y_pred, axis=axis)
            num_channels = np.amax(target) + 1
            activation = 'softmax'
        elif loss_name == 'categorical_crossentropy':
            loss = lambda y_true, y_pred: K.categorical_crossentropy(  # pylint: disable=g-long-lambda
                y_true, y_pred, axis=axis)
            num_channels = target.shape[axis]
            activation = 'softmax'
        elif loss_name == 'binary_crossentropy':
            loss = lambda y_true, y_pred: K.binary_crossentropy(y_true, y_pred)  # pylint: disable=unnecessary-lambda
            num_channels = target.shape[axis]
            activation = 'sigmoid'
        predictions = Conv2D(num_channels,
                             1,
                             activation=activation,
                             kernel_initializer='ones',
                             bias_initializer='ones')(input_tensor)
        simple_model = keras.models.Model(inputs=input_tensor,
                                          outputs=predictions)
        simple_model.compile(optimizer='rmsprop', loss=loss)
        return simple_model

    if test.is_gpu_available(cuda_only=True):
        with test_util.use_gpu():
            losses_to_test = ['sparse_categorical_crossentropy',
                              'categorical_crossentropy',
                              'binary_crossentropy']

            data_channels_first = np.array([[[[8., 7.1, 0.],
                                              [4.5, 2.6, 0.55],
                                              [0.9, 4.2, 11.2]]]],
                                           dtype=np.float32)
            # Labels for testing 4-class sparse_categorical_crossentropy, 4-class
            # categorical_crossentropy, and 2-class binary_crossentropy:
            labels_channels_first = [
                np.array([[[[0, 1, 3], [2, 1, 0], [2, 2, 1]]]],
                         dtype=np.float32),
                np.array([[[[0, 1, 0], [0, 1, 0], [0, 0, 0]],
                           [[1, 0, 0], [0, 0, 1], [0, 1, 0]],
                           [[0, 0, 0], [1, 0, 0], [0, 0, 1]],
                           [[0, 0, 1], [0, 0, 0], [1, 0, 0]]]],
                         dtype=np.float32),
                np.array([[[[0, 1, 0], [0, 1, 0], [0, 0, 1]],
                           [[1, 0, 1], [1, 0, 1], [1, 1, 0]]]],
                         dtype=np.float32)]
            # Compute one loss for each loss function in the list
            # `losses_to_test`:
            loss_channels_last = [0., 0., 0.]
            loss_channels_first = [0., 0., 0.]

            old_data_format = K.image_data_format()
            try:
                # Evaluate a simple network with channels last, with all three
                # loss functions:
                K.set_image_data_format('channels_last')
                data = np.moveaxis(data_channels_first, 1, -1)
                for index, loss_function in enumerate(losses_to_test):
                    labels = np.moveaxis(labels_channels_first[index], 1, -1)
                    inputs = keras.Input(shape=(3, 3, 1))
                    model = prepare_simple_model(inputs, loss_function, labels)
                    loss_channels_last[index] = model.evaluate(x=data,
                                                               y=labels,
                                                               batch_size=1,
                                                               verbose=0)

                # Evaluate the same network with channels first, with all three
                # loss functions:
                K.set_image_data_format('channels_first')
                data = data_channels_first
                for index, loss_function in enumerate(losses_to_test):
                    labels = labels_channels_first[index]
                    inputs = keras.Input(shape=(1, 3, 3))
                    model = prepare_simple_model(inputs, loss_function, labels)
                    loss_channels_first[index] = model.evaluate(x=data,
                                                                y=labels,
                                                                batch_size=1,
                                                                verbose=0)
            finally:
                # Always put the process-wide setting back, even on failure.
                K.set_image_data_format(old_data_format)

            np.testing.assert_allclose(loss_channels_first,
                                       loss_channels_last,
                                       err_msg='{}{}'.format(
                                           'Computed different losses for ',
                                           'channels_first and channels_last'))
def testMultiStepProfile(self):
    """Add three traced steps to a Profiler one at a time and check each view.

    After each `add_step`, the name-scope profile is re-queried and the set of
    nodes found in it is checked; finally the advice checkers are inspected.
    """
    ops.reset_default_graph()
    opts = model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
    opts['account_type_regexes'] = ['.*']

    with session.Session() as sess:
        r1, r2, r3 = lib.BuildSplitableModel()
        sess.run(variables.global_variables_initializer())

        profiler = model_analyzer.Profiler(sess.graph)
        # Profile taken before any step has been added.
        pb0 = profiler.profile_name_scope(opts)

        # Step 1: only r1 has been run.
        run_meta = config_pb2.RunMetadata()
        _ = sess.run(r1,
                     options=config_pb2.RunOptions(
                         trace_level=config_pb2.RunOptions.FULL_TRACE),
                     run_metadata=run_meta)
        profiler.add_step(1, run_meta)
        pb1 = profiler.profile_name_scope(opts)

        self.assertNotEqual(lib.SearchTFProfNode(pb1, 'DW'), None)
        self.assertEqual(lib.SearchTFProfNode(pb1, 'DW2'), None)
        self.assertEqual(lib.SearchTFProfNode(pb1, 'add'), None)

        # Step 2: r2 has been run as well.
        run_meta2 = config_pb2.RunMetadata()
        _ = sess.run(r2,
                     options=config_pb2.RunOptions(
                         trace_level=config_pb2.RunOptions.FULL_TRACE),
                     run_metadata=run_meta2)
        profiler.add_step(2, run_meta2)
        pb2 = profiler.profile_name_scope(opts)

        self.assertNotEqual(lib.SearchTFProfNode(pb2, 'DW'), None)
        self.assertNotEqual(lib.SearchTFProfNode(pb2, 'DW2'), None)
        self.assertEqual(lib.SearchTFProfNode(pb2, 'add'), None)

        # Step 3: r3 has been run as well.
        run_meta3 = config_pb2.RunMetadata()
        _ = sess.run(r3,
                     options=config_pb2.RunOptions(
                         trace_level=config_pb2.RunOptions.FULL_TRACE),
                     run_metadata=run_meta3)
        profiler.add_step(3, run_meta3)
        pb3 = profiler.profile_name_scope(opts)

        self.assertNotEqual(lib.SearchTFProfNode(pb3, 'DW'), None)
        self.assertNotEqual(lib.SearchTFProfNode(pb3, 'DW2'), None)
        self.assertNotEqual(lib.SearchTFProfNode(pb3, 'add'), None)

        # Each node is absent from the profile taken before its step and has
        # a positive exec time in the profile taken after it.
        self.assertEqual(lib.SearchTFProfNode(pb0, 'Conv2D'), None)
        self.assertGreater(lib.SearchTFProfNode(pb1, 'Conv2D').exec_micros, 0)
        self.assertEqual(lib.SearchTFProfNode(pb1, 'Conv2D_1'), None)
        self.assertGreater(lib.SearchTFProfNode(pb2, 'Conv2D_1').exec_micros, 0)
        self.assertEqual(lib.SearchTFProfNode(pb2, 'add'), None)
        self.assertGreater(lib.SearchTFProfNode(pb3, 'add').exec_micros, 0)

        advice_pb = profiler.advise(model_analyzer.ALL_ADVICE)
        self.assertTrue('AcceleratorUtilizationChecker' in advice_pb.checkers)
        self.assertTrue('ExpensiveOperationChecker' in advice_pb.checkers)
        self.assertTrue('OperationChecker' in advice_pb.checkers)

        # Accelerator-utilization reports are expected only when a GPU exists.
        checker = advice_pb.checkers['AcceleratorUtilizationChecker']
        if test.is_gpu_available():
            self.assertGreater(len(checker.reports), 0)
        else:
            self.assertEqual(len(checker.reports), 0)
        checker = advice_pb.checkers['ExpensiveOperationChecker']
        self.assertGreater(len(checker.reports), 0)
def test_depthwise_conv2d(self, kwargs):
    """Run the depthwise conv2d layer test with a 3x3 kernel.

    Args:
      kwargs: layer keyword arguments; 'kernel_size' is set on it in place.
    """
    kwargs['kernel_size'] = (3, 3)

    # A case that specifies 'data_format' only runs when a CUDA GPU exists.
    if 'data_format' in kwargs and not test.is_gpu_available(cuda_only=True):
        return
    self._run_test(kwargs)
def test_cudnnrnn_bidirectional(self):
    """Fit Bidirectional(CuDNNGRU) models built several different ways.

    Does nothing unless a CUDA GPU is available.
    """
    if not test.is_gpu_available(cuda_only=True):
        return

    def compile_and_fit(model, x, y):
        model.compile(loss='mse',
                      optimizer=RMSPropOptimizer(learning_rate=0.001))
        model.fit(x, y, epochs=1, batch_size=1)

    with self.test_session(use_gpu=True):
        rnn = keras.layers.CuDNNGRU
        samples = 2
        dim = 2
        timesteps = 2
        output_dim = 2
        mode = 'concat'

        x = np.random.random((samples, timesteps, dim))
        target_dim = 2 * output_dim if mode == 'concat' else output_dim
        y = np.random.random((samples, target_dim))

        # test with Sequential model
        model = keras.Sequential()
        model.add(
            keras.layers.Bidirectional(
                rnn(output_dim), merge_mode=mode, input_shape=(None, dim)))
        compile_and_fit(model, x, y)

        # test config
        model.get_config()
        model = keras.models.model_from_json(model.to_json())
        model.summary()

        # test stacked bidirectional layers
        model = keras.Sequential()
        model.add(
            keras.layers.Bidirectional(
                rnn(output_dim, return_sequences=True),
                merge_mode=mode,
                input_shape=(None, dim)))
        model.add(
            keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode))
        compile_and_fit(model, x, y)

        # test with functional API
        inputs = keras.Input((timesteps, dim))
        outputs = keras.layers.Bidirectional(
            rnn(output_dim), merge_mode=mode)(inputs)
        compile_and_fit(keras.Model(inputs, outputs), x, y)

        # Bidirectional and stateful
        inputs = keras.Input(batch_shape=(1, timesteps, dim))
        outputs = keras.layers.Bidirectional(
            rnn(output_dim, stateful=True), merge_mode=mode)(inputs)
        compile_and_fit(keras.Model(inputs, outputs), x, y)
def testOpEdgeCases(self, gpu_only=True):
    """Check that invalid fused conv/bias/activation arguments are rejected.

    Args:
      gpu_only: when true, the test logs a message and returns if no GPU is
        available.
    """
    if gpu_only and not test.is_gpu_available():
        tf_logging.info("Skipping OpEdgeCases tests.")
        return

    bad_strides = ".*strides.*in the batch and depth dimensions"
    # (exception type, message regex, input shape, filter shape, bias shape,
    #  strides, padding, activation mode)
    cases = [
        # Illegal strides.
        (errors_impl.UnimplementedError, bad_strides,
         [1, 1, 1, 1], [1, 1, 1, 1], [1], [2, 1, 1, 1], "SAME", "Relu"),
        (errors_impl.UnimplementedError, bad_strides,
         [1, 1, 1, 1], [1, 1, 1, 1], [1], [1, 1, 1, 2], "SAME", "Relu"),
        # Illegal activation mode.
        (ValueError, "Op passed string 'Tanh' not in:",
         [1, 1, 1, 1], [1, 1, 1, 1], [1], [1, 1, 1, 1], "SAME", "Tanh"),
        # Filter larger than input.
        (ValueError, "Negative dimension size",
         [32, 20, 20, 3], [20, 21, 3, 2], [2], [1, 1, 1, 1], "VALID", "Relu"),
        (ValueError, "Negative dimension size",
         [32, 20, 20, 3], [21, 20, 3, 2], [2], [1, 1, 1, 1], "VALID", "Relu"),
    ]

    with self.cached_session() as sess, self.test_scope():
        for (error_type, message_regex, input_shape, filter_shape,
             bias_shape, strides, padding, activation_mode) in cases:
            with self.assertRaisesRegexp(error_type, message_regex):
                sess.run(
                    fused_conv2d_bias_activation_op.
                    fused_conv2d_bias_activation(
                        _IotaNdF32Constant(input_shape),
                        _IotaNdF32Constant(filter_shape),
                        _IotaNdF32Constant(bias_shape),
                        strides=strides,
                        padding=padding,
                        activation_mode=activation_mode))
def test_whether_there_is_a_gpu(self):
    """Local GPU device discovery must agree with `test.is_gpu_available()`.

    The comparison is made on "at least one GPU device was found" rather than
    on the raw device count: comparing the count to a bool only holds for zero
    or one device and fails spuriously on multi-GPU hosts.
    """
    gpu_devices = replicate_model_fn._get_local_devices('GPU')
    self.assertEqual(len(gpu_devices) > 0, test.is_gpu_available())
def _CompareBackward(self, x, rank, fft_length=None, use_placeholder=False):
    """Delegate to the parent's backward comparison when a CUDA GPU exists.

    Without a CUDA GPU the comparison is skipped silently.
    """
    if not test.is_gpu_available(cuda_only=True):
        return
    super(RFFTOpsTest, self)._CompareBackward(
        x, rank, fft_length, use_placeholder)
def testError(self):
    """Check the errors raised for invalid inputs, for every valid FFT rank.

    Covers inputs of too low a rank, `fft_length` of the wrong rank or length,
    and direct kernel calls whose input is shorter than `fft_length`.
    """
    for rank in VALID_FFT_RANKS:
        # Inputs with fewer dimensions than `rank` must be rejected.
        for dims in xrange(0, rank):
            x = np.zeros((1, ) * dims).astype(np.complex64)
            with self.assertRaisesWithPredicateMatch(
                    ValueError,
                    "Shape .* must have rank at least {}".format(rank)):
                self._tfFFT(x, rank)
            with self.assertRaisesWithPredicateMatch(
                    ValueError,
                    "Shape .* must have rank at least {}".format(rank)):
                self._tfIFFT(x, rank)
        # NOTE(review): `dims` is not used in this loop body (`x` is built
        # from `rank`), so both iterations run identical checks -- confirm
        # whether `(1, ) * dims` was intended.
        for dims in xrange(rank, rank + 2):
            x = np.zeros((1, ) * rank)

            # Test non-rank-1 fft_length produces an error.
            fft_length = np.zeros((1, 1)).astype(np.int32)
            with self.assertRaisesWithPredicateMatch(
                    ValueError, "Shape .* must have rank 1"):
                self._tfFFT(x, rank, fft_length)
            with self.assertRaisesWithPredicateMatch(
                    ValueError, "Shape .* must have rank 1"):
                self._tfIFFT(x, rank, fft_length)

            # Test wrong fft_length length.
            fft_length = np.zeros((rank + 1, )).astype(np.int32)
            with self.assertRaisesWithPredicateMatch(
                    ValueError,
                    "Dimension must be .*but is {}.*".format(rank + 1)):
                self._tfFFT(x, rank, fft_length)
            with self.assertRaisesWithPredicateMatch(
                    ValueError,
                    "Dimension must be .*but is {}.*".format(rank + 1)):
                self._tfIFFT(x, rank, fft_length)

        # Test that calling the kernel directly without padding to fft_length
        # produces an error.
        rffts_for_rank = {
            1: [gen_spectral_ops.rfft, gen_spectral_ops.irfft],
            2: [gen_spectral_ops.rfft2d, gen_spectral_ops.irfft2d],
            3: [gen_spectral_ops.rfft3d, gen_spectral_ops.irfft3d]
        }
        rfft_fn, irfft_fn = rffts_for_rank[rank]
        with self.assertRaisesWithPredicateMatch(
                errors.InvalidArgumentError,
                "Input dimension .* must have length of at least 6 but got: 5"
        ):
            x = np.zeros((5, ) * rank).astype(np.float32)
            fft_length = [6] * rank
            with self.test_session():
                rfft_fn(x, fft_length).eval()

        # TODO(rjryan): Remove when CPU-based IRFFT is supported.
        if test.is_gpu_available(cuda_only=True):
            with self.assertRaisesWithPredicateMatch(
                    errors.InvalidArgumentError,
                    "Input dimension .* must have length of at least .* but got: 3"
            ):
                x = np.zeros((3, ) * rank).astype(np.complex64)
                fft_length = [6] * rank
                with self.test_session():
                    irfft_fn(x, fft_length).eval()