def testTrainReplicated(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_model_fn(features, labels, mode):  # pylint: disable=unused-argument
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
    loss = ipu.ops.cross_replica_ops.cross_replica_sum(features, name="loss")
    train_op = array_ops.identity(loss)
    return model_fn_lib.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  def my_input_fn():
    dataset = tu.create_dual_increasing_dataset(10,
                                                data_shape=[1],
                                                label_shape=[1])
    dataset = dataset.batch(batch_size=1, drop_remainder=True)
    return dataset

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options),
      log_step_count_steps=1,
      save_summary_steps=1)

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  session_run_counter = _SessionRunCounter()

  num_steps = 6
  estimator.train(input_fn=my_input_fn,
                  steps=num_steps,
                  hooks=[session_run_counter])

  self.assertEqual(session_run_counter.num_session_runs,
                   num_steps // config.ipu_run_config.iterations_per_loop)

  model_dir = estimator.model_dir
  events_file = glob.glob(model_dir + "/*tfevents*")
  assert len(events_file) == 1
  events_file = events_file[0]

  loss_output = list()
  for e in summary_iterator.summary_iterator(events_file):
    for v in e.summary.value:
      if "loss" in v.tag:
        loss_output.append(v.simple_value)

  # loss is averaged across iterations per loop
  self.assertEqual(loss_output, [14.0, 16.0, 18.0])
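# Worked check of the expected losses above (assuming the four replicas
# dequeue consecutive dataset elements, with values cycling 0..9): the
# per-step cross-replica sums are 6, 22, 18, 14, 30, 6, and each pair of
# iterations per loop is averaged into one summary: (6+22)/2 = 14,
# (18+14)/2 = 16, (30+6)/2 = 18, matching [14.0, 16.0, 18.0].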
def test_all_reduce(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  strategy = ipu_strategy.IPUStrategy()

  def make_all_reduce_function(reduce_op):
    @def_function.function(experimental_compile=True)
    def all_reduce_function():
      replica_ctx = distribution_strategy_context.get_replica_context()
      x = math_ops.cast(replication_ops.replication_index(), np.float32)
      return replica_ctx.all_reduce(reduce_op, x)

    return all_reduce_function

  report = tu.ReportJSON(self, eager_mode=True, replicated=True)
  report.reset()

  with strategy.scope():
    summed = strategy.experimental_run_v2(
        make_all_reduce_function(reduce_util.ReduceOp.SUM))
    self.assertEqual(1.0, summed.numpy())

    mean = strategy.experimental_run_v2(
        make_all_reduce_function(reduce_util.ReduceOp.MEAN))
    self.assertEqual(0.5, mean.numpy())
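# Worked check (assuming ReportJSON(replicated=True) configures two
# replicas, as the assertions imply): replication_index() is 0 on one
# replica and 1 on the other, so the SUM all-reduce gives 0 + 1 = 1.0 and
# the MEAN all-reduce gives (0 + 1) / 2 = 0.5.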
def testPipelineCompare3(self):
  if utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def dataset_fn():
    dataset = tu.create_single_increasing_dataset(10, shape=[4])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def my_dataset_parser(value):
      label = math_ops.reduce_mean(value, axis=[1])
      return math_ops.cast(value,
                           np.int32), math_ops.cast(label / 10, np.int32)

    return dataset.map(my_dataset_parser)

  gradient_accumulation_count = 20
  repeat_count = 2
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)

  def stage1(idx, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      embedding = variable_scope.get_variable(
          "c",
          shape=[10, 1216],
          dtype=np.float32,
          initializer=init_ops.constant_initializer(10.01),
          trainable=True)
      x = embedding_ops.embedding_lookup(embedding, idx)
      return x, label

  def stage2(x, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      return x, label

  def stage3(x, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      return x, label

  def stage4(x, label):
    with variable_scope.variable_scope("stage4", use_resource=True):
      logits = math_ops.reduce_sum(x, axis=[-1])
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                      labels=label))
      return loss

  pipelining_test_util.PipelineTester.compare_pipeline_to_cpu(
      [stage1, stage2, stage3, stage4],
      lambda: [], [],
      repeat_count,
      gradient_accumulation_count,
      dataset_fn,
      optimizer,
      self,
      13821,
      True,
      schedule=pipelining_ops.PipelineSchedule.Interleaved)
def testKerasLenet(self):
  """Check that PoplarExecutableRunner produces the same output as the
  original Graph execution.
  """
  if utils.running_on_ipu_model():
    self.skipTest("PoplarExecutableRunner only works with physical IPUs")

  with tempfile.TemporaryDirectory() as tmp:
    poplar_binaries_folder = os.path.join(tmp, "poplar")
    model_path = os.path.join(tmp, "model")
    weights_file = os.path.join(tmp, "weights.bin")
    output_path = os.path.join(tmp, "output")
    input_values = np.random.uniform(size=(1, 32, 32, 1))
    input_file = "%s/input.bin" % tmp

    with self.session() as sess:
      self.configureIPU(poplar_binaries_folder, False)
      with ops.device("/device:IPU:0"):
        out, inp, model = instantiate_lenet()
      utils.move_variable_initialization_to_cpu()
      sess.run(global_variables_initializer())

      utils.export_inputs_to_file([inp], input_file, {inp: input_values})

      # Run the model once to generate the poplar binaries.
      reference_values = sess.run(out, {inp: input_values})

      # Export the model & weights.
      saved_model.save(model, model_path)

    metadata_file = self.getSingleFileWithExt(poplar_binaries_folder, "json")
    executable_file = self.getSingleFileWithExt(poplar_binaries_folder,
                                                "ipu_bin")

    self.runPythonCommand(
        (("./tensorflow/compiler/plugin/poplar/tools/"
          "tensorflow_weights_extractor.py -o %s -s %s -m %s") %
         (weights_file, model_path, metadata_file)).split())

    self.runCommand((("./third_party/ipus/tools/PoplarExecutableRunner"
                      " --binaries %s,%s,%s "
                      "--output_folder=%s --strict") % (
                          executable_file,
                          weights_file,
                          input_file,
                          output_path,
                      )).split())

    output_file = self.getSingleFileWithExt(output_path, "data")
    with open(output_file, 'r') as f:
      runner_values = np.array(json.load(f))
      logging.info("Reference %s\nRunner: %s", reference_values,
                   runner_values)
      self.assertAllClose(reference_values, runner_values)
def _gradient_accumulation_loop(test_wrapper, fwd_fn, inputs_fn, input_values,
                                repeat_count, num_batches_to_accumulate,
                                dataset_fn, optimizer, num_iterations=None):
  g = ops.Graph()

  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
            optimizer, num_batches_to_accumulate)
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    profiling = utils.running_on_ipu_model()
    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
    return session.run(outfeed_op)
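# A minimal usage sketch for _gradient_accumulation_loop (hypothetical; the
# dataset_fn and fwd_fn below are illustrative, mirroring patterns used
# elsewhere in this file): accumulate gradients over 8 batches, repeated
# twice, and return the per-batch losses from the outfeed.
#
#   def dataset_fn():
#     dataset = tu.create_single_increasing_dataset(7, shape=[4, 4, 2])
#     return dataset.batch(batch_size=2, drop_remainder=True)
#
#   def fwd_fn(img):
#     y = layers.Dense(2)(img)
#     return math_ops.reduce_sum(y)
#
#   opt = gradient_descent.GradientDescentOptimizer(0.01)
#   losses = _gradient_accumulation_loop(self, fwd_fn, lambda: [], [], 2, 8,
#                                        dataset_fn, opt)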
def testTrainWithAutomaticSharding(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_model_fn(features, labels, mode):
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

    with variable_scope.variable_scope("vs", use_resource=True):
      predictions = layers.Dense(units=1)(features)

    loss = losses.mean_squared_error(labels=labels, predictions=predictions)
    sharded_optimizer_obj = sharded_optimizer.ShardedOptimizer(
        gradient_descent.GradientDescentOptimizer(0.1))
    train_op = sharded_optimizer_obj.minimize(loss)

    return model_fn_lib.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  def my_input_fn():
    dataset = dataset_ops.Dataset.from_tensor_slices(
        _create_regression_dataset(num_samples=1000, num_features=5))
    dataset = dataset.batch(batch_size=2, drop_remainder=True).repeat()
    return dataset

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                 num_shards=4,
                                                 autosharding=True,
                                                 ipu_options=ipu_options),
      log_step_count_steps=1,
      save_summary_steps=1)

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  estimator.train(input_fn=my_input_fn, steps=10)

  model_dir = estimator.model_dir
  events_file = glob.glob(model_dir + "/*tfevents*")
  assert len(events_file) == 1
  events_file = events_file[0]

  loss_output = list()
  for e in summary_iterator.summary_iterator(events_file):
    for v in e.summary.value:
      if "loss" in v.tag:
        loss_output.append(v.simple_value)

  # The loss should have decreased over the course of training.
  self.assertTrue(loss_output[0] > loss_output[-1])
def testReplicatedEvaluationOnHost(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    features = [0, 0, 0, 1, 0, 0, 0, 1]
    labels = [0, 1, 0, 1, 0, 1, 0, 1]
    return dataset_ops.Dataset.from_tensor_slices(
        (features, labels)).batch(2, drop_remainder=True)

  def my_metrics_fn(features, labels):
    labels64 = math_ops.cast(labels, np.int64)
    return {
        "accuracy": metrics_impl.accuracy(labels, features),
        "precision": metrics_impl.precision(labels, features),
        "recall": metrics_impl.recall(labels, features),
        "recall_at_1": metrics_impl.recall_at_k(labels64, features, k=1),
        "recall_at_2": metrics_impl.recall_at_k(labels64, features, k=2),
        "mse": metrics_impl.mean_squared_error(labels, features),
        "rmse": metrics_impl.root_mean_squared_error(labels, features),
    }

  def my_model_fn(features, labels, mode):
    loss = math_ops.cast(replication_ops.replication_index(), np.float32)
    eval_metrics = (my_metrics_fn, [features, labels])
    return ipu_estimator.IPUEstimatorSpec(mode,
                                          loss=loss,
                                          eval_metrics=eval_metrics)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  scores = estimator.evaluate(my_input_fn, steps=1)
  self.assertEqual(0.75, scores["accuracy"])
  self.assertEqual(1.0, scores["precision"])
  self.assertEqual(0.5, scores["recall"])
  self.assertEqual(0.5, scores["recall_at_1"])
  self.assertEqual(1.0, scores["recall_at_2"])
  self.assertEqual(0.25, scores["mse"])
  self.assertEqual(0.5, scores["rmse"])
  self.assertEqual(1.5, scores[model_fn_lib.LOSS_METRIC_KEY])
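# Worked check of the expected metrics (derived from the fixed inputs
# above): the predictions (features) match the labels in 6 of 8 positions,
# so accuracy is 6/8 = 0.75; both predicted positives (indices 3 and 7) are
# true positives, so precision is 2/2 = 1.0; only 2 of the 4 actual
# positives are predicted, so recall is 2/4 = 0.5; the squared error is 1 at
# indices 1 and 5, so mse is 2/8 = 0.25 and rmse is sqrt(0.25) = 0.5; the
# loss is the replication index averaged over 4 replicas: (0+1+2+3)/4 = 1.5.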
def testReplicatedPrediction(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    features = [
        [1.0],  # IPU0
        [3.0],  # IPU0
        [5.0],  # IPU1
        [3.0],  # IPU1
        [7.0],  # IPU2
        [3.0],  # IPU2
        [9.0],  # IPU3
        [3.0],  # IPU3
    ]
    return dataset_ops.Dataset.from_tensor_slices(features).batch(
        batch_size=2, drop_remainder=True)

  hook = ipu_session_run_hooks.IPULoggingTensorHook(every_n_iter=1,
                                                    replication_factor=4)

  def my_model_fn(features, mode):
    logging_op = hook.log({"features": features})
    with ops.control_dependencies([logging_op]):
      predictions = math_ops.reduce_max(features)
    return model_fn_lib.EstimatorSpec(
        mode,
        predictions=predictions,
    )

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  outputs = estimator.predict(input_fn=my_input_fn,
                              yield_single_examples=True)
  self.assertEqual(3.0, next(outputs))
  self.assertEqual(5.0, next(outputs))

  outputs = estimator.predict(input_fn=my_input_fn,
                              yield_single_examples=False,
                              hooks=[hook])
  np.testing.assert_array_equal([3.0, 5.0, 7.0, 9.0], next(outputs))
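# Worked check (assumption: each replica reduces over its own two-element
# batch): the per-replica maxima are max(1,3)=3, max(5,3)=5, max(7,3)=7 and
# max(9,3)=9, so yield_single_examples=True yields 3.0 and then 5.0, while
# yield_single_examples=False yields the whole [3.0, 5.0, 7.0, 9.0] at once.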
def testPipelineCompare7(self):
  if utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  # Stages 1 and 2 don't have a backward stage.
  def dataset_fn():
    dataset = tu.create_single_increasing_dataset(7, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
      img = value / 7
      label = value[0][0][0][0]
      return img, label

    return dataset.map(dataset_parser)

  gradient_accumulation_count = 16
  repeat_count = 2
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)

  def stage1(c, img, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      return img, c, label

  def stage2(x, c, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      with ops.control_dependencies([internal_ops.print_tensor(x)]):
        return x * 20, c, label

  def stage3(x, c, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      return layers.Dense(
          2,
          kernel_initializer=init_ops.constant_initializer(0.5),
          bias_initializer=init_ops.constant_initializer(0.5))(x), c, label

  def stage4(x, c, label):
    with variable_scope.variable_scope("stage4", use_resource=True):
      return math_ops.reduce_sum(x) + c + label

  def inputs_fn():
    with ops.device('cpu'):
      return [array_ops.placeholder(np.float32, shape=[])]

  pipelining_test_util.PipelineTester.compare_pipeline_to_cpu(
      [stage1, stage2, stage3, stage4], inputs_fn, [10.01], repeat_count,
      gradient_accumulation_count, dataset_fn, optimizer, self, 14502, True,
      pipelining_ops.PipelineSchedule.Grouped)
def testReplicatedEvaluation(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    # IPU0 mean: 2, max: 3
    # IPU1 mean: 4, max: 5
    features = [
        [1.0],  # IPU0
        [3.0],  # IPU0
        [5.0],  # IPU1
        [3.0],  # IPU1
        [1.0],  # IPU2
        [3.0],  # IPU2
        [5.0],  # IPU3
        [3.0],  # IPU3
    ]
    return dataset_ops.Dataset.from_tensor_slices(features).batch(
        batch_size=2, drop_remainder=True)

  def my_model_fn(features, mode):
    loss = math_ops.reduce_max(features)
    eval_metric_ops = {
        "feature_mean": metrics_impl.mean(features),
    }
    return model_fn_lib.EstimatorSpec(mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metric_ops)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  scores = estimator.evaluate(my_input_fn, steps=1)
  self.assertEqual(3., scores["feature_mean"])
  self.assertEqual(4., scores[model_fn_lib.LOSS_METRIC_KEY])
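# Worked check: across all eight samples the feature mean is
# (1+3+5+3+1+3+5+3)/8 = 3.0, and the per-replica losses (the batch maxima
# 3, 5, 3, 5) average to 4.0, which matches the two assertions above.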
def test_optimizer(self):
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  strategy = ipu_strategy.IPUStrategy()

  report = tu.ReportJSON(self, eager_mode=True, replicated=True)
  report.reset()

  with strategy.scope():
    initial_variable = 2.0
    variable = variables.Variable(initial_variable)
    learning_rate = 0.5
    num_iterations = 3

    data = [1.0, 2.0]
    dataset = dataset_ops.Dataset.from_tensor_slices((data))
    dataset = dataset.repeat(num_iterations)
    infeed = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                             feed_name="feed",
                                             replication_factor=2)

    optimizer = keras.optimizer_v2.gradient_descent.SGD(learning_rate)

    @def_function.function(experimental_compile=True)
    def apply_gradient():
      gradient = infeed._dequeue()  # pylint: disable=protected-access
      optimizer.apply_gradients([(gradient, variable)])

    # The optimizers in v2 will sum the gradients, and not average them.
    expected_gradient = np.sum(data)
    expected_variable = initial_variable

    infeed.initializer  # pylint: disable=pointless-statement

    for _ in range(num_iterations):
      strategy.experimental_run_v2(apply_gradient)
      expected_variable -= learning_rate * expected_gradient

    self.assertEqual(expected_variable, variable.numpy())
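# Worked check: with the two replica gradients summed, each iteration
# applies
#   variable -= 0.5 * (1.0 + 2.0)
# so the variable goes 2.0 -> 0.5 -> -1.0 -> -2.5 over the three iterations.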
def testPipelineCompare1(self):
  if utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def dataset_fn():
    dataset = tu.create_single_increasing_dataset(7, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def my_dataset_parser(value):
      img = value / 7
      label = value[0][0][0][0]
      return img, label

    return dataset.map(my_dataset_parser)

  gradient_accumulation_count = 20
  repeat_count = 2
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)

  def stage1(c, img, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      y = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.constant_initializer(0.5),
          bias_initializer=init_ops.constant_initializer(0.5),
          name='conv1')(img)
      return y, c, label

  def stage2(x, c, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      return x * 20, c, label

  def stage3(x, c, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      return layers.Dense(
          2,
          kernel_initializer=init_ops.constant_initializer(0.5),
          bias_initializer=init_ops.constant_initializer(0.5))(x), c, label

  def stage4(x, c, label):
    with variable_scope.variable_scope("stage4", use_resource=True):
      return math_ops.reduce_sum(
          layers.Dense(
              2,
              kernel_initializer=init_ops.constant_initializer(0.5),
              bias_initializer=init_ops.constant_initializer(0.5))
          (x)) + c + label

  def inputs_fn():
    with ops.device('cpu'):
      return [array_ops.placeholder(np.float32, shape=[])]

  pipelining_test_util.PipelineTester.compare_pipeline_to_cpu(
      [stage1, stage2, stage3, stage4],
      inputs_fn, [10.01],
      repeat_count,
      gradient_accumulation_count,
      dataset_fn,
      optimizer,
      self,
      14374,
      True,
      schedule=pipelining_ops.PipelineSchedule.Interleaved)
def pipeline_on_ipu(stages,
                    inputs_fn,
                    input_values,
                    repeat_count,
                    gradient_accumulation_count,
                    dataset_fn,
                    optimizer,
                    test_wrapper,
                    expected_max_tile_memory,
                    recomp,
                    schedule,
                    device_mapping=None,
                    batch_serialization_iterations=1):
  g = ops.Graph()
  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def optimizer_function(loss):
        return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)

      def my_net(*args):
        return pipelining_ops.pipeline(
            stages,
            gradient_accumulation_count,
            repeat_count=repeat_count,
            batch_serialization_iterations=batch_serialization_iterations,
            inputs=args,
            optimizer_function=optimizer_function,
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=schedule,
            device_mapping=device_mapping)

    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(my_net, inputs=inputs)

    # Execution profiles of code with dynamic control flow are not supported
    # on real HW.
    profiling = utils.running_on_ipu_model()
    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    num_ipus = get_num_ipus(device_mapping) if device_mapping else 4
    cfg = utils.auto_select_ipus(cfg, num_ipus)
    if recomp:
      cfg = utils.set_recomputation_options(cfg, allow_recompute=True)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    outfeed_op = outfeed_queue.dequeue()

    report = tu.ReportJSON(test_wrapper, session, configure_device=False)

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    report.reset()
    session.run(compiled_model_pipeline,
                feed_dict=dict(zip(inputs, input_values)))
    out = session.run(outfeed_op)[0]
    if profiling:
      report.parse_log()
      if not device_mapping:
        device_mapping = [
            i - (i % 4) + ((i % 4) if (i % 4) < 2 else 5 - (i % 4))
            for i in range(len(stages))
        ]
      report.assert_pipeline_stages_on_expected_ipu(device_mapping)
      report.assert_max_tile_memory(expected_max_tile_memory, tolerance=0.3)
    return out
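# For reference (derived from the fallback expression above): with four
# stages and no explicit device_mapping the stages map to IPUs [0, 1, 3, 2],
# and an eight-stage pipeline would map to [0, 1, 3, 2, 4, 5, 7, 6],
# presumably so that consecutive stages sit on neighbouring IPUs within each
# group of four.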
def _sharded_on_ipu(stages, inputs_fn, input_values, repeat_count,
                    num_batches_to_accumulate, dataset_fn, optimizer,
                    test_wrapper, recomp, device_mapping):
  g = ops.Graph()
  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu_sharded",
                                       use_resource=True,
                                       reuse=False):
      if device_mapping is None:
        device_mapping = range(len(stages))

      def pipeline(*args):
        outputs = args
        for i, stage in zip(device_mapping, stages):
          with scopes.ipu_shard(i):
            outputs = stage(*functional_ops._convert_to_list(outputs))  # pylint: disable=W0212
        loss = outputs
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizer(
            optimizer, num_batches_to_accumulate)
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_batches_to_accumulate,
                            pipeline,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    # Execution profiles of code with dynamic control flow are not supported
    # on real HW.
    profiling = utils.running_on_ipu_model()
    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    num_ipus = get_num_ipus(device_mapping) if device_mapping else 4
    cfg = utils.auto_select_ipus(cfg, num_ipus)
    if recomp:
      cfg = utils.set_recomputation_options(cfg, allow_recompute=True)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    for _ in range(repeat_count):
      session.run(compiled_model_pipeline,
                  feed_dict=dict(zip(inputs, input_values)))
    return session.run(outfeed_op)