def test_multi_device_inputs(self):
    """Each worker tower must be fed a distinct input batch.

    Runs the gradient op twice, fetches the ``feature_input`` tensor
    from each of the four worker scopes in one ``sess.run`` call, and
    asserts that every pair of device inputs differs.
    """
    test_driver = get_initialised_driver()
    graph = test_driver.create_graph(
        test_driver.app, test_driver.num_gpus, True)
    with self.test_session(graph=graph) as sess:
        GRAPH_CREATED.send(test_driver.app, iter_msg=None)
        SESS_STARTED.send(test_driver.app, iter_msg=None)
        for _ in range(2):
            sess.run(test_driver.app.gradient_op)
        # fetch all four per-device input tensors in a single run call
        s_0, s_1, s_2, s_3 = sess.run([
            tf.get_default_graph().get_tensor_by_name(
                'worker_0/feature_input:0'),
            tf.get_default_graph().get_tensor_by_name(
                'worker_1/feature_input:0'),
            tf.get_default_graph().get_tensor_by_name(
                'worker_2/feature_input:0'),
            tf.get_default_graph().get_tensor_by_name(
                'worker_3/feature_input:0')
        ])
        msg = 'same input data for different devices'
        # pairwise: no two devices may see identical data
        self.assertGreater(np.sum(np.abs(s_0 - s_1)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(s_0 - s_2)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(s_0 - s_3)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(s_1 - s_2)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(s_1 - s_3)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(s_2 - s_3)), 0.0, msg)
        # fix: keyword was misspelled 'itermsg'; handlers that expect
        # 'iter_msg' (as used by every other send in this file) would
        # never receive the message argument
        SESS_FINISHED.send(test_driver.app, iter_msg=None)
        test_driver.app.stop()
def run(self, application, graph=None):
    """
    Initialise a TF graph, connect data sampler and network within
    the graph context, run training loops or inference loops.

    :param application: a niftynet application
    :param graph: default base graph to run the application
    :return:
    """
    # build the graph lazily when the caller did not supply one
    if graph is None:
        graph = ApplicationDriver.create_graph(
            application=application,
            num_gpus=self.num_gpus,
            num_threads=self.num_threads,
            is_training_action=self.is_training_action)

    start_time = time.time()
    # loop_status is shared with the engine loop so that the cleanup
    # code below can see how far the loop progressed
    loop_status = {'current_iter': self.initial_iter, 'normal_exit': False}
    with tf.Session(config=tf_config(), graph=graph):
        try:
            # broadcasting event of session started
            SESS_STARTED.send(application, iter_msg=None)
            # create a iteration message generator and
            # iteratively run the graph (the main engine loop)
            iteration_messages = self._generator(**vars(self))()
            ApplicationDriver.loop(application=application,
                                   iteration_messages=iteration_messages,
                                   loop_status=loop_status)
        except KeyboardInterrupt:
            tf.logging.warning('User cancelled application')
        except (tf.errors.OutOfRangeError, EOFError):
            if not loop_status.get('normal_exit', False):
                # reached the end of inference Dataset
                loop_status['normal_exit'] = True
        except RuntimeError:
            # print the traceback but carry on to the cleanup below
            import sys
            import traceback
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      file=sys.stdout)
        finally:
            tf.logging.info('cleaning up...')
            # broadcasting session finished event, carrying the last
            # completed iteration so handlers (e.g. model savers) can act
            iter_msg = IterationMessage()
            iter_msg.current_iter = loop_status.get('current_iter', -1)
            SESS_FINISHED.send(application, iter_msg=iter_msg)
            application.stop()
            if not loop_status.get('normal_exit', False):
                # loop didn't finish normally
                tf.logging.warning('stopped early, incomplete iterations.')
    tf.logging.info("%s stopped (time in second %.2f).",
                    type(application).__name__,
                    (time.time() - start_time))
def test_training_update(self):
    """A single gradient step must change the conv layer weights.

    Snapshots a weight tensor before and after one run of the train
    op and asserts the L1 distance between the snapshots is positive.
    """
    test_driver = get_initialised_driver()
    graph = test_driver.create_graph(test_driver.app, 1, True)
    with self.test_session(graph=graph) as sess:
        SESS_STARTED.send(test_driver.app, iter_msg=None)
        train_op = test_driver.app.gradient_op
        test_tensor = graph.get_tensor_by_name('G/conv_bn_selu/conv_/w:0')
        var_0 = sess.run(test_tensor)
        sess.run(train_op)
        var_1 = sess.run(test_tensor)
        # renamed from 'square_diff': this is a sum of absolute
        # differences (L1), not a squared distance
        abs_diff = np.sum(np.abs(var_0 - var_1))
        self.assertGreater(abs_diff, 0.0, 'train_op does not change model')
        # fix: 'itermsg' -> 'iter_msg', matching the keyword used by
        # every other signal send in this file
        SESS_FINISHED.send(test_driver.app, iter_msg=None)
        test_driver.app.stop()
def test_multi_device_multi_optimiser_gradients(self):
    """Per-device gradients of both optimisers must differ and average.

    For an application with two optimisers (generator/discriminator),
    samples one gradient tensor per worker for each optimiser plus the
    averaged tensor, then delegates the checks to ``check_gradients``.
    """
    test_driver = get_initialised_driver(
        application='tests.toy_application.ToyApplicationMultOpti')
    graph = test_driver.create_graph(
        test_driver.app, test_driver.num_gpus, True)
    with self.test_session(graph=graph) as sess:
        SESS_STARTED.send(test_driver.app, iter_msg=None)
        for _ in range(2):
            sess.run(test_driver.app.gradient_op)
        # query discriminator gradient sample to check
        dis_0, dis_1, dis_2, dis_3, dis_ave = sess.run([
            graph.get_tensor_by_name(
                'worker_0/ComputeGradientsD/gradients/AddN_5:0'),
            graph.get_tensor_by_name(
                'worker_1/ComputeGradientsD/gradients/AddN_5:0'),
            graph.get_tensor_by_name(
                'worker_2/ComputeGradientsD/gradients/AddN_5:0'),
            graph.get_tensor_by_name(
                'worker_3/ComputeGradientsD/gradients/AddN_5:0'),
            graph.get_tensor_by_name('ApplyGradients/AveOverDevices:0')
        ])
        # query generator gradient sample to check
        gen_0, gen_1, gen_2, gen_3, gen_ave = sess.run([
            graph.get_tensor_by_name(
                'worker_0/ComputeGradientsG/gradients/worker_0/tinynet/G/conv/conv_/conv/ExpandDims_1_grad/Reshape:0'
            ),
            graph.get_tensor_by_name(
                'worker_1/ComputeGradientsG/gradients/worker_1/tinynet/G/conv/conv_/conv/ExpandDims_1_grad/Reshape:0'
            ),
            graph.get_tensor_by_name(
                'worker_2/ComputeGradientsG/gradients/worker_2/tinynet/G/conv/conv_/conv/ExpandDims_1_grad/Reshape:0'
            ),
            graph.get_tensor_by_name(
                'worker_3/ComputeGradientsG/gradients/worker_3/tinynet/G/conv/conv_/conv/ExpandDims_1_grad/Reshape:0'
            ),
            graph.get_tensor_by_name(
                'ApplyGradients/AveOverDevices_14:0')
        ])
        self.check_gradients(gen_0, gen_1, gen_2, gen_3, gen_ave)
        self.check_gradients(dis_0, dis_1, dis_2, dis_3, dis_ave)
        # fix: 'itermsg' -> 'iter_msg' to match the signal contract
        SESS_FINISHED.send(test_driver.app, iter_msg=None)
        test_driver.app.stop()
def __init__(self, model_dir, save_every_n=0, max_checkpoints=1,
             is_training_action=True, **_unused):
    """Wire checkpoint saving into the engine's signal lifecycle.

    :param model_dir: directory used to derive the checkpoint name prefix
    :param save_every_n: save a checkpoint every n iterations (0 disables)
    :param max_checkpoints: number of checkpoint files to retain
    :param is_training_action: when True, also save once at session end
    """
    self.saver = None  # built lazily once the graph is finalised
    self.save_every_n = save_every_n
    self.max_checkpoints = max_checkpoints
    self.file_name_prefix = make_model_name(model_dir)

    # the saver can only be constructed after the graph exists
    SESS_STARTED.connect(self.init_saver)
    # periodic snapshots during the run, if a positive frequency is set
    if save_every_n > 0:
        ITER_FINISHED.connect(self.save_model_interval)
    # final snapshot before exiting a training run
    if is_training_action:
        SESS_FINISHED.connect(self.save_model)
def test_multi_device_gradients(self):
    """Per-device gradients must differ and average correctly.

    Fetches one gradient tensor per worker plus the averaged tensor,
    asserts every pair of device gradients differs, and checks the
    averaged tensor equals the numpy mean of the four device values.
    """
    test_driver = get_initialised_driver()
    graph = test_driver.create_graph(
        test_driver.app, test_driver.num_gpus, True)
    with self.test_session(graph=graph) as sess:
        GRAPH_CREATED.send(test_driver.app, iter_msg=None)
        SESS_STARTED.send(test_driver.app, iter_msg=None)
        for _ in range(2):
            sess.run(test_driver.app.gradient_op)
        g_0, g_1, g_2, g_3, g_ave = sess.run([
            tf.get_default_graph().get_tensor_by_name(
                'worker_0/ComputeGradients/gradients/AddN_5:0'),
            tf.get_default_graph().get_tensor_by_name(
                'worker_1/ComputeGradients/gradients/AddN_5:0'),
            tf.get_default_graph().get_tensor_by_name(
                'worker_2/ComputeGradients/gradients/AddN_5:0'),
            tf.get_default_graph().get_tensor_by_name(
                'worker_3/ComputeGradients/gradients/AddN_5:0'),
            tf.get_default_graph().get_tensor_by_name(
                'ApplyGradients/AveOverDevices:0')
        ])
        msg = 'same gradients for different devices'
        # pairwise: devices see different data, so gradients must differ
        self.assertGreater(np.sum(np.abs(g_0 - g_1)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(g_0 - g_2)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(g_0 - g_3)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(g_1 - g_2)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(g_1 - g_3)), 0.0, msg)
        self.assertGreater(np.sum(np.abs(g_2 - g_3)), 0.0, msg)
        # the graph-side average must match the numpy elementwise mean
        g_array = np.concatenate([
            g_0.reshape((1, -1)),
            g_1.reshape((1, -1)),
            g_2.reshape((1, -1)),
            g_3.reshape((1, -1))
        ], axis=0)
        g_ave = g_ave.reshape(-1)
        g_np_ave = np.mean(g_array, axis=0)
        self.assertAllClose(g_np_ave, g_ave)
        # fix: 'itermsg' -> 'iter_msg' to match the signal contract
        SESS_FINISHED.send(test_driver.app, iter_msg=None)
        test_driver.app.stop()
def test_multi_device_gradients(self):
    """Per-device gradients must differ and average across devices.

    Samples one gradient tensor per worker plus the device-averaged
    tensor and delegates the assertions to ``check_gradients``.
    """
    test_driver = get_initialised_driver()
    graph = test_driver.create_graph(
        test_driver.app, test_driver.num_gpus, True)
    with self.test_session(graph=graph) as sess:
        SESS_STARTED.send(test_driver.app, iter_msg=None)
        for _ in range(2):
            sess.run(test_driver.app.gradient_op)
        g_0, g_1, g_2, g_3, g_ave = sess.run([
            graph.get_tensor_by_name(
                'worker_0/ComputeGradients/gradients/AddN_5:0'),
            graph.get_tensor_by_name(
                'worker_1/ComputeGradients/gradients/AddN_5:0'),
            graph.get_tensor_by_name(
                'worker_2/ComputeGradients/gradients/AddN_5:0'),
            graph.get_tensor_by_name(
                'worker_3/ComputeGradients/gradients/AddN_5:0'),
            graph.get_tensor_by_name('ApplyGradients/AveOverDevices:0')
        ])
        self.check_gradients(g_0, g_1, g_2, g_3, g_ave)
        # fix: 'itermsg' -> 'iter_msg' to match the signal contract
        SESS_FINISHED.send(test_driver.app, iter_msg=None)
        test_driver.app.stop()
def __init__(self, **_unused):
    """Hook sampler-thread tear-down to the session-finished signal.

    Thread start-up is not connected here (the original left the
    ``SESS_STARTED`` hookup disabled); only the clean shutdown of the
    sampler threads is registered.
    """
    SESS_FINISHED.connect(self.stop_sampler_threads)