def _create_graph(self, graph=None):
    """
    TensorFlow graph is only created within this function.
    """
    # NOTE: a default argument of tf.Graph() would be evaluated once at
    # function definition time and shared across calls; create it lazily
    if graph is None:
        graph = tf.Graph()
    assert isinstance(graph, tf.Graph)
    main_device = self._device_string(0, is_worker=False)
    # start constructing the graph, handling training and inference cases
    with graph.as_default(), tf.device(main_device):
        # initialise network; these are connected in
        # the context of multiple GPUs
        self.app.initialise_network()
        self.app.add_validation_flag()

        # for data parallelism --
        # defining and collecting variables from multiple devices
        bn_ops = None
        for gpu_id in range(0, max(self.num_gpus, 1)):
            worker_device = self._device_string(gpu_id, is_worker=True)
            scope_string = 'worker_{}'.format(gpu_id)
            with tf.name_scope(scope_string) as scope:
                with tf.device(worker_device):
                    # set up the network for each of the multiple devices
                    self.app.connect_data_and_network(
                        self.outputs_collector,
                        self.gradients_collector)
                    if self.is_training:
                        # batch norm statistics from the last device
                        bn_ops = tf.get_collection(BN_COLLECTION, scope)

        # assemble all training operations
        if self.is_training and self.gradients_collector:
            updates_op = []
            # batch normalisation moving averages operation
            if bn_ops:
                updates_op.extend(bn_ops)
            # combine them with the model parameter updating operation
            with tf.name_scope('ApplyGradients'):
                with graph.control_dependencies(updates_op):
                    self.app.set_network_gradient_op(
                        self.gradients_collector.gradients)

        # initialisation operation
        with tf.name_scope('Initialization'):
            self._init_op = global_vars_init_or_restore()

        with tf.name_scope('MergedOutputs'):
            self.outputs_collector.finalise_output_op()

        # saving operation
        self.saver = tf.train.Saver(max_to_keep=self.max_checkpoints,
                                    save_relative_paths=True)

    # no more operation definitions after this point
    graph.finalize()
    return graph
def _create_graph(self):
    """
    TensorFlow graph is only created within this function.
    """
    graph = tf.Graph()
    main_device = self._device_string(0, is_worker=False)
    # start constructing the graph, handling training and inference cases
    with graph.as_default(), tf.device(main_device):
        # initialise sampler and network; these are connected in
        # the context of multiple GPUs
        with tf.name_scope('Sampler'):
            self.app.initialise_sampler()
        self.app.initialise_network()

        # for data parallelism --
        # defining and collecting variables from multiple devices
        bn_ops = None
        for gpu_id in range(0, max(self.num_gpus, 1)):
            worker_device = self._device_string(gpu_id, is_worker=True)
            scope_string = 'worker_{}'.format(gpu_id)
            with tf.name_scope(scope_string) as scope:
                with tf.device(worker_device):
                    # set up the network for each of the multiple devices
                    self.app.connect_data_and_network(
                        self.outputs_collector,
                        self.gradients_collector)
                    if self.is_training:
                        # batch norm statistics from the last device
                        bn_ops = tf.get_collection(BN_COLLECTION, scope)

        # assemble all training operations
        if self.is_training and self.gradients_collector:
            updates_op = []
            # batch normalisation moving averages operation
            if bn_ops:
                updates_op.extend(bn_ops)
            # combine them with the model parameter updating operation
            with tf.name_scope('ApplyGradients'):
                with graph.control_dependencies(updates_op):
                    self.app.set_network_update_op(
                        self.gradients_collector.gradients)

        # initialisation operation
        with tf.name_scope('Initialization'):
            self._init_op = global_vars_init_or_restore()

        with tf.name_scope('MergedOutputs'):
            self.outputs_collector.finalise_output_op()

        # saving operation
        self.saver = tf.train.Saver(max_to_keep=self.max_checkpoints)

    # no more operation definitions after this point
    graph.finalize()
    return graph
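# A hedged sketch of how the graph built by _create_graph might be driven.
# `driver`, `num_iterations`, `save_path` and `driver.app.gradient_op` are
# hypothetical names used for illustration only; the real training loop
# lives elsewhere in the application driver.
def _run_training_sketch(driver, num_iterations, save_path):
    graph = driver._create_graph()
    with tf.Session(graph=graph) as sess:
        # _init_op initialises variables, or restores those registered
        # in the RESTORABLE collection from their checkpoints
        sess.run(driver._init_op)
        for _ in range(num_iterations):
            # the update op assembled under the 'ApplyGradients' scope,
            # assumed here to be exposed by the application object
            sess.run(driver.app.gradient_op)
        driver.saver.save(sess, save_path, global_step=num_iterations)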
def rand_init_model(self, _sender, **_unused):
    """
    Randomly initialise all trainable variables defined in the
    default graph, running the initialisation op in the default session.

    :param _sender:
    :param _unused:
    :return:
    """
    with tf.name_scope('Initialisation'):
        init_op = global_vars_init_or_restore()
    tf.get_default_session().run(init_op)
    tf.logging.info('Parameters from random initialisations ...')
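# For context, global_vars_init_or_restore is assumed to behave roughly as
# sketched below: variables covered by a (var_scope, checkpoint,
# checkpoint_scope) entry in the RESTORABLE collection are assigned values
# loaded from that checkpoint, and every other global variable falls back
# to its regular initialiser. This is a simplified sketch of the assumed
# semantics, not the actual implementation.
def _init_or_restore_sketch():
    restore_spec = {}  # var op name -> (checkpoint, var_scope, checkpoint_scope)
    for var_scope, checkpoint, checkpoint_scope in tf.get_collection(RESTORABLE):
        for var in tf.global_variables():
            if var.op.name.startswith(var_scope):
                restore_spec[var.op.name] = \
                    (checkpoint, var_scope, checkpoint_scope)
    init_ops = []
    for var in tf.global_variables():
        if var.op.name in restore_spec:
            checkpoint, var_scope, checkpoint_scope = restore_spec[var.op.name]
            # map the in-graph name onto the name used in the checkpoint,
            # e.g. 'foo/conv_/w' -> 'bar/conv_/w' for a spec ('foo', ..., 'bar')
            stored_name = checkpoint_scope + var.op.name[len(var_scope):]
            value = tf.contrib.framework.load_variable(checkpoint, stored_name)
            init_ops.append(var.assign(value))
        else:
            init_ops.append(var.initializer)
    return tf.group(*init_ops)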
def test_no_restores(self):
    tf.reset_default_graph()
    block1 = ConvolutionalLayer(4, 3,
                                name='bar',
                                with_bn=False,
                                w_initializer=tf.constant_initializer(1.))
    b2 = block1(tf.ones([1, 5, 5, 1]))
    init_op = global_vars_init_or_restore()
    all_vars = tf.global_variables()
    with self.test_session() as sess:
        sess.run(init_op)

        def getvar(x):
            return [v for v in all_vars if v.name == x][0]

        bar_w_var = getvar(block1.layer_scope().name + '/conv_/w:0')
        [bar_w] = sess.run([bar_w_var])
        self.assertAllClose(bar_w, np.ones([3, 3, 1, 4]))
def test_restore_block(self):
    definition = {
        'foo': [1],
        'bar/conv_/w': np.random.randn(3, 3, 1, 3),
        'bar2/conv_/w': np.random.randn(3, 3, 1, 3),
        'foo3/conv_/w': np.random.randn(3, 3, 1, 3),
        'bar/bing/boffin': [2]
    }
    checkpoint_name = self.make_checkpoint('chk1', definition)
    tf.reset_default_graph()

    block1 = ConvolutionalLayer(3, 3, with_bn=False, name='foo')
    b1 = block1(tf.ones([1, 5, 5, 1]))
    tf.add_to_collection(RESTORABLE, ('foo', checkpoint_name, 'bar'))

    block2 = ConvolutionalLayer(4, 3,
                                name='bar',
                                with_bn=False,
                                w_initializer=tf.constant_initializer(1.))
    b2 = block2(tf.ones([1, 5, 5, 1]))

    block3 = ConvolutionalLayer(3, 3, with_bn=False, name='foo2')
    block3.restore_from_checkpoint(checkpoint_name, 'bar2')
    b3 = block3(tf.ones([1, 5, 5, 1]))

    block4 = ConvolutionalLayer(3, 3, with_bn=False, name='foo3')
    block4.restore_from_checkpoint(checkpoint_name)
    b4 = block4(tf.ones([1, 5, 5, 1]))

    # the same restore spec as above is registered a second time;
    # duplicate registrations are tolerated
    tf.add_to_collection(RESTORABLE, ('foo', checkpoint_name, 'bar'))

    init_op = global_vars_init_or_restore()
    all_vars = tf.global_variables()
    with self.test_session() as sess:
        sess.run(init_op)

        def getvar(x):
            return [v for v in all_vars if v.name == x][0]

        foo_w_var = getvar(block1.layer_scope().name + '/conv_/w:0')
        bar_w_var = getvar(block2.layer_scope().name + '/conv_/w:0')
        foo2_w_var = getvar(block3.layer_scope().name + '/conv_/w:0')
        foo3_w_var = getvar(block4.layer_scope().name + '/conv_/w:0')
        w_vars = [foo_w_var, bar_w_var, foo2_w_var, foo3_w_var]
        [foo_w, bar_w, foo2_w, foo3_w] = sess.run(w_vars)
        self.assertAllClose(foo_w, definition['bar/conv_/w'])
        self.assertAllClose(bar_w, np.ones([3, 3, 1, 4]))
        self.assertAllClose(foo2_w, definition['bar2/conv_/w'])
        self.assertAllClose(foo3_w, definition['foo3/conv_/w'])
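# The restore tests above rely on a make_checkpoint fixture that is not
# shown in this section. A minimal sketch, assuming it writes each entry
# of `definition` as a float32 variable into a throwaway graph and returns
# the saved checkpoint path (self.get_temp_dir() comes from tf.test.TestCase):
import os

def make_checkpoint(self, name, definition):
    with tf.Graph().as_default():
        for var_name, value in definition.items():
            # slashes in the name produce the nested checkpoint keys
            # expected by the tests, e.g. 'bar/conv_/w'
            tf.Variable(tf.constant(value, dtype=tf.float32), name=var_name)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # saver.save returns the checkpoint path prefix the tests use
            return saver.save(sess, os.path.join(self.get_temp_dir(), name))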