def testWaitForSessionLocalInit(self):
    server = tf.train.Server.create_local_server()
    with tf.Graph().as_default() as graph:
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        sm = tf.train.SessionManager(
            graph=graph,
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=tf.report_uninitialized_variables(
                tf.all_variables()),
            local_init_op=w.initializer)

        # Initialize v but not w.
        s = tf.Session(server.target, graph=graph)
        s.run(v.initializer)

        sess = sm.wait_for_session(server.target, max_wait_secs=3)
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
        self.assertEqual(1, sess.run(v))
        self.assertEqual(1, sess.run(w))
def testPrepareSessionWithReadyForLocalInitOp(self):
    with tf.Graph().as_default():
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(v).eval())
            self.assertEqual(False, tf.is_variable_initialized(w).eval())
        sm2 = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=tf.report_uninitialized_variables(
                tf.all_variables()),
            local_init_op=w.initializer)
        sess = sm2.prepare_session("", init_op=v.initializer)
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
        self.assertEqual(1, sess.run(v))
        self.assertEqual(1, sess.run(w))
def testRecoverSession(self):
    # Create a checkpoint.
    checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
    try:
        gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
        pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with tf.Graph().as_default():
        v = tf.Variable(1, name="v")
        sm = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables())
        saver = tf.train.Saver({"v": v})
        sess, initialized = sm.recover_session(
            "", saver=saver, checkpoint_dir=checkpoint_dir)
        self.assertFalse(initialized)
        sess.run(v.initializer)
        self.assertEqual(1, sess.run(v))
        saver.save(sess,
                   os.path.join(checkpoint_dir, "recover_session_checkpoint"))

    # Create a new Graph and SessionManager and recover.
    with tf.Graph().as_default():
        v = tf.Variable(2, name="v")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(v).eval())
        sm2 = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables())
        saver = tf.train.Saver({"v": v})
        sess, initialized = sm2.recover_session(
            "", saver=saver, checkpoint_dir=checkpoint_dir)
        self.assertTrue(initialized)
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
        self.assertEqual(1, sess.run(v))
def initializeOrRestore(self):
    self.ckptDir = os.path.join(self.checkpoint_dir, self.dataset.name)
    self.ckptPrefix = os.path.join(self.ckptDir, self.name, self.name)
    vgg_ckpt_file = os.path.join(self.ckptDir, 'vgg_16', 'vgg_16.ckpt')
    mt_ckpt_file = layers.latest_checkpoint(os.path.join(self.ckptDir, 'mt'))
    # ckpt_file = layers.latest_checkpoint(os.path.join(self.ckptDir, 'vgg_16', 'vgg_16.ckpt'))
    globalVars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

    if vgg_ckpt_file is not None and tf.train.checkpoint_exists(vgg_ckpt_file):
        varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(
            vgg_ckpt_file, globalVars)
        if len(varsInCkpt) != 0:
            restorationSaver = tf.train.Saver(varsInCkpt)
            self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
            restorationSaver.restore(self.sess, vgg_ckpt_file)
    else:
        varsNotInCkpt = globalVars

    if mt_ckpt_file is not None and tf.train.checkpoint_exists(mt_ckpt_file):
        varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(
            mt_ckpt_file, varsNotInCkpt)
        varsInCkpt, varsNotInCkpt = layers.replaceVarInListsByName(
            varsInCkpt, varsNotInCkpt, 'fc6')
        if len(varsInCkpt) != 0:
            restorationSaver = tf.train.Saver(varsInCkpt)
            self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
            restorationSaver.restore(self.sess, mt_ckpt_file)
    else:
        varsNotInCkpt = globalVars

    self.saver = tf.train.Saver()
    self.sess.run(tf.group(tf.variables_initializer(varsNotInCkpt),
                           tf.local_variables_initializer()))
def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
    # which causes recover_session to not run local_init_op, and to return
    # initialized=False.

    # Create a checkpoint.
    checkpoint_dir = os.path.join(
        self.get_temp_dir(),
        "recover_session_ready_for_local_init_fails_to_ready_local")
    try:
        gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
        pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with tf.Graph().as_default():
        v = tf.Variable(1, name="v")
        sm = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables())
        saver = tf.train.Saver({"v": v})
        sess, initialized = sm.recover_session(
            "", saver=saver, checkpoint_dir=checkpoint_dir)
        self.assertFalse(initialized)
        sess.run(v.initializer)
        self.assertEqual(1, sess.run(v))
        saver.save(sess,
                   os.path.join(checkpoint_dir, "recover_session_checkpoint"))

    # Create a new Graph and SessionManager and recover.
    with tf.Graph().as_default():
        v = tf.Variable(2, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(v).eval())
            self.assertEqual(False, tf.is_variable_initialized(w).eval())
        sm2 = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=tf.report_uninitialized_variables(),
            local_init_op=w.initializer)
        saver = tf.train.Saver({"v": v})
        sess, initialized = sm2.recover_session(
            "", saver=saver, checkpoint_dir=checkpoint_dir)
        self.assertFalse(initialized)
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
        self.assertEqual(
            False,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
        self.assertEqual(1, sess.run(v))
def testPrepareSessionFails(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
    checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
    try:
        gfile.DeleteRecursively(checkpoint_dir)
        gfile.DeleteRecursively(checkpoint_dir2)
    except OSError:
        pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with tf.Graph().as_default():
        v = tf.Variable([1.0, 2.0, 3.0], name="v")
        sm = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables())
        saver = tf.train.Saver({"v": v})
        sess = sm.prepare_session(
            "",
            init_op=tf.initialize_all_variables(),
            saver=saver,
            checkpoint_dir=checkpoint_dir)
        self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
        checkpoint_filename = os.path.join(checkpoint_dir,
                                           "prepare_session_checkpoint")
        saver.save(sess, checkpoint_filename)

    # Create a new Graph and SessionManager and recover.
    with tf.Graph().as_default():
        # Renames the checkpoint directory.
        os.rename(checkpoint_dir, checkpoint_dir2)
        gfile.MakeDirs(checkpoint_dir)
        v = tf.Variable([6.0, 7.0, 8.0], name="v")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(v).eval())
        tf.train.SessionManager(ready_op=tf.report_uninitialized_variables())
        saver = tf.train.Saver({"v": v})
        # This should fail as there's no checkpoint within 2 seconds.
        with self.assertRaisesRegexp(RuntimeError,
                                     "no init_op or init_fn was given"):
            sess = sm.prepare_session(
                "",
                init_op=None,
                saver=saver,
                checkpoint_dir=checkpoint_dir,
                wait_for_checkpoint=True,
                max_wait_secs=2)
        # Rename the checkpoint directory back.
        gfile.DeleteRecursively(checkpoint_dir)
        os.rename(checkpoint_dir2, checkpoint_dir)
        # This should succeed as there's a checkpoint.
        sess = sm.prepare_session(
            "",
            init_op=None,
            saver=saver,
            checkpoint_dir=checkpoint_dir,
            wait_for_checkpoint=True,
            max_wait_secs=2)
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
def testPrepareSessionSucceedsWithInitFeedDict(self):
    with tf.Graph().as_default():
        p = tf.placeholder(tf.float32, shape=(3,))
        v = tf.Variable(p, name="v")
        sm = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables())
        sess = sm.prepare_session(
            "",
            init_op=tf.initialize_all_variables(),
            init_feed_dict={p: [1.0, 2.0, 3.0]})
        self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
def guarantee_initialized_variables(self, session, list_of_variables=None):
    if list_of_variables is None:
        list_of_variables = tf.all_variables()
    # report_uninitialized_variables() returns the *names* of the variables
    # (as bytes), so map the names back to the variable objects rather than
    # calling tf.get_variable(), which would try to create new variables.
    name_to_var = {v.op.name: v for v in list_of_variables}
    uninitialized_variables = [
        name_to_var[name.decode('utf-8')]
        for name in session.run(
            tf.report_uninitialized_variables(list_of_variables))
    ]
    session.run(tf.initialize_variables(uninitialized_variables))
    return uninitialized_variables
def test_restore_fn_classification(self):
    # Define mock tensorflow classification graph and save variables.
    test_graph_classification = tf.Graph()
    with test_graph_classification.as_default():
        image = tf.placeholder(dtype=tf.float32, shape=[1, 20, 20, 3])
        with tf.variable_scope('mock_model'):
            net = slim.conv2d(image, num_outputs=32, kernel_size=1,
                              scope='layer1')
            slim.conv2d(net, num_outputs=3, kernel_size=1, scope='layer2')

        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver()
        save_path = self.get_temp_dir()
        with self.test_session() as sess:
            sess.run(init_op)
            saved_model_path = saver.save(sess, save_path)

    # Create tensorflow detection graph and load variables from
    # classification checkpoint.
    test_graph_detection = tf.Graph()
    with test_graph_detection.as_default():
        inputs_shape = [2, 2, 2, 3]
        inputs = tf.to_float(tf.random_uniform(
            inputs_shape, minval=0, maxval=255, dtype=tf.int32))
        preprocessed_inputs = self._model.preprocess(inputs)
        prediction_dict = self._model.predict(preprocessed_inputs)
        self._model.postprocess(prediction_dict)
        restore_fn = self._model.restore_fn(saved_model_path,
                                            from_detection_checkpoint=False)
        with self.test_session() as sess:
            restore_fn(sess)
            # report_uninitialized_variables() yields the variable *names*,
            # so compare against the name itself, not a .name attribute.
            for var in sess.run(tf.report_uninitialized_variables()):
                self.assertNotIn('FeatureExtractor', var)
def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
    # This test checks for backwards compatibility.
    # In particular, we continue to ensure that recover_session will execute
    # local_init_op exactly once, regardless of whether the session was
    # successfully recovered.
    with tf.Graph().as_default():
        w = tf.Variable(
            1,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(w).eval())
        sm2 = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=None,
            local_init_op=w.initializer)

        # Try to recover session from None.
        sess, initialized = sm2.recover_session(
            "", saver=None, checkpoint_dir=None)

        # Succeeds because recover_session still runs local_init_op.
        self.assertFalse(initialized)
        self.assertEqual(
            True,
            tf.is_variable_initialized(
                sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
        self.assertEqual(1, sess.run(w))
def _find_initializable_tensors(initializables, session):
    for_reports = []
    status_tensors = []
    boolean_tensors = []

    for v in initializables:
        if isinstance(v, (tuple, list)):
            status_tensors.append(v[0])
            boolean_tensors.append(v[1])
        # TODO(@awav): TensorFlow iterators have to be skipped during
        # auto-initialization until TensorFlow issue #14633 is resolved.
        elif isinstance(v, tf.data.Iterator):
            continue
        else:
            for_reports.append(v)

    if for_reports:
        uninitialized = tf.report_uninitialized_variables(var_list=for_reports)

        def uninitialized_names():
            for uv in session.run(uninitialized):
                yield uv.decode('utf-8')

        names = set(uninitialized_names())
        for v in for_reports:
            if v.name.split(':')[0] in names:
                yield v

    if boolean_tensors:
        stats = session.run(boolean_tensors)
        length = len(stats)
        for i in range(length):
            if not stats[i]:
                yield status_tensors[i]
def parameter_server():
    with tf.device("/job:ps/task:0"):
        var = tf.Variable(0.0, name='var')

    server = tf.train.Server(cluster, job_name="ps", task_index=0)
    sess = tf.Session(target=server.target)

    print("*" * 40)
    print(server.target)
    print("*" * 40)

    for i in range(5):
        print("Parameter server: sleeping...")
        sleep(1)

    print("Parameter server: waiting for cluster connection...")
    sess.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    sess.run(tf.global_variables_initializer())
    print("Parameter server: variables initialized")

    for i in range(5):
        val = sess.run(var)
        print("Parameter server: var has value %.1f" % val)
        sleep(1.0)

    print("Parameter server: blocking...")
    server.join()
def testPrepareSessionSucceedsWithInitFn(self):
    with tf.Graph().as_default():
        v = tf.Variable([125], name="v")
        sm = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables())
        sess = sm.prepare_session(
            "", init_fn=lambda sess: sess.run(v.initializer))
        self.assertAllClose([125], sess.run(v))
def testWaitForSessionReturnsNoneAfterTimeout(self):
    with tf.Graph().as_default():
        tf.Variable(1, name="v")
        sm = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables(),
            recovery_wait_secs=1)
        # Set max_wait_secs to allow us to try a few times.
        with self.assertRaises(errors.DeadlineExceededError):
            sm.wait_for_session(master="", max_wait_secs=3)
def testAssertVariablesInitialized(self):
    with tf.Graph().as_default(), self.test_session() as sess:
        v = tf.Variable([1, 2], name="v")
        w = tf.Variable([3, 4], name="w")
        _ = v, w
        uninited = tf.report_uninitialized_variables()
        self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
        tf.initialize_all_variables().run()
        self.assertEqual(0, sess.run(uninited).size)
def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    with tf.Graph().as_default() as graph:
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        sm = tf.train.SessionManager(
            graph=graph,
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=tf.report_uninitialized_variables(),
            local_init_op=w.initializer)

    with self.assertRaises(tf.errors.DeadlineExceededError):
        # Times out because w fails to be initialized, due to the overly
        # restrictive ready_for_local_init_op.
        sm.wait_for_session("", max_wait_secs=3)
def testInitWithNoneLocalInitOpError(self):
    # Creating a SessionManager with a None local_init_op but a
    # non-None ready_for_local_init_op raises ValueError.
    with self.assertRaisesRegexp(
            ValueError,
            "If you pass a ready_for_local_init_op "
            "you must also pass a local_init_op "):
        tf.train.SessionManager(
            ready_for_local_init_op=tf.report_uninitialized_variables(
                tf.all_variables()),
            local_init_op=None)
def testVariableList(self):
    with tf.Graph().as_default(), self.test_session() as sess:
        v = tf.Variable([1, 2], name="v")
        w = tf.Variable([3, 4], name="w")
        uninited = tf.report_uninitialized_variables()
        self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
        sess.run(w.initializer)
        self.assertAllEqual(np.array([b"v"]), sess.run(uninited))
        v.initializer.run()
        self.assertEqual(0, sess.run(uninited).size)
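# A minimal sketch (not from the test suite above) of what
# tf.report_uninitialized_variables() actually returns: a 1-D string tensor
# of variable names, which sess.run() materializes as numpy bytes objects.
# Assumes TensorFlow 1.x and a fresh default graph.
import tensorflow as tf

v = tf.Variable([1, 2], name="v")
w = tf.Variable([3, 4], name="w")
uninited = tf.report_uninitialized_variables()
with tf.Session() as sess:
    print([n.decode("utf-8") for n in sess.run(uninited)])  # ['v', 'w']
    sess.run(v.initializer)
    print([n.decode("utf-8") for n in sess.run(uninited)])  # ['w']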
def testPrepareSessionWithReadyNotReadyForLocal(self):
    with tf.Graph().as_default():
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(v).eval())
            self.assertEqual(False, tf.is_variable_initialized(w).eval())
        sm2 = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=tf.report_uninitialized_variables(
                tf.all_variables()),
            local_init_op=w.initializer)
        with self.assertRaisesRegexp(
                RuntimeError,
                "Init operations did not make model ready for local_init"):
            sm2.prepare_session("", init_op=None)
def test_restore_fn_detection(self):
    init_op = tf.global_variables_initializer()
    saver = tf_saver.Saver()
    save_path = self.get_temp_dir()
    with self.test_session() as sess:
        sess.run(init_op)
        saved_model_path = saver.save(sess, save_path)
        restore_fn = self._model.restore_fn(saved_model_path,
                                            from_detection_checkpoint=True)
        restore_fn(sess)
        # report_uninitialized_variables() returns names, not variables.
        for var in sess.run(tf.report_uninitialized_variables()):
            self.assertNotIn('FeatureExtractor', var)
def get_distributed_session_creator(server):
    """
    Args:
        server (tf.train.Server):

    Returns:
        tf.train.SessionCreator
    """
    server_def = server.server_def
    is_chief = (server_def.job_name == 'worker') and (server_def.task_index == 0)

    init_op = tf.global_variables_initializer()
    local_init_op = tf.local_variables_initializer()
    ready_op = tf.report_uninitialized_variables()
    ready_for_local_init_op = tf.report_uninitialized_variables(
        tf.global_variables())
    sm = tf.train.SessionManager(
        local_init_op=local_init_op,
        ready_op=ready_op,
        ready_for_local_init_op=ready_for_local_init_op,
        graph=tf.get_default_graph())

    # to debug wrong variable collection
    # from pprint import pprint
    # print("GLOBAL:")
    # pprint([(k.name, k.device) for k in tf.global_variables()])
    # print("LOCAL:")
    # pprint([(k.name, k.device) for k in tf.local_variables()])

    class _Creator(tf.train.SessionCreator):
        def create_session(self):
            if is_chief:
                return sm.prepare_session(master=server.target, init_op=init_op)
            else:
                tf.logging.set_verbosity(tf.logging.INFO)  # print message about uninitialized vars
                ret = sm.wait_for_session(master=server.target)
                tf.logging.set_verbosity(tf.logging.WARN)
                return ret

    return _Creator()
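# A hedged usage sketch for get_distributed_session_creator() above. The
# cluster spec, addresses, job name, and task index are illustrative
# assumptions, not taken from the original source.
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ["localhost:2222"],
    "worker": ["localhost:2223", "localhost:2224"],
})
server = tf.train.Server(cluster, job_name="worker", task_index=0)
# ... build the model graph here ...
creator = get_distributed_session_creator(server)
sess = creator.create_session()  # chief runs prepare_session(); others wait_for_session()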
def test_restore_map_for_detection_ckpt(self):
    init_op = tf.global_variables_initializer()
    saver = tf_saver.Saver()
    save_path = self.get_temp_dir()
    with self.test_session() as sess:
        sess.run(init_op)
        saved_model_path = saver.save(sess, save_path)
        var_map = self._model.restore_map(from_detection_checkpoint=True)
        self.assertIsInstance(var_map, dict)
        saver = tf.train.Saver(var_map)
        saver.restore(sess, saved_model_path)
        # report_uninitialized_variables() returns names, not variables.
        for var in sess.run(tf.report_uninitialized_variables()):
            self.assertNotIn('FeatureExtractor', var)
def test_evaluate_ready_for_local_init(self):
    with tf.Graph().as_default() as g, self.test_session(g):
        tf.contrib.framework.create_global_step()
        v = variables.Variable(1.0)
        w = variables.Variable(
            v + 1, collections=[ops.GraphKeys.LOCAL_VARIABLES], trainable=False)
        ready_for_local_init_op = tf.report_uninitialized_variables(
            tf.global_variables())
        ops.add_to_collection(ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
                              ready_for_local_init_op)
        _ = learn.graph_actions.evaluate(
            g,
            output_dir=self._output_dir,
            checkpoint_path=None,
            eval_dict={'a': v},
            max_steps=1)
def worker(worker_n):
    with tf.device("/job:ps/task:0"):
        var = tf.Variable(0.0, name='var')

    server = tf.train.Server(cluster, job_name="worker", task_index=worker_n)
    sess = tf.Session(target=server.target)

    print("Worker %d: waiting for cluster connection..." % worker_n)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % worker_n)

    # Loop until no variables are reported as uninitialized; len() avoids
    # relying on the truth value of a numpy array.
    while len(sess.run(tf.report_uninitialized_variables())) > 0:
        print("Worker %d: waiting for variable initialization..." % worker_n)
        sleep(1.0)
    print("Worker %d: variables initialized" % worker_n)

    for i in range(5):
        print("Worker %d: incrementing var" % worker_n)
        sess.run(var.assign_add(1.0))
        sleep(1.0)

    print("Worker %d: blocking..." % worker_n)
    server.join()
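# The parameter_server() and worker() snippets above reference a module-level
# `cluster` and `sleep` that are not shown. A plausible setup, assuming one PS
# task and two workers on localhost (the ports are illustrative):
from time import sleep
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ["localhost:2222"],
    "worker": ["localhost:2223", "localhost:2224"],
})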
def guarantee_initialized_variables(session, variables=None):
    """Guarantee that all the specified variables are initialized.

    If a variable is already initialized, leave it alone. Otherwise,
    initialize it.

    If no variables are specified, checks all variables in the default graph.

    Args:
        variables (list[tf.Variable])
    """
    name_to_var = {v.op.name: v
                   for v in tf.global_variables() + tf.local_variables()}
    # The reported names come back as bytes under Python 3.
    uninitialized_variables = list(
        name_to_var[name.decode('utf-8')] for name in
        session.run(tf.report_uninitialized_variables(variables)))
    init_op = tf.variables_initializer(uninitialized_variables)
    session.run(init_op)
    return uninitialized_variables
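# A minimal usage sketch for guarantee_initialized_variables() above, assuming
# TensorFlow 1.x and a fresh default graph; the variable names are illustrative.
import tensorflow as tf

a = tf.Variable(1.0, name="a")
b = tf.Variable(2.0, name="b")
with tf.Session() as sess:
    sess.run(a.initializer)                               # a is initialized by hand
    newly_inited = guarantee_initialized_variables(sess)  # initializes only b
    print([v.op.name for v in newly_inited])              # ['b']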
def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
    with tf.Graph().as_default() as graph:
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        sm = tf.train.SessionManager(
            graph=graph,
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=None,
            local_init_op=w.initializer)
    with self.assertRaisesRegexp(tf.errors.FailedPreconditionError,
                                 "Attempting to use uninitialized value v"):
        sm.wait_for_session("", max_wait_secs=3)
def initializeOrRestore(self):
    self.ckptDir = os.path.join(self.checkpoint_dir, self.dataset.name,
                                self.name)
    self.ckptPrefix = os.path.join(self.ckptDir, self.name)
    globalVars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    ckpt_file = layers.latest_checkpoint(self.ckptDir, "checkpoint")

    if ckpt_file is not None and tf.train.checkpoint_exists(ckpt_file):
        varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(
            ckpt_file, globalVars)
        if len(varsInCkpt) != 0:
            restorationSaver = tf.train.Saver(varsInCkpt)
            self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
            restorationSaver.restore(self.sess, ckpt_file)
    else:
        varsNotInCkpt = globalVars

    self.saver = tf.train.Saver()
    self.sess.run(tf.group(tf.variables_initializer(varsNotInCkpt),
                           tf.local_variables_initializer()))
def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
    with tf.Graph().as_default():
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        with self.test_session():
            self.assertEqual(False, tf.is_variable_initialized(v).eval())
            self.assertEqual(False, tf.is_variable_initialized(w).eval())
        sm2 = tf.train.SessionManager(
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=None,
            local_init_op=w.initializer)
        with self.assertRaisesRegexp(tf.errors.FailedPreconditionError,
                                     "Attempting to use uninitialized value v"):
            sm2.prepare_session("", init_op=None)
def test_restore_map_for_classification_ckpt(self, use_keras):
    # Define mock tensorflow classification graph and save variables.
    test_graph_classification = tf.Graph()
    with test_graph_classification.as_default():
        image = tf.placeholder(dtype=tf.float32, shape=[1, 20, 20, 3])
        if use_keras:
            with tf.name_scope('mock_model'):
                layer_one = keras.Conv2D(32, kernel_size=1, name='layer1')
                net = layer_one(image)
                layer_two = keras.Conv2D(3, kernel_size=1, name='layer2')
                layer_two(net)
        else:
            with tf.variable_scope('mock_model'):
                net = slim.conv2d(image, num_outputs=32, kernel_size=1,
                                  scope='layer1')
                slim.conv2d(net, num_outputs=3, kernel_size=1, scope='layer2')

        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver()
        save_path = self.get_temp_dir()
        with self.test_session(graph=test_graph_classification) as sess:
            sess.run(init_op)
            saved_model_path = saver.save(sess, save_path)

    # Create tensorflow detection graph and load variables from
    # classification checkpoint.
    test_graph_detection = tf.Graph()
    with test_graph_detection.as_default():
        model, _, _, _ = self._create_model(use_keras=use_keras)
        inputs_shape = [2, 2, 2, 3]
        inputs = tf.to_float(tf.random_uniform(
            inputs_shape, minval=0, maxval=255, dtype=tf.int32))
        preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
        prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
        model.postprocess(prediction_dict, true_image_shapes)
        another_variable = tf.Variable([17.0], name='another_variable')  # pylint: disable=unused-variable
        var_map = model.restore_map(fine_tune_checkpoint_type='classification')
        self.assertNotIn('another_variable', var_map)
        self.assertIsInstance(var_map, dict)
        saver = tf.train.Saver(var_map)
        with self.test_session(graph=test_graph_detection) as sess:
            saver.restore(sess, saved_model_path)
            for var in sess.run(tf.report_uninitialized_variables()):
                self.assertNotIn('FeatureExtractor', var)
def test_restore_map_for_detection_ckpt(self, use_keras):
    model, _, _, _ = self._create_model(use_keras=use_keras)
    model.predict(
        tf.constant(np.array([[[[0, 0], [1, 1]], [[1, 0], [0, 1]]]],
                             dtype=np.float32)),
        true_image_shapes=None)
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()
    save_path = self.get_temp_dir()
    with self.test_session() as sess:
        sess.run(init_op)
        saved_model_path = saver.save(sess, save_path)
        var_map = model.restore_map(
            fine_tune_checkpoint_type='detection',
            load_all_detection_checkpoint_vars=False)
        self.assertIsInstance(var_map, dict)
        saver = tf.train.Saver(var_map)
        saver.restore(sess, saved_model_path)
        for var in sess.run(tf.report_uninitialized_variables()):
            self.assertNotIn('FeatureExtractor', var)
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()

    saver = FastSaver(variables_to_save)
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info(' %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    def get_init_fn():
        if args.checkpoint_path is None:
            return lambda sess: init_fn(sess)

        # Warn the user if a checkpoint exists in the train_dir. Then we'll be
        # ignoring the checkpoint anyway.
        train_dir = os.path.join(args.log_dir, 'train')
        if tf.train.latest_checkpoint(train_dir):
            logger.info(
                'Ignoring --checkpoint_path because a checkpoint already exists in %s'
                % train_dir)
            return lambda sess: init_fn(sess)

        exclusions = []
        if args.checkpoint_exclude_scopes:
            exclusions = [
                scope.strip()
                for scope in args.checkpoint_exclude_scopes.split(',')
            ]

        variables_to_restore = []
        for var in variables_to_save:  # tf.contrib.framework.get_model_variables():
            for exclusion in exclusions:
                if var.op.name.startswith(exclusion):
                    break
            else:
                variables_to_restore.append(var)

        if tf.gfile.IsDirectory(args.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(args.checkpoint_path)
        else:
            checkpoint_path = args.checkpoint_path

        print(variables_to_restore)
        logger.info('Fine-tuning from %s' % checkpoint_path)
        return tf.contrib.framework.assign_from_checkpoint_fn(
            checkpoint_path,
            variables_to_restore,
            ignore_missing_vars=args.ignore_missing_vars)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=get_init_fn(),
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect "
        "to the parameter server. One common cause is that the parameter server "
        "DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or
                                        global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def build_graph(self, features, labels, mode, params):
    """docstring."""
    del labels, params

    misc_utils.print_out("Running fast mode_fn")

    hparams = self.hparams

    # Create global_step
    tf.train.get_or_create_global_step()

    if mode == tf.contrib.learn.ModeKeys.INFER:
        # Doing inference only on one GPU
        inf_hparams = tf.contrib.training.HParams(**hparams.values())
        inf_hparams.set_hparam("num_gpus", 1)
        # Inference is done in fp32 and in the same way as that of dist_strategy.
        inf_hparams.set_hparam("use_fp16", False)
        misc_utils.print_out("inference hparams:")
        misc_utils.print_hparams(inf_hparams)

        # Create variable_mgr
        var_mgr = self._get_variable_mgr(inf_hparams)

        with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                "tower_0"), var_mgr.create_outer_variable_scope(0):
            model = gnmt_model.GNMTModel(inf_hparams, mode=mode,
                                         features=features)
            sample_ids = model.sample_id
            reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
            sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(sample_ids))
            # make sure outputs is of shape [batch_size, time] or [beam_width,
            # batch_size, time] when using beam search.
            if inf_hparams.time_major:
                sample_words = tf.transpose(sample_words)
            elif sample_words.shape.ndims == 3:
                # beam search output in [batch_size, time, beam_width] shape.
                sample_words = tf.transpose(sample_words, [2, 0, 1])
            predictions = {"predictions": sample_words}

            # return loss, vars, grads, predictions, train_op, scaffold
            return None, None, None, predictions, None, None

    elif mode == tf.contrib.learn.ModeKeys.TRAIN:
        num_towers = hparams.num_gpus

        # Shard inputs
        tower_features = self._shard_inputs(features, num_towers)
        # Create loss scale vars if necessary
        loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars()

        # Create variable_mgr
        var_mgr = self._get_variable_mgr(hparams)

        # Build per-tower fprop and bprop
        devices = var_mgr.get_devices()
        tower_gradvars = []
        tower_scopes = []
        var_scopes = []
        train_losses = []
        learning_rates = []
        batch_sizes = []
        opts = []

        def fprop_and_bprop(tid):
            """docstring."""
            model = gnmt_model.GNMTModel(hparams, mode=mode,
                                         features=tower_features[tid])
            # sync training.
            assert model.learning_rate is not None
            # The following handles shouldn't be built in when doing manual
            assert model.grad_norm is None
            assert model.update is None

            tower_loss = model.train_loss
            # Only check loss numerics if in fp16
            if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                tower_loss = tf.check_numerics(
                    tower_loss, "tower_%d has Inf/NaN loss" % tid)
            # Cast to fp32, otherwise would easily overflow.
            tower_loss = tf.to_float(tower_loss)
            var_params, grads, opt = self._compute_tower_grads(
                tower_loss,
                var_mgr.trainable_variables_on_device(tid, tid),
                model.learning_rate,
                use_fp16=hparams.use_fp16,
                loss_scale=loss_scale,
                colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
            self._print_varinfo(var_params, tid)
            res = [model.train_loss, model.learning_rate, model.batch_size]
            res.extend(grads)
            opts.append(opt)
            return res

        def unpack_fprop_and_bprop_output(output):
            train_loss = output[0]
            learning_rate = output[1]
            batch_size = output[2]
            grads = output[3:]
            return train_loss, learning_rate, batch_size, grads

        with mixed_precision_scope():
            for tid in range(num_towers):
                with tf.device(devices[tid % len(devices)]), tf.name_scope(
                        "tower_%s" % tid) as scope:
                    tower_scopes.append(scope)
                    with var_mgr.create_outer_variable_scope(tid) as var_scope:
                        var_scopes.append(var_scope)

                        outputs = maybe_xla_compile(hparams, fprop_and_bprop, tid)
                        (train_loss, learning_rate, batch_size,
                         grads) = unpack_fprop_and_bprop_output(outputs)
                        train_losses.append(train_loss)
                        learning_rates.append(learning_rate)
                        batch_sizes.append(batch_size)
                        var_params = var_mgr.trainable_variables_on_device(
                            tid, tid)
                        tower_gradvars.append(list(zip(grads, var_params)))

        # Add summaries
        if hparams.show_metrics:
            tf.summary.scalar("learning_rate", learning_rates[0])
            if loss_scale:
                tf.summary.scalar("loss_scale", loss_scale)
                if hparams.enable_auto_loss_scale:
                    tf.summary.scalar("loss_scale_normal_steps",
                                      loss_scale_normal_steps)
        misc_utils.print_out("Finish building fprop and per-tower bprop.")

        # Aggregate gradients
        # The following compute the aggregated grads for each tower, stored in
        # opaque grad_states structure.
        apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
            tower_gradvars)
        master_grads = None
        master_params = None
        update_ops = []
        for i, device in enumerate(apply_grads_devices):
            with tf.device(device), tf.name_scope(tower_scopes[i]):
                # Get per-tower grads.
                with tf.name_scope("get_gradients_to_apply"):
                    avg_gradvars = var_mgr.get_gradients_to_apply(i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]

                # gradients post-processing
                with tf.name_scope("clip_gradients"):
                    if hparams.clip_grads:
                        clipped_grads, grad_norm = model_helper.gradient_clip(
                            avg_grads,
                            max_gradient_norm=hparams.max_gradient_norm)
                        # summary the grad on the 1st tower
                        if i == 0 and hparams.show_metrics:
                            tf.summary.scalar("grad_norm", grad_norm)
                            tf.summary.scalar("clipped_grad_norm",
                                              tf.global_norm(clipped_grads))
                    else:
                        clipped_grads = avg_grads
                    if i == 0:
                        master_grads = clipped_grads

                # Build apply-gradients ops
                clipped_gradvars = list(
                    zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                if i == 0:
                    master_params = [gv[1] for gv in avg_gradvars]
                with tf.name_scope("append_gradient_ops"):
                    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                        enable_auto_loss_scale=hparams.enable_auto_loss_scale,
                        loss_scale=loss_scale,
                        loss_scale_normal_steps=loss_scale_normal_steps,
                        inc_loss_scale_every_n=hparams.fp16_inc_loss_scale_every_n,
                        is_chief=True)
                    opt = opts[i]
                    var_mgr.append_apply_gradients_ops(
                        grad_states, opt, clipped_gradvars, update_ops,
                        loss_scale_params)
        misc_utils.print_out("Finish building grad aggregation.")

        assert len(update_ops) == num_towers
        train_op = tf.group(update_ops)
        with tf.control_dependencies([train_op]):
            global_step = tf.train.get_global_step()
            train_op = global_step.assign_add(1)

        # Compute loss on the first gpu
        # TODO(jamesqin): optimize it?
        with tf.device("gpu:0"):
            loss = misc_utils.weighted_avg(train_losses, batch_sizes)

        # Create local init_ops
        # TODO(jamesqin): handle resource variables!
        # At present if not using mirror strategy, not using resource vars.
        local_init_ops = []
        local_init_op = tf.local_variables_initializer()
        with tf.control_dependencies([local_init_op]):
            local_init_ops.append(var_mgr.get_post_init_ops())
        local_init_ops.extend([local_init_op, tf.tables_initializer()])

        saveable_vars = var_mgr.savable_variables()
        # Add saveables for cudnn vars in master tower.
        saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        saveable_objects = [x for x in saveable_objects if "v0" in x.name]

        misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
        for mv in saveable_vars:
            misc_utils.print_out(mv.name)

        misc_utils.print_out(
            "All global trainable vars(%d): " % len(tf.trainable_variables()))
        for tv in tf.trainable_variables():
            misc_utils.print_out(tv.name)

        misc_utils.print_out(
            "All global vars(%d): " % len(tf.global_variables()))
        for gv in tf.global_variables():
            misc_utils.print_out(gv.name)

        misc_utils.print_out(
            "master backproped params(%d): " % len(master_params))
        for mp in master_params:
            misc_utils.print_out(mp.name)

        # Note the cudnn vars are skipped in the init check. :(
        scaffold = tf.train.Scaffold(
            ready_op=tf.report_uninitialized_variables(saveable_vars),
            ready_for_local_init_op=tf.report_uninitialized_variables(
                saveable_vars),
            local_init_op=tf.group(*local_init_ops),
            saver=tf.train.Saver(saveable_vars + saveable_objects,
                                 save_relative_paths=True))

        misc_utils.print_out("Finish building model_fn")
        # return loss, vars, grads, predictions, train_op, scaffold
        return loss, master_params, master_grads, None, train_op, scaffold
def run_worker(args):
    """Starts a worker thread that learns how to play the specified Atari game."""
    cluster_def = get_cluster_def(args.num_threads)
    config = tf.ConfigProto(intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=2)
    server = tf.train.Server(cluster_def, 'thread', args.worker_index,
                             config=config)

    # Configure the supervisor.
    is_chief = args.worker_index == 0
    checkpoint_dir = os.path.join(args.log_dir, 'checkpoint')
    thread_dir = os.path.join(args.log_dir,
                              'thread-{}'.format(args.worker_index))
    summary_writer = tf.summary.FileWriter(thread_dir)
    global_variables_initializer = tf.global_variables_initializer()
    init_fn = lambda sess: sess.run(global_variables_initializer)

    # Initialize the model.
    env = environment.AtariWrapper(args.env_name, environment.TRAINING,
                                   args.action_space)
    player = agent.Agent(args.worker_index, env, args.render,
                         args.num_local_steps, args.learning_rate,
                         args.entropy_regularization, args.max_gradient_norm,
                         args.discount, summary_writer,
                         args.summary_update_interval)

    # Local copies of the model will not be saved.
    model_variables = [
        var for var in tf.global_variables()
        if not var.name.startswith('local')
    ]
    supervisor = tf.train.Supervisor(
        ready_op=tf.report_uninitialized_variables(model_variables),
        is_chief=is_chief,
        init_op=tf.variables_initializer(model_variables),
        logdir=checkpoint_dir,
        summary_op=None,
        saver=tf.train.Saver(model_variables),
        global_step=player.global_step,
        save_summaries_secs=30,
        save_model_secs=30,
        summary_writer=summary_writer,
        init_fn=init_fn)

    config = tf.ConfigProto(device_filters=[
        '/job:master', '/job:thread/task:{}/cpu:0'.format(args.worker_index)
    ])
    LOGGER.info('Starting worker. This may take a while.')
    with supervisor.managed_session(server.target,
                                    config=config) as sess, sess.as_default():
        global_step = 0
        while not supervisor.should_stop() and global_step < args.num_global_steps:
            global_step = player.train(sess)

    supervisor.stop()
    LOGGER.info('Stopped after %d global steps.', player.global_step)
def Worker(index, update_game_num, Synchronizer, cluster, model_path):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    worker = tf.train.Server(cluster, job_name="worker", task_index=index,
                             config=config)
    #config.gpu_options.per_process_gpu_memory_fraction = 0.2
    sess = tf.Session(target=worker.target, config=config)

    mini_net = MiniNetwork(sess, index=index, summary_writer=None,
                           rl_training=True, cluster=cluster,
                           ppo_load_path=FLAGS.restore_model_path,
                           ppo_save_path=model_path,
                           freeze_head=FLAGS.freeze_head,
                           use_bn=FLAGS.use_bn,
                           use_sep_net=FLAGS.use_sep_net,
                           restore_model=FLAGS.restore_model,
                           restore_from=FLAGS.restore_from,
                           restore_to=FLAGS.restore_to)
    global_buffer = Buffer()
    agents = []
    for i in range(THREAD_NUM):
        agent = MiniAgent(agent_id=i, global_buffer=global_buffer,
                          net=mini_net, restore_model=FLAGS.restore_model)
        agents.append(agent)

    print("Worker %d: waiting for cluster connection..." % index)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % index)

    while len(sess.run(tf.report_uninitialized_variables())):
        print("Worker %d: waiting for variable initialization..." % index)
        time.sleep(1)
    print("Worker %d: variables initialized" % index)

    game_num = np.ceil(update_game_num // THREAD_NUM)

    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()
    difficulty = INITIAL_DIFF

    # Run threads
    threads = []
    for i in range(THREAD_NUM - 1):
        t = threading.Thread(target=run_thread,
                             args=(agents[i], game_num, Synchronizer,
                                   difficulty))
        threads.append(t)
        t.daemon = True
        t.start()
        time.sleep(3)

    run_thread(agents[-1], game_num, Synchronizer, difficulty)

    for t in threads:
        t.join()
def __init__(self, graph_path, target_size=(320, 240), tf_config=None):
    self.target_size = target_size

    # load graph
    logger.info('loading graph from %s(default size=%dx%d)'
                % (graph_path, target_size[0], target_size[1]))
    with tf.gfile.GFile(graph_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    self.graph = tf.get_default_graph()
    tf.import_graph_def(graph_def, name='TfPoseEstimator')
    self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)

    # for op in self.graph.get_operations():
    #     print(op.name)
    # for ts in [n.name for n in tf.get_default_graph().as_graph_def().node]:
    #     print(ts)

    self.tensor_image = self.graph.get_tensor_by_name('TfPoseEstimator/image:0')
    self.tensor_output = self.graph.get_tensor_by_name(
        'TfPoseEstimator/Openpose/concat_stage7:0')
    self.tensor_heatMat = self.tensor_output[:, :, :, :19]
    self.tensor_pafMat = self.tensor_output[:, :, :, 19:]
    self.upsample_size = tf.placeholder(dtype=tf.int32, shape=(2,),
                                        name='upsample_size')
    self.tensor_heatMat_up = tf.image.resize_area(
        self.tensor_output[:, :, :, :19], self.upsample_size,
        align_corners=False, name='upsample_heatmat')
    self.tensor_pafMat_up = tf.image.resize_area(
        self.tensor_output[:, :, :, 19:], self.upsample_size,
        align_corners=False, name='upsample_pafmat')
    smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0)
    gaussian_heatMat = smoother.get_output()

    max_pooled_in_tensor = tf.nn.pool(gaussian_heatMat, window_shape=(3, 3),
                                      pooling_type='MAX', padding='SAME')
    self.tensor_peaks = tf.where(
        tf.equal(gaussian_heatMat, max_pooled_in_tensor), gaussian_heatMat,
        tf.zeros_like(gaussian_heatMat))

    self.heatMat = self.pafMat = None

    # warm-up
    self.persistent_sess.run(
        tf.variables_initializer([
            v for v in tf.global_variables()
            if v.name.split(':')[0] in [
                x.decode('utf-8') for x in self.persistent_sess.run(
                    tf.report_uninitialized_variables())
            ]
        ]))
    self.persistent_sess.run(
        [self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up],
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ],
            self.upsample_size: [target_size[1], target_size[0]]
        })
    self.persistent_sess.run(
        [self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up],
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ],
            self.upsample_size: [target_size[1] // 2, target_size[0] // 2]
        })
    self.persistent_sess.run(
        [self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up],
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ],
            self.upsample_size: [target_size[1] // 4, target_size[0] // 4]
        })
def __init__(self, patch_size, dataset, devices, train_vols, test_vols, name=None):
    self.name = name
    self.summaries = []
    self.devices = devices
    self.patch_size = patch_size
    self.padded_patch_size = (1,) + patch_size + (1,)

    patchx, patchy, patchz = patch_size

    config = tf.ConfigProto(
        allow_soft_placement=True,
        #gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.9, allow_growth=True),
        #log_device_placement=True,
    )
    self.sess = tf.Session(config=config)
    self.run_metadata = tf.RunMetadata()

    with tf.device("/cpu:0"):
        n_volumes = len(dataset.image)
        full_labels_truth = static_constant_multivolume(
            self.sess, dataset.human_labels, self.padded_patch_size)
        full_labels_lies = static_constant_multivolume(
            self.sess, dataset.machine_labels, self.padded_patch_size)
        full_image = static_constant_multivolume(
            self.sess, dataset.image, self.padded_patch_size)
        samples = static_constant_multivolume(
            self.sess, dataset.samples, (1, 3), indexing='CORNER')
    print("finished loading data")

    with tf.name_scope('params'):
        self.step = tf.Variable(0)
        discrim, reconstruct = discrim_net3.make_forward_net(patch_size, 2, 1)
        self.discrim = discrim

    self.iteration_type = tf.placeholder(shape=[], dtype=tf.int32)

    with tf.name_scope('optimize'):
        loss = 0
        reconstruction_loss = 0
        for i, d in enumerate(devices):
            with tf.name_scope("gpu" + str(i)):
                with tf.device(d):
                    vol_id = tf.cond(
                        tf.equal(self.iteration_type, 0),
                        lambda: random_sample(tf.constant(train_vols)),
                        lambda: random_sample(tf.constant(test_vols)),
                    )
                    focus = tf.concat(
                        [[0], tf.reshape(samples[vol_id, ('RAND', 0)], (3,)), [0]], 0)
                    focus = tf.Print(focus, [vol_id, focus], message="focus",
                                     summarize=10)
                    rr = augment.RandomRotationPadded()

                    # 1 is correct and 0 is incorrect
                    lies_glimpse = rr(equal_to_centre(full_labels_lies[vol_id, focus]))
                    tmp = full_labels_truth[vol_id, focus]
                    truth_glimpse = rr(equal_to_centre(tmp))
                    human_labels = rr(tmp)
                    image_glimpse = rr(full_image[vol_id, focus])

                    self.summaries.append(image_summary("lies_glimpse", lies_glimpse))
                    self.summaries.append(image_summary("truth_glimpse", truth_glimpse))
                    self.summaries.append(
                        image_summary("human_labels", tf.to_float(human_labels)))

                    occluded = random_occlusion(lies_glimpse)

                with tf.device("/cpu:0"):
                    any_error = tf.stop_gradient(
                        1 - tf.to_float(tf.reduce_all(tf.equal(truth_glimpse,
                                                               lies_glimpse))))
                with tf.device(d):
                    gpu_any_error = tf.identity(any_error)

                    reconstruction = reconstruct(
                        tf.concat([occluded, image_glimpse], 4))
                    reconstruction_loss += tf.reduce_sum(
                        tf.nn.sigmoid_cross_entropy_with_logits(
                            logits=reconstruction, labels=truth_glimpse))
                    self.summaries.append(
                        image_summary("reconstruction", tf.nn.sigmoid(reconstruction)))
                    self.summaries.append(image_summary("occluded", occluded))

                    truth_discrim_tower = discrim(
                        tf.concat([truth_glimpse, image_glimpse], 4))
                    lies_discrim_tower = tf.cond(
                        tf.greater(gpu_any_error, 0.5),
                        lambda: discrim(tf.concat([lies_glimpse, image_glimpse], 4)),
                        lambda: map(tf.identity, truth_discrim_tower))

                with tf.device(d):
                    loss += tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=tf.reduce_sum(lies_discrim_tower[-1]),
                        labels=any_error)
                    loss += tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=tf.reduce_sum(truth_discrim_tower[-1]),
                        labels=tf.constant(0, dtype=tf.float32))

                with tf.device("/cpu:0"):
                    #any_error = has_error(lies_glimpse, human_labels)
                    lies_glimpse = tf.identity(lies_glimpse)
                    human_labels = tf.identity(human_labels)

                for i in range(4, 6):
                    ds_shape = static_shape(lies_discrim_tower[i])
                    expander = compose(*reversed(discrim_net3.range_expanders[0:i]))
                    tmp = slices_to_shape(expander(shape_to_slices(ds_shape[1:4])))
                    assert tuple(tmp) == tuple(self.patch_size)

                    def get_localized_errors():
                        print(ds_shape)
                        x = localized_errors(lies_glimpse, human_labels,
                                             ds_shape=ds_shape, expander=expander)
                        return tf.Print(x, [any_error], message="any error")

                    errors = tf.cond(
                        tf.greater(any_error, 0.5),
                        lambda: get_localized_errors(),
                        lambda: tf.zeros(ds_shape))
                    #errors = tf.Print(errors, [tf.reduce_sum(errors)])

                    loss += tf.reduce_mean(
                        tf.nn.sigmoid_cross_entropy_with_logits(
                            logits=lies_discrim_tower[i], labels=errors))
                    loss += tf.reduce_mean(
                        tf.nn.sigmoid_cross_entropy_with_logits(
                            logits=truth_discrim_tower[i],
                            labels=tf.zeros_like(truth_discrim_tower[i])))
                    self.summaries.append(
                        image_summary(
                            "guess" + str(i),
                            upsample_mean(tf.nn.sigmoid(lies_discrim_tower[i]),
                                          self.padded_patch_size, expander),
                            zero_one=True))
                    self.summaries.append(
                        image_summary(
                            "truth" + str(i),
                            upsample_mean(errors, self.padded_patch_size,
                                          expander)))

        loss = loss / len(devices)
        reconstruction_loss = reconstruction_loss / len(devices)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope='params')

    def train_op():
        optimizer = tf.train.AdamOptimizer(0.0001, beta1=0.95, beta2=0.9995,
                                           epsilon=0.1)
        op = optimizer.minimize(8e5 * loss + reconstruction_loss,
                                colocate_gradients_with_ops=True,
                                var_list=var_list)

        ema_loss = EMA(decay=0.99)
        ema_loss.update(loss)

        ema_reconstruction_loss = EMA(decay=0.99)
        ema_reconstruction_loss.update(reconstruction_loss)

        with tf.control_dependencies([op]):
            with tf.control_dependencies([self.step.assign_add(1)]):
                op = tf.group(
                    tf.Print(0, [tf.identity(self.step), loss],
                             message="step|loss"),
                )
        quick_summary_op = tf.summary.merge([
            tf.summary.scalar("loss", loss),
            tf.summary.scalar("reconstruction_loss", reconstruction_loss),
            tf.summary.scalar("ema_reconstruction_loss",
                              ema_reconstruction_loss.val),
            tf.summary.scalar("ema_loss", ema_loss.val),
        ])
        return op, quick_summary_op

    def test_op():
        ema_test_loss = EMA(decay=0.9)
        ema_test_loss.update(loss)
        ema_test_reconstruction_loss = EMA(decay=0.9)
        ema_test_reconstruction_loss.update(reconstruction_loss)
        quick_summary_op = tf.summary.merge([
            tf.summary.scalar("test_loss", loss),
            tf.summary.scalar("test_reconstruction_loss", reconstruction_loss),
            tf.summary.scalar("ema_test_reconstruction_loss",
                              ema_test_reconstruction_loss.val),
            tf.summary.scalar("ema_test_loss", ema_test_loss.val),
        ])
        return tf.no_op(), quick_summary_op

    self.iter_op, self.quick_summary_op = tf.cond(
        tf.equal(self.iteration_type, 0), train_op, test_op)

    self.sess.run(tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.VARIABLES, scope='params') +
        tf.get_collection(tf.GraphKeys.VARIABLES, scope='optimize')))
    print(self.sess.run(tf.report_uninitialized_variables(tf.all_variables())))

    summary_op = tf.summary.merge(self.summaries)

    self.saver = tf.train.Saver(var_list=var_list,
                                keep_checkpoint_every_n_hours=2)
    self.summary_op = summary_op
def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    server = tf.train.Server(cluster, job_name="ps", task_index=0,
                             config=config)
    sess = tf.Session(target=server.target, config=config)
    summary_writer = tf.summary.FileWriter(log_path)
    Net = MiniNetwork(sess=sess, summary_writer=summary_writer,
                      rl_training=FLAGS.training, cluster=cluster, index=0,
                      device=DEVICE[0 % len(DEVICE)],
                      ppo_load_path=FLAGS.restore_model_path,
                      ppo_save_path=model_path)
    Sec_Net = SecondNetwork(sess=sess, rl_training=False, reuse=True,
                            cluster=None, index=0, load_model=True)
    agent = mini_source_agent.MiniSourceAgent(
        index=-1, net=Net, sec_net=Sec_Net,
        restore_model=FLAGS.restore_model, rl_training=FLAGS.training)

    print("Parameter server: waiting for cluster connection...")
    sess.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    agent.init_network()
    print("Parameter server: variables initialized")

    update_counter = 0
    max_win_rate = 0.
    while update_counter < TRAIN_ITERS:
        agent.reset_old_network()

        # wait for update
        Synchronizer.wait()
        logging("Update Network!")
        # TODO: count the time, compare cpu and gpu
        time.sleep(1)

        # update finish
        Synchronizer.wait()
        logging("Update Network finished!")

        steps, win_rate = agent.update_summary(update_counter)
        logging("Steps: %d, win rate: %f" % (steps, win_rate))

        update_counter += 1
        if win_rate >= max_win_rate:
            agent.save_model()
            max_win_rate = win_rate

    return max_win_rate
def Worker(index, update_game_num, Synchronizer, cluster, model_path):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    worker = tf.train.Server(cluster, job_name="worker", task_index=index,
                             config=config)
    sess = tf.Session(target=worker.target, config=config)
    Net = MiniNetwork(sess=sess, summary_writer=None,
                      rl_training=FLAGS.training, cluster=cluster, index=index,
                      device=DEVICE[index % len(DEVICE)],
                      ppo_load_path=FLAGS.restore_model_path,
                      ppo_save_path=model_path)
    Sec_Net = SecondNetwork(sess=sess, rl_training=False, reuse=True,
                            cluster=None, index=index, load_model=True)

    global_buffer = Buffer()
    agents = []
    for i in range(THREAD_NUM):
        agent = mini_source_agent.MiniSourceAgent(
            index=i, global_buffer=global_buffer, net=Net, sec_net=Sec_Net,
            restore_model=FLAGS.restore_model, rl_training=FLAGS.training,
            strategy_agent=None)
        agents.append(agent)

    print("Worker %d: waiting for cluster connection..." % index)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % index)

    while len(sess.run(tf.report_uninitialized_variables())):
        print("Worker %d: waiting for variable initialization..." % index)
        time.sleep(1)
    print("Worker %d: variables initialized" % index)

    game_num = np.ceil(update_game_num // THREAD_NUM)

    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()

    # Run threads
    threads = []
    for i in range(THREAD_NUM - 1):
        t = threading.Thread(target=run_thread,
                             args=(agents[i], game_num, Synchronizer,
                                   FLAGS.difficulty))
        threads.append(t)
        t.daemon = True
        t.start()
        time.sleep(3)

    run_thread(agents[-1], game_num, Synchronizer, FLAGS.difficulty)

    for t in threads:
        t.join()
def run(args, server):
    # lkx: client_id and remotes don't matter for non-VNC and Flash games
    # env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    # trainer = A3C(env, args.task)
    target_task = 1  # int(args.target_task)
    env_names = args.env_id.split("_")
    envs = [
        create_env(env_name, client_id=str(args.worker_id), remotes=args.remotes)
        for env_name in env_names
    ]
    trainer = A3C(envs, int(args.worker_id), target_task)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [
        v for v in tf.all_variables() if not v.name.startswith("local")
    ]
    init_op = tf.initialize_variables(variables_to_save)
    init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    variables_to_restore = [
        v for v in tf.all_variables()
        if v.name.startswith("global0") and "global_step" not in v.name
    ]  # Adam_2 and 3 cost by the distillation train op
    pre_train_saver = FastSaver(variables_to_restore)

    # variables_global_toinit = [v for v in tf.all_variables() if v.name.startswith("global0")]
    # for v in tf.all_variables():
    #     if v.name.startswith("global/"):
    #         print v.name

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)
        pre_train_saver.restore(
            ses, "../expResults/20170125_09-26/train/model.ckpt-4986751")
        # "../expResults/20170125_09-26/train/model.ckpt-4198738")
        # "../expResults/20170124_15-11/train/model.ckpt-4986137")
        # "../expResults/20170124_15-11/train/model.ckpt-4301837")
        # "../expResults/20170124_15-11/train/model.ckpt-2140636")

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.worker_id)
    ])  # refers to worker id
    logdir = os.path.join(args.log_dir, 'train')
    summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.worker_id)
    logger.info("Events directory: %s_%s", logdir, args.worker_id)

    sv = tf.train.Supervisor(
        is_chief=(args.worker_id == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,  # Defaults to an Operation that initializes all variables
        init_fn=init_fn,  # Called after the optional init_op is called
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(
            variables_to_save),  # lists the names of uninitialized variables.
        global_step=trainer.global_step[target_task],
        save_model_secs=30,
        save_summaries_secs=30)

    num_taskss = len(envs)
    num_global_steps = 20000000  # 10000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect "
        "to the parameter server. One common cause is that the parameter server "
        "DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        for ii in np.arange(num_taskss):
            sess.run(trainer.sync[ii])
        sess.run(trainer.sync_logits)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step[target_task])
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or
                                        global_step < num_global_steps):
            # if global_step <= 1000000 and np.random.uniform(0, 1) > 0.5:  # todo annealing
            #     batch_aux = trainer.get_knowledge(sess)
            #     trainer.process(sess, batch_aux)
            trainer.process(sess)
            global_step = sess.run(trainer.global_step[target_task])

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def train(args, server, cluster, env, queue_shapes, trajectory_queue_size):
    agent = Agent(args, server, cluster, env, queue_shapes,
                  trajectory_queue_size)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()

    saver = ut.tf.FastSaver(variables_to_save)

    var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    devices = ["/job:ps"]
    if args.task == 0:
        devices += [f"/job:worker/task:{args.task}/gpu:0",
                    f"/job:worker/task:{args.task}/cpu:0"]
    else:
        devices += [f"/job:worker/task:{args.task}/cpu:0"]
    config = tf.ConfigProto(device_filters=devices, allow_soft_placement=True)

    logger.info("Events directory: %s_%s", args.load_path, args.task)
    summary_writer = tf.summary.FileWriter(f"{args.load_path}_{args.task}")
    agent.summary_writer = summary_writer

    sv = tf.train.Supervisor(
        is_chief=args.task == 0,
        logdir=str(args.load_path),
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=agent.policy_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_policy_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting"
        " to connect to the parameter server. One common cause is that"
        " the parameter server DNS name isn't resolving yet, or is misspecified.")

    with sv.managed_session(server.target, config=config) as sess, \
            sess.as_default():
        def sync():
            #logger.error("SYNC")
            sess.run(agent.sync)

        ###############################
        # Run thread
        ###############################
        if args.task >= 1:
            sync()
            agent.start_worker_thread(sess, summary_writer)

        policy_step = sess.run(agent.policy_step)
        logger.info("Starting training at step=%d", policy_step)

        while not sv.should_stop() and (
                not num_policy_steps or policy_step < num_policy_steps):
            if args.task == 0:
                agent.train_policy(sess)
            else:
                sync()
            policy_step = sess.run(agent.policy_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', policy_step)
import tensorflow as tf

sess = tf.Session()
my_variable = tf.get_variable("my_variable", [1])

print(tf.get_collection("my_collection_name"))
print(tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES))
print(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

# Place a variable on a given device
with tf.device("/device:CPU:0"):
    v = tf.get_variable("v", [1])

# Initialize all global variables
sess.run(tf.global_variables_initializer())

# Variables can also be initialized individually
sess.run(my_variable.initializer)

# Find the variables that have not been initialized yet
print(sess.run(tf.report_uninitialized_variables()))

# Correct way to initialize a variable that depends on another variable
x = tf.get_variable("x", shape=(), initializer=tf.zeros_initializer())
w = tf.get_variable("w", initializer=x.initialized_value() + 1)

# We can now use the variable like a normal tensor
l = w + 1

# We can assign values to already-defined variables
assignment = x.assign_add(1)
sess.run(tf.global_variables_initializer())
sess.run(x)
sess.run(assignment)
sess.run(x)
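# The snippet above only prints the uninitialized names; a common follow-up is
# to turn that list into a targeted initializer. A minimal sketch, assuming
# TF1.x graph mode and an open session (the helper name is ours, not from any
# of the surrounding repos):
import tensorflow as tf

def initialize_uninitialized(sess):
    # report_uninitialized_variables() returns variable names as bytes,
    # e.g. b'my_variable'; decode them before comparing.
    uninit_names = set(
        n.decode('utf-8') for n in sess.run(tf.report_uninitialized_variables()))
    uninit_vars = [v for v in tf.global_variables()
                   if v.name.split(':')[0] in uninit_names]
    sess.run(tf.variables_initializer(uninit_vars))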
def initialize(self):
    """Fetch the record, then use tf's saver.restore."""
    if self.do_restore:

        # First, determine which checkpoint to use.
        if self.from_ckpt is not None:
            # Use a cached checkpoint file.
            ckpt_filename = self.from_ckpt
            log.info('Restoring variables from checkpoint %s ...' % ckpt_filename)
        else:
            # Otherwise, use a database checkpoint.
            if self.load_data is None:
                self.load_rec()
            if self.load_data is not None:
                rec, ckpt_filename = self.load_data
                log.info('Restoring variables from record %s (step %d)...'
                         % (str(rec['_id']), rec['step']))
            else:
                # No db checkpoint to load.
                ckpt_filename = None

        if ckpt_filename is not None:
            # Determine which vars should be restored from the specified checkpoint.
            restore_vars = self.get_restore_vars(ckpt_filename)
            restore_names = [name for name, var in restore_vars.items()]
            # Remap the restored names to the new ones.
            if self.load_param_dict:
                for each_old_name in self.load_param_dict.keys():
                    if each_old_name in restore_names:
                        restore_names.remove(each_old_name)
                        restore_names.append(self.load_param_dict[each_old_name])

            # Actually load the vars.
            log.info('Restored Vars (in ckpt, in graph):\n' + str(restore_names))
            tf_saver_restore = tf.train.Saver(restore_vars)
            tf_saver_restore.restore(self.sess, ckpt_filename)
            log.info('... done restoring.')

            # Run post-init ops if needed.
            if self.var_manager:
                self.sess.run(tf.group(*self.var_manager.get_post_init_ops()))

            # Reinitialize all other, unrestored vars.
            unrestored_vars = [var for name, var in self.var_list.items()
                               if name not in restore_names]
            unrestored_var_names = [
                name for name, var in self.var_list.items()
                if name not in restore_names
                and not any(name.endswith(s) for s in OPTIMIZER_NAMES)]
            log.info('Unrestored Vars (in graph, not in ckpt):\n'
                     + str(unrestored_var_names))

            # Initialize variables that were not restored.
            self.sess.run(tf.variables_initializer(unrestored_vars))
            assert len(self.sess.run(tf.report_uninitialized_variables())) == 0, (
                self.sess.run(tf.report_uninitialized_variables()))

    if not self.do_restore \
            or (self.load_data is None and self.from_ckpt is None):
        init_op_global = tf.global_variables_initializer()
        self.sess.run(init_op_global)
        init_op_local = tf.local_variables_initializer()
        self.sess.run(init_op_local)
        if self.var_manager:
            self.sess.run(tf.group(*self.var_manager.get_post_init_ops()))
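# The method above follows a restore-then-fill pattern: restore a subset from a
# checkpoint, initialize whatever the checkpoint did not cover, and use
# report_uninitialized_variables() as the final sanity check. A minimal sketch,
# assuming TF1.x (the function and argument names are ours):
import tensorflow as tf

def restore_and_init_rest(sess, saver, ckpt_path, restored_vars):
    saver.restore(sess, ckpt_path)
    remaining = [v for v in tf.global_variables()
                 if v not in set(restored_vars)]
    sess.run(tf.variables_initializer(remaining))
    # Nothing should be left uninitialized at this point.
    leftover = sess.run(tf.report_uninitialized_variables())
    assert leftover.size == 0, leftover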
def __init__(self, graph_path, target_size=(320, 240), tf_config=None, trt_bool=False):
    self.target_size = target_size

    # load graph
    logger.info('loading graph from %s (default size=%dx%d)' %
                (graph_path, target_size[0], target_size[1]))
    with tf.gfile.GFile(graph_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    if trt_bool:
        output_nodes = ["Openpose/concat_stage7"]
        graph_def = trt.create_inference_graph(
            graph_def,
            output_nodes,
            max_batch_size=1,
            max_workspace_size_bytes=1 << 20,
            precision_mode="FP16",
            # precision_mode="INT8",
            minimum_segment_size=3,
            is_dynamic_op=True,
            maximum_cached_engines=int(1e3),
            use_calibration=True,
        )

    self.graph = tf.get_default_graph()
    tf.import_graph_def(graph_def, name='TfPoseEstimator')
    self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)

    for ts in [n.name for n in tf.get_default_graph().as_graph_def().node]:
        print(ts)

    self.tensor_image = self.graph.get_tensor_by_name('TfPoseEstimator/image:0')
    self.tensor_output = self.graph.get_tensor_by_name(
        'TfPoseEstimator/Openpose/concat_stage7:0')
    self.tensor_heatMat = self.tensor_output[:, :, :, :19]
    self.tensor_pafMat = self.tensor_output[:, :, :, 19:]
    self.upsample_size = tf.placeholder(dtype=tf.int32, shape=(2,),
                                        name='upsample_size')
    self.tensor_heatMat_up = tf.image.resize_area(
        self.tensor_output[:, :, :, :19], self.upsample_size,
        align_corners=False, name='upsample_heatmat')
    self.tensor_pafMat_up = tf.image.resize_area(
        self.tensor_output[:, :, :, 19:], self.upsample_size,
        align_corners=False, name='upsample_pafmat')
    if trt_bool:
        smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0, 19)
    else:
        smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0)
    gaussian_heatMat = smoother.get_output()

    max_pooled_in_tensor = tf.nn.pool(gaussian_heatMat, window_shape=(3, 3),
                                      pooling_type='MAX', padding='SAME')
    self.tensor_peaks = tf.where(
        tf.equal(gaussian_heatMat, max_pooled_in_tensor),
        gaussian_heatMat, tf.zeros_like(gaussian_heatMat))

    self.heatMat = self.pafMat = None

    # warm-up: initialize whatever the imported graph left uninitialized,
    # then run inference at a few upsample sizes
    self.persistent_sess.run(
        tf.variables_initializer([
            v for v in tf.global_variables()
            if v.name.split(':')[0] in [
                x.decode('utf-8') for x in self.persistent_sess.run(
                    tf.report_uninitialized_variables())
            ]
        ]))
    self.persistent_sess.run(
        [self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up],
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ],
            self.upsample_size: [target_size[1], target_size[0]]
        })
    self.persistent_sess.run(
        [self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up],
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ],
            self.upsample_size: [target_size[1] // 2, target_size[0] // 2]
        })
    self.persistent_sess.run(
        [self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up],
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ],
            self.upsample_size: [target_size[1] // 4, target_size[0] // 4]
        })

    # logs
    if self.tensor_image.dtype == tf.quint8:
        logger.info('quantization mode enabled.')
# accuracy
with tf.name_scope('Accuracy'):
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

local_init_op = opt.local_step_init_op
if is_chief:
    local_init_op = opt.chief_init_op
ready_for_local_init_op = opt.ready_for_local_init_op

# Initial token and chief queue runners required by the sync_replicas mode
chief_queue_runner = opt.get_chief_queue_runner()
sync_init_op = opt.get_init_tokens_op()

init_op = tf.global_variables_initializer()
variables_check_op = tf.report_uninitialized_variables()

sess_config = tf.ConfigProto(allow_soft_placement=True,
                             log_device_placement=False,
                             device_filters=[
                                 "/job:ps",
                                 "/job:worker/task:%d" % FLAGS.task_index
                             ])

sv = tf.train.Supervisor(is_chief=is_chief,
                         init_op=init_op,
                         local_init_op=local_init_op,
                         ready_for_local_init_op=ready_for_local_init_op,
                         global_step=global_step)

server_grpc_url = "grpc://" + workers[FLAGS.task_index]

state = False
with sv.prepare_or_wait_for_session(server_grpc_url,
                                    config=sess_config) as sess:
    # Poll until every variable reports as initialized.
    while not state:
        uninitialized_vars = sess.run(variables_check_op)
        state = len(uninitialized_vars) == 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data', help='data file')
    parser.add_argument('--load', help='model to load')
    parser.add_argument('--epochs', type=int, default=10000,
                        help='number of epochs to train for')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--resample', action='store_true', help='resample data')
    parser.add_argument('--gpus', help='gpu to use')
    parser.add_argument('--num_images', type=int, default=2,
                        help='number of input images')
    parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
    parser.add_argument('--learning_rate', type=float, default=5e-3,
                        help='learning rate')
    parser.add_argument('--num_pts', type=int, default=1,
                        help='number of output waypoints')
    parser.add_argument('--capacity', type=float, default=1,
                        help='network capacity')
    parser.add_argument('--cam_coord', type=float, default=-1,
                        help='use focal length coordinates')
    parser.add_argument('--min', type=tuple, default=(0, -0.5, -0.5),
                        help='minimum xyz')
    parser.add_argument('--max', type=tuple, default=(1, 0.5, 0.5),
                        help='maximum xyz')
    parser.add_argument('--bins', type=int, default=100,
                        help='number of bins per coordinate')
    parser.add_argument('--dense', type=int, default=0,
                        help='number of additional dense layers')
    args = parser.parse_args()
    args.min = [(0, -0.5, -0.1), (0, -1, -0.15), (0, -1.5, -0.2),
                (0, -2, -0.3), (0, -3, -0.5)]
    args.max = [(1, 0.5, 0.1), (2, 1, 0.15), (4, 1.5, 0.2),
                (6, 2, 0.3), (7, 0.3, 0.5)]
    if args.gpus is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    # Model and optimization params
    val_perc = 0.3
    # g_depths = [64, 64, 64]
    # f_depths = [64, 64, 64]
    batch_size = args.batch_size  # 512 # 1024 # 64
    num_epochs = args.epochs
    learning_rate = args.learning_rate  # 50 # e-1
    learn_rate_decay = 100 / num_epochs
    save_variables_divider = 10
    log_path = './model/logs'
    save_path = createStampedFolder(os.path.join(log_path, 'variable_log'))

    ######################
    # Make model
    print('Building model')
    model = OrangeClassNet(args.capacity, args.num_images, args.num_pts,
                           args.cam_coord, args.min, args.max, args.bins,
                           args.dense)

    # Load in data
    train_indices, val_indices, num_list, traj_data, val_data = parseDirData(
        args.data, args.seed, args.resample, val_perc, args.num_pts)
    num_train_samples = train_indices.shape[0]
    num_val_samples = val_indices.shape[0]
    # print(val_data)
    with open('val_data.pickle', 'wb') as fopen:
        pickle.dump(val_data, fopen, pickle.HIGHEST_PROTOCOL)
    # exit(0)

    # Train model
    print('Training...')
    print('Training Samples: ' + str(num_train_samples))
    print('Validation Samples: ' + str(num_val_samples))
    data_loc = copy.deepcopy(args.data)
    data_loc_name = data_loc.strip("..").strip(".").strip("/").replace("/", "_")
    mean_img_loc = data_loc + "../mean_imgv2_" + data_loc_name + '.npy'
    print(mean_img_loc)
    if not os.path.exists(mean_img_loc):
        print('mean image file not found')
        mean_image = compute_mean_image(train_indices, data_loc, model)
        np.save(mean_img_loc, mean_image)
    else:
        print('mean image file found')
        mean_image = np.load(mean_img_loc)
    # mean_image = np.zeros((model.h, model.w, 3))
    val_inputs, val_outputs_x, val_outputs_y, val_outputs_z = loadData(
        val_indices, num_list, data_loc, model, traj_data)
    print('Validation Loaded')
    # train_path = addTimestamp(os.path.join(log_path, 'train_'))
    val_path = addTimestamp(os.path.join(log_path, 'validation_'))
    plot_data_path = addTimestamp(os.path.join(log_path, 'plot_data_'))
    # train_writer = tf.summary.FileWriter(train_path, graph=tf.get_default_graph())
    val_writer = tf.summary.FileWriter(val_path, graph=tf.get_default_graph())
    os.makedirs(plot_data_path)
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()
    feed_dict = {}  # model.keep_prob: 0.9}
    print('Writers Set Up')
    with tf.Session() as sess:
        # Load model if specified
        if args.load:
            saver.restore(sess, tf.train.latest_checkpoint(args.load))
            uninit_vars_op = tf.report_uninitialized_variables()
            uninit_vars = sess.run(uninit_vars_op)
            uninit_vars_op.mark_used()
            if uninit_vars.size != 0:
                print(uninit_vars)  # , sep=',')
                sess.close()
                raise RuntimeError('Uninitialized variables present')
        else:
            sess.run(init)
        print('Session')
        iters = 0
        plotting_data = dict()
        plotting_data['idx'] = range(len(val_indices))
        # print(plotting_data['idx'])
        # exit(0)
        plotting_data['truth'] = [
            val_outputs_x[plotting_data['idx']],
            val_outputs_y[plotting_data['idx']],
            val_outputs_z[plotting_data['idx']]
        ]
        plotting_data['data'] = list()
        plotting_data['foc_l'] = args.cam_coord
        plotting_data['min'] = model.min
        plotting_data['max'] = model.max
        plotting_data['bins'] = model.bins
        for ii in plotting_data['idx']:
            plotting_data['data'].append([])
        # print(plotting_data)
        # for epoch in range(num_epochs):
        for epoch in range(1):
            print('Epoch: ', epoch)
            batch_idx = 0
            # Decay learning rate; the assign op must actually be run,
            # otherwise the new rate never takes effect.
            new_learn_rate = np.exp(-epoch * learn_rate_decay) * learning_rate
            print('Learning Rate Set to: ' + str(new_learn_rate))
            sess.run(model.learning_fac.assign(new_learn_rate))
            """
            while batch_idx < num_train_samples:
                end_idx = min(batch_idx + batch_size, num_train_samples)
                train_inputs, train_outputs_x, train_outputs_y, train_outputs_z = loadData(
                    train_indices[batch_idx:end_idx], num_list, data_loc, model, traj_data)
                feed_dict[model.image_input] = train_inputs
                feed_dict[model.waypoint_output_x] = train_outputs_x
                feed_dict[model.waypoint_output_y] = train_outputs_y
                feed_dict[model.waypoint_output_z] = train_outputs_z
                # sess.run([model.train_summary_op, model.train_step], feed_dict=feed_dict)
                sess.run(model.train_step, feed_dict=feed_dict)
                batch_idx = batch_idx + batch_size
                iters = iters + 1
                if iters % 20 == 0:
                    summary, logits = sess.run([model.train_summ, model.logits],
                                               feed_dict=feed_dict)
                    accuracy = acc_metric(logits, train_outputs_x, train_outputs_y,
                                          train_outputs_z, model)
                    print('Training Accuracy: ' + str(accuracy))
                    train_writer.add_summary(summary, iters)
                # Clear references to data:
                train_inputs = train_outputs = feed_dict[model.image_input] = \
                    feed_dict[model.waypoint_output_x] = feed_dict[model.waypoint_output_y] = \
                    feed_dict[model.waypoint_output_z] = None
            """
            val_batch_idx = 0
            num_validation = len(val_indices)
            # val_summary = 0
            val_cost = np.zeros((1, ))
            resnet_output = np.zeros(
                (args.num_pts, 3, 0, model.bins))  # 2nd arg for num_waypoints
            raw_losses = np.zeros((3, ))
            accuracy = []
            while val_batch_idx < num_validation:
                val_batch_endx = min(val_batch_idx + batch_size, num_validation)
                val_dict = {
                    model.image_input:
                    val_inputs[val_batch_idx:val_batch_endx],
                    model.waypoint_output[0]:
                    val_outputs_x[val_batch_idx:val_batch_endx],
                    model.waypoint_output[1]:
                    val_outputs_y[val_batch_idx:val_batch_endx],
                    model.waypoint_output[2]:
                    val_outputs_z[val_batch_idx:val_batch_endx]
                }
                val_summary_temp, val_cost_temp, resnet_output_temp, raw_losses_temp = sess.run(
                    [model.val_summ, model.objective, model.logits, model.losses],
                    feed_dict=val_dict)
                val_writer.add_summary(val_summary_temp, iters)  # val_summary_temp
                val_cost = np.multiply(
                    val_cost,
                    (float(val_batch_idx) / val_batch_endx)) + np.multiply(
                        val_cost_temp,
                        (float(val_batch_endx - val_batch_idx) / val_batch_endx))
                resnet_output_temp = np.array(resnet_output_temp)
                accuracy.append(
                    acc_metric(resnet_output_temp,
                               val_dict[model.waypoint_output[0]],
                               val_dict[model.waypoint_output[1]],
                               val_dict[model.waypoint_output[2]], model))
                resnet_output = np.concatenate(
                    (resnet_output, resnet_output_temp), axis=2)
                # Running average: weight the accumulated raw_losses, not the
                # new batch twice.
                raw_losses = np.multiply(
                    raw_losses,
                    (float(val_batch_idx) / val_batch_endx)) + np.multiply(
                        np.array(raw_losses_temp),
                        (float(val_batch_endx - val_batch_idx) / val_batch_endx))
                val_batch_idx = val_batch_endx
            accuracy = np.mean(accuracy, axis=0)
            print('Validation Summary = ', val_cost)
            print('Accuracy = ', accuracy)
            resnet_output = np.array(resnet_output)
            print(raw_losses)
            print(resnet_output.shape)
            for ii in plotting_data['idx']:
                plotting_data['data'][ii].append(resnet_output[:, :, ii, :])
            with open(plot_data_path + '/data.pickle', 'wb') as f:
                pickle.dump(plotting_data, f, pickle.HIGHEST_PROTOCOL)
            # val_writer.add_summary(val_summary, iters)
            # train_writer.flush()
            val_writer.flush()
            # Save variables
            """
            if ((epoch + 1) % save_variables_divider == 0 or (epoch == 0)
                    or (epoch == num_epochs - 1)):
                print("Saving variables")
                if epoch == 0:
                    print("For epoch 0")
                    saver.save(sess, os.path.join(save_path, 'variables'), epoch)
                else:
                    print("For epoch ", epoch)
                    saver.save(sess, os.path.join(save_path, 'variables'), epoch,
                               write_meta_graph=False)
            # Re-shuffle data after each epoch
            rand_idx = np.random.permutation(num_train_samples)
            train_indices = train_indices[rand_idx]
            """
        # train_writer.flush()
        val_writer.flush()
    print("Done")
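# The load branch above treats any output from report_uninitialized_variables()
# after a restore as a hard error. A minimal sketch of that restore-or-init
# guard, assuming TF1.x and a hypothetical checkpoint directory `ckpt_dir`:
import tensorflow as tf

def restore_or_init(sess, saver, ckpt_dir):
    ckpt = tf.train.latest_checkpoint(ckpt_dir)
    if ckpt:
        saver.restore(sess, ckpt)
        leftover = sess.run(tf.report_uninitialized_variables())
        if leftover.size != 0:
            raise RuntimeError('Uninitialized variables present: %s' % leftover)
    else:
        sess.run(tf.global_variables_initializer())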
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=2, activation='softmax')(dense)
model = keras.models.Model(inputs, outputs)
model.compile(
    RAdam(lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)
model.summary()

sess = K.get_session()
uninitialized_variables = set(
    [i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
init_op = tf.variables_initializer([
    v for v in tf.global_variables()
    if v.name.split(':')[0] in uninitialized_variables
])
sess.run(init_op)

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max',
                     verbose=1, save_best_only=True)
history = model.fit(train_x, train_y,
                    # assumption: a held-out split so the val_loss/val_acc
                    # monitors above have data to watch
                    validation_split=0.1,
                    callbacks=[es, mc])
def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs): config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, ) config.gpu_options.allow_growth = True server = tf.train.Server(cluster, job_name="ps", task_index=0, config=config) #config.gpu_options.per_process_gpu_memory_fraction = 0.2 sess = tf.Session(target=server.target, config=config) summary_writer = tf.summary.FileWriter(log_path) mini_net = MiniNetwork(sess, index=0, summary_writer=summary_writer, rl_training=True, cluster=cluster, ppo_load_path=FLAGS.restore_model_path, ppo_save_path=model_path, freeze_head=FLAGS.freeze_head, use_bn=FLAGS.use_bn, use_sep_net=FLAGS.use_sep_net, restore_model=FLAGS.restore_model, restore_from=FLAGS.restore_from, restore_to=FLAGS.restore_to) agent = MiniAgent(agent_id=-1, global_buffer=Buffer(), net=mini_net, restore_model=FLAGS.restore_model) print("Parameter server: waiting for cluster connection...") sess.run(tf.report_uninitialized_variables()) print("Parameter server: cluster ready!") print("Parameter server: initializing variables...") agent.init_network() print("Parameter server: variables initialized") last_win_rate = 0. update_counter = 0 while update_counter < TRAIN_ITERS: agent.reset_old_network() # wait for update Synchronizer.wait() logging("Update Network!") # TODO count the time , compare cpu and gpu time.sleep(1) # update finish Synchronizer.wait() logging("Update Network finished!") steps, win_rate = agent.update_summary(update_counter) logging("Steps: %d, win rate: %f" % (steps, win_rate)) update_counter += 1 if win_rate >= last_win_rate: agent.save_model() last_win_rate = win_rate for p in procs: print('Process terminate') p.terminate()
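# Above, the parameter server runs report_uninitialized_variables() purely as a
# connection probe: the op has no variable dependencies, so it is a cheap way
# to block until the session can talk to the cluster. A minimal local sketch,
# assuming TF1.x (the two-job cluster spec and ports are illustrative):
import tensorflow as tf

cluster = tf.train.ClusterSpec({"ps": ["localhost:2222"],
                                "worker": ["localhost:2223"]})
server = tf.train.Server(cluster, job_name="ps", task_index=0)
sess = tf.Session(target=server.target)
print("waiting for cluster connection...")
sess.run(tf.report_uninitialized_variables())  # returns once the session is usable
print("cluster ready!")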
def run_tester(args, server): env = new_env(args) env.reset() env.max_history = args.eval_num if args.alg == 'A3C': agent = A3C(env, args) elif args.alg == 'Q': agent = Q(env, args) elif args.alg == 'VPN': agent = VPN(env, args) else: raise ValueError('Invalid algorithm: ' + args.alg) device = 'gpu' if args.gpu > 0 else 'cpu' gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15) config = tf.ConfigProto(device_filters=[ "/job:ps", "/job:worker/task:{}/{}:0".format(args.task, device) ], gpu_options=gpu_options, allow_soft_placement=True) variables_to_save = [v for v in tf.global_variables() if \ not v.name.startswith("global") and not v.name.startswith("local/target/")] global_variables = [ v for v in tf.global_variables() if not v.name.startswith("local") ] init_op = tf.variables_initializer(global_variables) init_all_op = tf.global_variables_initializer() var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) logger.info('Trainable vars:') for v in var_list: logger.info(' %s %s', v.name, v.get_shape()) logger.info("Num parameters: %d", agent.local_network.num_param) def init_fn(ses): logger.info("Initializing all parameters.") ses.run(init_all_op) saver = FastSaver(variables_to_save, max_to_keep=0) sv = tf.train.Supervisor( is_chief=False, global_step=agent.global_step, summary_op=None, init_op=init_op, init_fn=init_fn, ready_op=tf.report_uninitialized_variables(global_variables), saver=saver, save_model_secs=0, save_summaries_secs=0) best_reward = -10000 with sv.managed_session(server.target, config=config) as sess, sess.as_default(): epoch = args.eval_epoch while args.eval_freq * epoch <= args.max_step: path = os.path.join(args.log, "e%d" % epoch) if not os.path.exists(path + ".index"): time.sleep(10) continue logger.info("Start evaluation (Epoch %d)", epoch) saver.restore(sess, path) np.random.seed(args.seed) reward = evaluate(env, agent.local_network, args.eval_num, eps=args.eps_eval) logfile = open(os.path.join(args.log, "eval.csv"), "a") print("Epoch: %d, Reward: %.2f" % (epoch, reward)) logfile.write("%d, %.3f\n" % (epoch, reward)) logfile.close() if reward > best_reward: best_reward = reward sv.saver.save(sess, os.path.join(args.log, 'best')) print("Saved to: %s" % os.path.join(args.log, 'best')) epoch += 1 logger.info('tester stopped.')
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task),
                     remotes=args.remotes, num_trials=args.num_trials)
    num_global_steps = 10000000
    num_test_steps = 1000000
    trainer = A3C(env, args.task, args.visualise, args.learning_rate,
                  args.meta, args.remotes, args.num_trials, num_global_steps)

    # logs, checkpoints and tensorboard
    # (Original comment) Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)

    # tf.train.Supervisor provides a set of services that help implement a robust training process. *(4)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)
    '''
    # beginning of the training
    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        + "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)  # copy weights from the parameter server to the local model
        # starts the "_run" method of the ThreadRunner "trainer.runner" (an A3C
        # object from the A3C file), which generates partial rollouts and puts
        # them in the queue
        trainer.start(sess, summary_writer)
        # checks the tmp folder for a previously interrupted training run to
        # resume; otherwise starts from scratch with the global_step counter at 0
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            # (original comment) grabs a rollout from the queue and updates the
            # parameters of the server
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # End of the training; asks for all the services to stop.
    sv.stop()
    logger.info('Training finished; reached %s steps. worker stopped.', global_step)
    time.sleep(5)
    '''

    # Beginning of the test phase
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        initial_global_step = global_step = sess.run(trainer.global_step)
        logger.info("Starting tests at step=%d", global_step)
        while not sv.should_stop() and (
                not num_test_steps
                or (global_step - initial_global_step) < num_test_steps):
            trainer.inc_global_step(sess)
            global_step = sess.run(trainer.global_step)

    logger.info('Tests finished; reached %s steps. worker stopped.', global_step)
    sv.stop()
def run(args, server):
    env = new_env(args)
    if args.alg == 'A3C':
        trainer = A3C(env, args)
    elif args.alg == 'Q':
        trainer = Q(env, args)
    elif args.alg == 'VPN':
        env_off = new_env(args)
        env_off.verbose = 0
        env_off.reset()
        trainer = VPN(env, args, env_off=env_off)
    else:
        raise ValueError('Invalid algorithm: ' + args.alg)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables()
                         if not v.name.startswith("global")
                         and not v.name.startswith("local/target/")]
    global_variables = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]

    init_op = tf.variables_initializer(global_variables)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save, max_to_keep=0)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())
    logger.info("Num parameters: %d", trainer.local_network.num_param)

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    device = 'gpu' if args.gpu > 0 else 'cpu'
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15)
    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/{}:0".format(args.task, device)
    ], gpu_options=gpu_options, allow_soft_placement=True)
    logdir = os.path.join(args.log, 'train')
    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(global_variables),
        global_step=trainer.global_step,
        save_model_secs=0,
        save_summaries_secs=30)

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        + "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        epoch = -1
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not args.max_step
                                        or global_step < args.max_step):
            if args.task == 0 and int(global_step / args.eval_freq) > epoch:
                epoch = int(global_step / args.eval_freq)
                filename = os.path.join(args.log, 'e%d' % (epoch))
                sv.saver.save(sess, filename)
                sv.saver.save(sess, os.path.join(args.log, 'latest'))
                print("Saved to: %s" % filename)
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)
        if args.task == 0 and int(global_step / args.eval_freq) > epoch:
            epoch = int(global_step / args.eval_freq)
            filename = os.path.join(args.log, 'e%d' % (epoch))
            sv.saver.save(sess, filename)
            sv.saver.save(sess, os.path.join(args.log, 'latest'))
            print("Saved to: %s" % filename)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def run(args): logger.info('Read data:') logger.info('Build graph:') model = EditableGAN(args) print('######################## GPU ALLOCATION ########################') print(args.gpu) print('######################## GPU ALLOCATION ########################') os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu variables_to_save = tf.global_variables() init_op = tf.variables_initializer(variables_to_save) init_all_op = tf.global_variables_initializer() saver = FastSaver(var_list=variables_to_save, max_to_keep=5) logger.info('GLOBAL vars:') var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name) for v in var_list: logger.info(' %s %s', v.name, v.get_shape()) if args.load_model != '': model_name = args.load_model else: model_name = '{}_{}'.format("GAN", datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + args.model_name) logdir = './logs' makedirs(logdir) logdir = os.path.join(logdir, model_name) logger.info('Events directory: %s', logdir) summary_writer = tf.summary.FileWriter(logdir) def init_fn(sess): logger.info('Initializing all parameters.') sess.run(init_all_op) sv = tf.train.Supervisor(is_chief=True, logdir=logdir, saver=saver, summary_op=None, init_op=init_op, init_fn=init_fn, summary_writer=summary_writer, ready_op=tf.report_uninitialized_variables(variables_to_save), global_step=model.global_step, save_model_secs=1200, save_summaries_secs=30) f = open(os.path.join(logdir, 'description.txt'), 'w') f.write('Description : \n' + args.description) f.close() if args.train: logger.info("Starting training session.") with sv.managed_session() as sess: base_dir = os.path.join('results', model_name) makedirs(base_dir) model.train(sess, summary_writer, base_dir) logger.info("Starting testing session.") with sv.managed_session() as sess: base_dir = os.path.join('results', model_name) makedirs(base_dir) model.test(sess, base_dir)
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        + "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def tf_assert_all_init(sess): uninit_vars = sess.run(tf.report_uninitialized_variables()) assert len( uninit_vars ) == 0, 'Expected all variables to have been initialized, but these have not been: %s' % uninit_vars
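# Hypothetical usage of the helper above (TF1.x graph mode):
import tensorflow as tf

v = tf.Variable(1, name="v")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    tf_assert_all_init(sess)  # raises AssertionError if anything is uninitialized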
def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs): config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, ) config.gpu_options.allow_growth = True server = tf.train.Server(cluster, job_name="ps", task_index=0, config=config) sess = tf.Session(target=server.target, config=config) summary_writer = tf.summary.FileWriter(log_path) Net = MiniNetwork(sess=sess, summary_writer=summary_writer, rl_training=FLAGS.training, cluster=cluster, index=0, device=DEVICE[0 % len(DEVICE)], ppo_load_path=FLAGS.restore_model_path, ppo_save_path=model_path, ob_space_add=FLAGS.ob_space_add, act_space_add=FLAGS.act_space_add, freeze_head=FLAGS.freeze_head, use_bn=FLAGS.use_bn, use_sep_net=FLAGS.use_sep_net, restore_model=FLAGS.restore_model, restore_from=FLAGS.restore_from, restore_to=FLAGS.restore_to, load_latest=FLAGS.load_latest, add_image=FLAGS.add_image, partial_restore=FLAGS.partial_restore, weighted_sum_type=FLAGS.weighted_sum_type, initial_type=FLAGS.initial_type) agent = mini_source_agent.MiniSourceAgent( index=-1, net=Net, restore_model=FLAGS.restore_model, rl_training=FLAGS.training, ob_space_add=FLAGS.ob_space_add) print("Parameter server: waiting for cluster connection...") sess.run(tf.report_uninitialized_variables()) print("Parameter server: cluster ready!") print("Parameter server: initializing variables...") agent.init_network() print("Parameter server: variables initialized") update_counter = 0 max_win_rate = 0. latest_win_rate = 0. while update_counter < TRAIN_ITERS: agent.reset_old_network() # wait for update Synchronizer.wait() logging("Update Network!") # TODO count the time , compare cpu and gpu time.sleep(1) # update finish Synchronizer.wait() logging("Update Network finished!") steps, win_rate = agent.update_summary(update_counter) logging("Steps: %d, win rate: %f" % (steps, win_rate)) update_counter += 1 if win_rate >= max_win_rate: agent.save_model() max_win_rate = win_rate latest_win_rate = win_rate agent.net.save_latest_policy() return max_win_rate, latest_win_rate
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args.unsup)

    # logging
    if args.task == 0:
        with open(args.log_dir + '/log.txt', 'w') as fid:
            for key, val in constants.items():
                fid.write('%s: %s\n' % (str(key), str(val)))
            fid.write('input observation: %s\n' % str(env.observation_space.shape))
            fid.write('env name: %s\n' % str(env.spec.id))
            fid.write('unsup method type: %s\n' % str(args.unsup))

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)
    if args.pretrain is not None:
        variables_to_restore = [
            v for v in tf.trainable_variables() if not v.name.startswith("local")
        ]
        pretrain_saver = FastSaver(variables_to_restore)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)
        if args.pretrain is not None:
            pretrain = tf.train.latest_checkpoint(args.pretrain)
            logger.info("==> Restoring from given pretrained checkpoint.")
            logger.info("    Pretraining address: %s", pretrain)
            pretrain_saver.restore(ses, pretrain)
            logger.info("==> Done restoring model! Restored %d variables.",
                        len(variables_to_restore))

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = constants['MAX_GLOBAL_STEPS']

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        + "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        # Workaround for FailedPreconditionError
        # see: https://github.com/openai/universe-starter-agent/issues/44 and 31
        sess.run(trainer.sync)

        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at global_step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
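# The init_fn above layers a pretrained restore on top of full initialization:
# every variable gets a value first, then the pretrained subset overwrites its
# freshly initialized values. A minimal sketch of that ordering as a reusable
# closure (the names are ours, not from the repo):
def make_init_fn(init_all_op, restore_saver, ckpt_path):
    def init_fn(ses):
        ses.run(init_all_op)                    # initialize everything first
        restore_saver.restore(ses, ckpt_path)   # then overwrite the pretrained subset
    return init_fn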
def main(_):
    tic = time.time()
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    # init
    net_name_scope_pruned = FLAGS.net_name_scope_pruned
    net_name_scope_checkpoint = FLAGS.net_name_scope_checkpoint
    indexed_prune_scopes_for_units = valid_indexed_prune_scopes_for_units
    kept_percentages_dict = get_kept_percentages_dict_from_path(
        FLAGS.checkpoint_path)
    kept_percentages = sorted(map(float, FLAGS.kept_percentages.split(',')))

    # check that networks with the kps are pre-trained.
    for kp in kept_percentages:
        if kp not in kept_percentages_dict:
            raise ValueError('kept_percentage=' + str(kp) + ' not in folder:' +
                             FLAGS.checkpoint_path)

    num_options = len(kept_percentages)
    num_units = len(indexed_prune_scopes_for_units)
    print('num_options=%d, num_blocks=%d' % (num_options, num_units))
    print('HG: total number of configurations=%d' % (num_options**num_units))

    if FLAGS.configuration_type == 'sample':
        configs = get_sampled_configurations(num_units, num_options,
                                             FLAGS.total_num_configurations)
    elif FLAGS.configuration_type == 'special':
        configs = get_special_configurations(num_units, num_options)
    num_configurations = len(configs)

    # Getting MPI rank integer
    # comm = MPI.COMM_WORLD
    # rank = comm.Get_rank()
    # if rank >= num_configurations:
    #     print("ERROR: rank(%d) > num_configurations(%d)" % (rank, num_configurations))
    #     return
    rank = 0
    FLAGS.configuration_index = FLAGS.start_configuration_index + rank
    config = configs[FLAGS.configuration_index]
    print('HG: kept_percentages=%s, num_configs=%d, start_config_index=%d, rank=%d, config_index=%d' \
          % (str(kept_percentages), num_configurations,
             FLAGS.start_configuration_index, rank, FLAGS.configuration_index))

    # prepare for training with the specific config
    indexed_prune_scopes, kept_percentage = config_to_indexed_prune_scopes(
        config, indexed_prune_scopes_for_units, kept_percentages)

    # prepare file system
    results_dir = os.path.join(
        FLAGS.train_dir,
        "id" + str(FLAGS.configuration_index))  # +'_'+str(FLAGS.max_number_of_steps))
    train_dir = os.path.join(results_dir, 'train')
    if (not FLAGS.continue_training) or (not tf.train.latest_checkpoint(train_dir)):
        prepare_file_system(train_dir)

    def write_detailed_info(info):
        with open(os.path.join(train_dir, 'train_details.txt'), 'a') as f:
            f.write(info + '\n')

    info = 'train_dir: ' + train_dir + '\n'
    info += 'options:' + str(kept_percentages) + '\n'
    info += 'configuration: ' + str(config) + '\n'
    info += 'indexed_prune_scopes: ' + str(indexed_prune_scopes) + '\n'
    info += 'kept_percentage: ' + str(kept_percentage)
    print(info)
    write_detailed_info(info)

    with tf.Graph().as_default():
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.train_dataset_name,
                                              FLAGS.dataset_dir)
        test_dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                                   FLAGS.test_dataset_name,
                                                   FLAGS.dataset_dir)
        batch_queue = train_inputs(dataset, deploy_config, FLAGS)
        test_images, test_labels = test_inputs(test_dataset, deploy_config, FLAGS)
        images, labels = batch_queue.dequeue()

        ######################
        # Select the network #
        ######################
        network_fn_pruned = nets_factory.get_network_fn_pruned(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay)
#################### # Define the model # #################### prune_info = indexed_prune_scopes_to_prune_info( indexed_prune_scopes, kept_percentage) logits_train, _ = network_fn_pruned(images, prune_info=prune_info, is_training=True, is_local_train=False, reuse_variables=False, scope=net_name_scope_pruned) logits_eval, _ = network_fn_pruned(test_images, prune_info=prune_info, is_training=False, is_local_train=False, reuse_variables=True, scope=net_name_scope_pruned) cross_entropy = add_cross_entropy(logits_train, labels) correct_prediction = add_correct_prediction(logits_eval, test_labels) ############################# # Specify the loss functions # ############################# collection_name = 'subgraph_losses' tf.add_to_collection(collection_name, cross_entropy) # get regularization loss regularization_losses = get_regularization_losses_within_scopes() print_list('regularization_losses', regularization_losses) # total loss and its summary total_loss = tf.add_n(tf.get_collection(collection_name), name='total_loss') for l in tf.get_collection(collection_name) + [total_loss]: tf.summary.scalar(l.op.name + '/summary', l) ######################################### # Configure the optimization procedure. # ######################################### with tf.device(deploy_config.variables_device()): global_step = tf.Variable(0, trainable=False, name='global_step') with tf.device(deploy_config.optimizer_device()): learning_rate = configure_learning_rate(dataset.num_samples, global_step, FLAGS) optimizer = configure_optimizer(learning_rate, FLAGS) tf.summary.scalar('learning_rate', learning_rate) ############################# # Add train operation # ############################# variables_to_train = get_trainable_variables_within_scopes() train_op = add_train_op(optimizer, total_loss, global_step, var_list=variables_to_train) print_list("variables_to_train", variables_to_train) # Gather update_ops: the updates for the batch_norm variables created by network_fn_pruned. 
        update_ops = get_update_ops_within_scopes()
        print_list("update_ops", update_ops)

        update_ops.append(train_op)
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # add summary op
        summary_op = tf.summary.merge_all()

        print("HG: trainable_variables=", len(tf.trainable_variables()))
        print("HG: model_variables=", len(tf.model_variables()))
        print("HG: global_variables=", len(tf.global_variables()))
        # print_list('model_variables but not trainable variables',
        #            list(set(tf.model_variables()).difference(tf.trainable_variables())))
        # print_list('global_variables but not model variables',
        #            list(set(tf.global_variables()).difference(tf.model_variables())))

        # get train scopes for each kept_percentage
        train_scopes_dict = {}
        for scope_index in range(len(indexed_prune_scopes)):
            indexed_prune_scope = indexed_prune_scopes[scope_index]
            scope_kept_percentage = kept_percentage[scope_index]
            if scope_kept_percentage not in train_scopes_dict:
                train_scopes_dict[scope_kept_percentage] = []
            train_scope = get_train_scope_for_local_train(indexed_prune_scope)
            train_scopes_dict[scope_kept_percentage].append(train_scope)
        for key, train_scopes in train_scopes_dict.items():
            train_scopes_dict[key] = sorted(set(train_scopes))
            # print_list("train_scopes", train_scopes)
        print('HG: train_scopes_dict:')
        pprint(train_scopes_dict)

        sess_config = tf.ConfigProto(intra_op_parallelism_threads=16,
                                     inter_op_parallelism_threads=16)
        with tf.Session(config=sess_config) as sess:
            ###########################
            # prepare for filewriter  #
            ###########################
            train_writer = tf.summary.FileWriter(train_dir, sess.graph)

            # if restarting the training or there is no checkpoint in the train_dir
            if (not FLAGS.continue_training) or (
                    not tf.train.latest_checkpoint(train_dir)):
                #################################################
                # Restore pruned model variable values.         #
                #################################################
                all_variables_to_train = []
                for scope_kept_percentage, train_scopes in train_scopes_dict.items():
                    print('HG: kept_percentage', scope_kept_percentage)
                    checkpoint_path = os.path.join(
                        FLAGS.checkpoint_path,
                        kept_percentages_dict[scope_kept_percentage][0],
                        'train')  # 'model.ckpt-'+str(FLAGS.local_train_steps))
                    variables_to_train = {
                        re.sub(net_name_scope_pruned,
                               net_name_scope_pruned + "_p" + str(scope_kept_percentage),
                               v.op.name): v
                        for v in get_model_variables_within_scopes(train_scopes)
                    }
                    print_list("restore pruned model variables",
                               variables_to_train.values())
                    load_checkpoint(sess, checkpoint_path,
                                    var_list=variables_to_train)
                    all_variables_to_train.extend(variables_to_train.values())

                #################################################
                # Restore original model variable values.       #
                #################################################
                variables_to_restore = {
                    re.sub(net_name_scope_pruned, net_name_scope_checkpoint,
                           v.op.name): v
                    for v in get_model_variables_within_scopes()
                    if v not in set(all_variables_to_train)
                }
                print_list("restore original model variables",
                           variables_to_restore.values())
                load_checkpoint(sess, checkpoint_path,
                                var_list=variables_to_restore)
            else:
                ###########################################
                ## Restore all variables from checkpoint ##
                ###########################################
                variables_to_restore = get_global_variables_within_scopes()
                load_checkpoint(sess, train_dir, var_list=variables_to_restore)

            #################################################
            # Initialize uninitialized global variables.    #
            #################################################
            variables_to_init = get_global_variables_within_scopes(
                sess.run(tf.report_uninitialized_variables()))
            print_list("init uninitialized variables", variables_to_init)
            sess.run(tf.variables_initializer(variables_to_init))

            init_global_step_value = sess.run(global_step)
            print('initial global step: ', init_global_step_value)
            if init_global_step_value >= FLAGS.max_number_of_steps:
                print('Exit: init_global_step_value (%d) >= FLAGS.max_number_of_steps (%d)' \
                      % (init_global_step_value, FLAGS.max_number_of_steps))
                return

            ###########################
            # Record CPU usage        #
            ###########################
            # mpstat_output_filename = os.path.join(train_dir, "cpu-usage.log")
            # os.system("mpstat -P ALL 1 > " + mpstat_output_filename + " 2>&1 &")

            ###########################
            # Kick off the training.  #
            ###########################
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
            print('HG: # of threads=', len(threads))

            duration = 0
            duration_cnt = 0
            train_time = 0
            train_only_cnt = 0

            print("start to train at:", datetime.now())
            for i in range(init_global_step_value, FLAGS.max_number_of_steps + 1):
                # train_step = i + FLAGS.local_train_steps
                train_step = i
                # run optional metadata or summary while running the train tensor
                if i > init_global_step_value:  # if i < FLAGS.max_number_of_steps:
                    # run metadata and train
                    if i % FLAGS.runmeta_every_n_steps == FLAGS.runmeta_every_n_steps - 1:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        loss_value = sess.run(train_tensor,
                                              options=run_options,
                                              run_metadata=run_metadata)
                        train_writer.add_run_metadata(run_metadata,
                                                      'step%d-train' % i)
                        # Create the Timeline object, and write it to a json file
                        fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format()
                        with open(os.path.join(train_dir,
                                               'timeline_' + str(i) + '.json'),
                                  'w') as f:
                            f.write(chrome_trace)
                    # record summary and train
                    elif i % FLAGS.summary_every_n_steps == 0:
                        train_summary, loss_value = sess.run(
                            [summary_op, train_tensor])
                        train_writer.add_summary(train_summary, train_step)
                    # train only
                    else:
                        start_time = time.time()
                        loss_value = sess.run(train_tensor)
                        train_only_cnt += 1
                        train_time += time.time() - start_time
                        duration_cnt += 1
                        duration += time.time() - start_time

                    if i % FLAGS.log_every_n_steps == 0 and duration_cnt > 0:
                        log_frequency = duration_cnt
                        examples_per_sec = log_frequency * FLAGS.batch_size / duration
                        sec_per_batch = float(duration / log_frequency)
                        summary = tf.Summary()
                        summary.value.add(tag='examples_per_sec',
                                          simple_value=examples_per_sec)
                        summary.value.add(tag='sec_per_batch',
                                          simple_value=sec_per_batch)
                        train_writer.add_summary(summary, train_step)
                        format_str = (
                            '%s: step %d, loss = %.3f (%.1f examples/sec; %.3f sec/batch)'
                        )
                        print(format_str % (datetime.now(), i, loss_value,
                                            examples_per_sec, sec_per_batch))
                        duration = 0
                        duration_cnt = 0
                        info = format_str % (datetime.now(), i, loss_value,
                                             examples_per_sec, sec_per_batch)
                        write_detailed_info(info)
                else:
                    # run only the total loss when i == 0
                    train_summary, loss_value = sess.run(
                        [summary_op, total_loss])  # loss_value = sess.run(total_loss)
                    train_writer.add_summary(train_summary, train_step)
                    format_str = ('%s: step %d, loss = %.3f')
                    print(format_str % (datetime.now(), i, loss_value))
                    info = format_str % (datetime.now(), i, loss_value)
                    write_detailed_info(info)

                # record the evaluation accuracy
                is_last_step = (i ==
FLAGS.max_number_of_steps) if i % FLAGS.evaluate_every_n_steps == 0 or is_last_step: test_accuracy, run_metadata = evaluate_accuracy( sess, coord, test_dataset.num_samples, test_images, test_labels, test_images, test_labels, correct_prediction, FLAGS.test_batch_size, run_meta=False) summary = tf.Summary() summary.value.add(tag='accuracy', simple_value=test_accuracy) train_writer.add_summary(summary, train_step) info = ('%s: step %d, test_accuracy = %.6f') % ( datetime.now(), train_step, test_accuracy) print(info) write_detailed_info(info) ########################### # Save model parameters . # ########################### save_path = saver.save( sess, os.path.join(train_dir, 'model.ckpt-' + str(i))) print("HG: Model saved in file: %s" % save_path) coord.request_stop() coord.join(threads) total_time = time.time() - tic # train_time = train_time*(FLAGS.max_number_of_steps - init_global_step_value)/train_only_cnt # info = "HG: training time(min): %.1f, total time(min): %.1f \n" %( train_time/60.0, total_time/60.0) train_speed = train_time * 1.0 / train_only_cnt train_time = train_speed * ( FLAGS.max_number_of_steps ) #- init_global_step_value) #/train_only_cnt info = "HG: training speed(sec/batch): %.6f\n" % (train_speed) info += "HG: training time(min): %.1f, total time(min): %.1f" % ( train_time / 60.0, total_time / 60.0) print(info) write_detailed_info(info)
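# The training loop above periodically traces a step with FULL_TRACE run
# options and dumps a Chrome timeline. A minimal standalone sketch of that
# tracing pattern, assuming TF1.x (the tiny graph and /tmp path are ours):
import tensorflow as tf
from tensorflow.python.client import timeline

x = tf.Variable(1.0)
y = x * 2.0
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    sess.run(y, options=run_options, run_metadata=run_metadata)
    # step_stats can be rendered in chrome://tracing
    trace = timeline.Timeline(run_metadata.step_stats).generate_chrome_trace_format()
    with open('/tmp/timeline_demo.json', 'w') as f:
        f.write(trace)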
def setup(self, master, is_chief, global_step, ckpt_dir, summary_ops,
          global_vars=None, local_vars=None, save_var_list=None,
          save_steps=None, job_name="worker", task_index=0, async_mode=True):
    """
    Arguments:
        master (obj): specify the target of the TF session.
        is_chief (bool): whether this process is a chief worker.
        global_step (obj): the global_step var in the bound graph.
        ckpt_dir (str): the checkpoint directory of the TF session.
        summary_ops (dict): a dict of TF summary operators.
        global_vars (list): global variables.
        local_vars (list): local variables.
        save_var_list (list): list of saveable variables.
        save_steps (int): save a checkpoint every save_steps steps.
        job_name (str): job_name in distributed mode.
        task_index (int): task_index in distributed mode.
        async_mode (bool): whether this is an asynchronous task.
    """
    if global_vars is not None:
        logger.info("in executor:")
        for v in global_vars:
            logger.info("{}".format(v))
        init_op = tf.variables_initializer(global_vars)
    else:
        # single-machine
        init_op = tf.global_variables_initializer()

    if local_vars is None:
        local_init_op = None
        ready_op = tf.report_uninitialized_variables(global_vars)
    else:
        pair_global_vars, pair_local_vars = self.get_variable_pairs(
            global_vars, local_vars)
        for gv, lv in zip(pair_global_vars, pair_local_vars):
            logger.info("{}, {}".format(gv, lv))
        local_init_op = tf.group(*[
            tf.assign(local_var, global_var)
            for local_var, global_var in zip(pair_local_vars, pair_global_vars)
        ])
        ready_op = tf.report_uninitialized_variables(global_vars +
                                                     list(pair_local_vars))
    ready_for_local_init_op = tf.report_uninitialized_variables(global_vars)

    # create tensorflow saver object
    self.saver = tf.train.Saver(
        var_list=global_vars if save_var_list is None else save_var_list,
        reshape=False,
        sharded=False,
        max_to_keep=10,
        keep_checkpoint_every_n_hours=10000.0,
        name=None,
        restore_sequentially=False,
        saver_def=None,
        builder=None,
        defer_build=False,
        allow_empty=True,
        write_version=tf.train.SaverDef.V2,
        pad_step_number=False,
        save_relative_paths=True)

    # handle restoring variables from a checkpoint
    def init_fn(scaffold, session):
        if ckpt_dir:
            ckpt_file = tf.train.latest_checkpoint(checkpoint_dir=ckpt_dir,
                                                   latest_filename=None)
            if ckpt_file is not None:
                logger.info('begin to restore model from {}'.format(ckpt_file))
                scaffold.saver.restore(sess=session, save_path=ckpt_file)

    self.scaffold = tf.train.Scaffold(
        init_op=init_op,
        init_feed_dict=None,
        init_fn=init_fn,
        ready_op=ready_op,
        ready_for_local_init_op=ready_for_local_init_op,
        local_init_op=local_init_op,
        summary_op=None,
        saver=self.saver,
        copy_from_scaffold=None)

    self.do_summary = False
    for flag, summary_op_list in summary_ops.items():
        if len(summary_op_list) > 0:
            summary_ops[flag] = tf.summary.merge(summary_op_list)
        else:
            summary_ops[flag] = None

    if ckpt_dir:
        actor_summary_dir = os.path.join(ckpt_dir, "actor_summary")
        summary_dir = os.path.join(ckpt_dir, "worker_summary")
        summary_hook = easy_rl.utils.hooks.UpdateSummarySaverHook(
            self,
            global_step,
            job_name,
            task_index,
            save_steps=(save_steps or 100),
            output_dir=actor_summary_dir if job_name == "actor" else summary_dir,
            summary_op=summary_ops)
        saver_hook = tf.train.CheckpointSaverHook(
            checkpoint_dir=ckpt_dir,
            save_steps=(save_steps or 300),
            scaffold=self.scaffold,
            checkpoint_basename='model.ckpt')
        chief_only_hooks = [saver_hook]
        hooks = [summary_hook]
    else:
        chief_only_hooks = []
        hooks = []

    # filter devices for asynchronous training
    if async_mode:
        if job_name == "learner":
            device_filters = [
                '/job:ps',
'/job:memory', '/job:{job_name}/task:{task_index}'.format( job_name=job_name, task_index=task_index) ] else: device_filters = None config_proto = tf.ConfigProto(device_filters=device_filters) else: config_proto = None self.session = tf.train.MonitoredTrainingSession( master=master, is_chief=is_chief, checkpoint_dir=None, scaffold=self.scaffold, chief_only_hooks=chief_only_hooks, hooks=hooks, save_summaries_steps=None, save_summaries_secs=None, config=config_proto)
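# The setup above wires ready_op, ready_for_local_init_op and local_init_op
# into a tf.train.Scaffold: globals are initialized first, and local init is
# gated on the globals being ready. A minimal single-machine sketch of that
# wiring, assuming TF1.x (the two variables are illustrative):
import tensorflow as tf

global_var = tf.Variable(0, name="global_v")
local_var = tf.Variable(0, name="local_v",
                        collections=[tf.GraphKeys.LOCAL_VARIABLES])

scaffold = tf.train.Scaffold(
    init_op=tf.variables_initializer([global_var]),
    # The session counts as "ready" only once both variables have values...
    ready_op=tf.report_uninitialized_variables([global_var, local_var]),
    # ...but local init may run as soon as the globals are initialized.
    ready_for_local_init_op=tf.report_uninitialized_variables([global_var]),
    local_init_op=local_var.initializer)

with tf.train.MonitoredTrainingSession(scaffold=scaffold) as sess:
    print(sess.run([global_var, local_var]))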
def __init__(self, blob, target_size=(224, 224), tf_config=None,
             is_mvnc=False):
    # earlier target_size=(320, 240)
    if is_mvnc:
        if mvnc is None:
            print("Please install MVNC libraries to use --is-mvnc option...")
            quit(-1)
        self.device = mvnc.Device(devices[0])
        self.device.openDevice()
        self.obj = self.device.AllocateGraph(blob)
        self.graph = tf.get_default_graph()
        self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)
        self.tensor_image = None
        # 57 channels = 19 heatmaps + 38 part-affinity fields,
        # matching the [:19] / [19:] slicing below
        self.tensor_output = tf.placeholder(
            tf.float16,
            shape=(1, target_size[0] // 8, target_size[1] // 8, 57),
            name='vectmap')
    else:
        self.device = None
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(blob)
        self.graph = tf.get_default_graph()
        tf.import_graph_def(graph_def, name='TfPoseEstimator')
        self.obj = self.persistent_sess = tf.Session(graph=self.graph,
                                                     config=tf_config)
        try:
            self.tensor_image = self.graph.get_tensor_by_name(
                'TfPoseEstimator/image:0')
        except KeyError:
            self.tensor_image = self.graph.get_tensor_by_name(
                'TfPoseEstimator/split:0')
        try:
            self.tensor_output = self.graph.get_tensor_by_name(
                'Openpose/concat_stage7:0')
        except KeyError:
            self.tensor_output = self.graph.get_tensor_by_name(
                'TfPoseEstimator/Openpose/concat_stage7:0')

    # for op in self.graph.get_operations():
    #     print(op.name)
    # for ts in [n.name for n in tf.get_default_graph().as_graph_def().node]:
    #     print(ts)
    self.tensor_heatMat = self.tensor_output[:, :, :, :19]
    self.tensor_pafMat = self.tensor_output[:, :, :, 19:]
    self.upsample_size = tf.placeholder(dtype=tf.int32, shape=(2,),
                                        name='upsample_size')
    self.tensor_heatMat_up = tf.image.resize_area(
        self.tensor_output[:, :, :, :19], self.upsample_size,
        align_corners=False, name='upsample_heatmat')
    self.tensor_pafMat_up = tf.image.resize_area(
        self.tensor_output[:, :, :, 19:], self.upsample_size,
        align_corners=False, name='upsample_pafmat')
    smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0)
    gaussian_heatMat = smoother.get_output()
    max_pooled_in_tensor = tf.nn.pool(gaussian_heatMat, window_shape=(3, 3),
                                      pooling_type='MAX', padding='SAME')
    # keep only local maxima of the smoothed heatmap as peak candidates
    self.tensor_peaks = tf.where(
        tf.equal(gaussian_heatMat, max_pooled_in_tensor), gaussian_heatMat,
        tf.zeros_like(gaussian_heatMat))
    self.heatMat = self.pafMat = None

    # warm-up: initialize only the still-uninitialized variables, then run
    # the post-processing graph a few times at decreasing upsample sizes
    if is_mvnc:
        self.persistent_sess.run(
            tf.variables_initializer([
                v for v in tf.global_variables()
                if v.name.split(':')[0] in [
                    x.decode('utf-8')
                    for x in self.persistent_sess.run(
                        tf.report_uninitialized_variables())
                ]
            ]))
        self.persistent_sess.run(
            [self.tensor_peaks, self.tensor_heatMat_up,
             self.tensor_pafMat_up],
            feed_dict={
                self.tensor_output: [
                    np.ndarray(shape=(target_size[1] // 8,
                                      target_size[0] // 8, 57),
                               dtype=np.float16)
                ],
                self.upsample_size:
                    [target_size[1], target_size[0]]
                    # [target_size[1] // 8, target_size[0] // 8]
            })
        self.persistent_sess.run(
            [self.tensor_peaks, self.tensor_heatMat_up,
             self.tensor_pafMat_up],
            feed_dict={
                self.tensor_output: [
                    np.ndarray(shape=(target_size[1] // 8,
                                      target_size[0] // 8, 57),
                               dtype=np.float16)
                ],
                self.upsample_size:
                    [target_size[1] // 2, target_size[0] // 2]
                    # [target_size[1] // 16, target_size[0] // 16]
            })
        self.persistent_sess.run(
            [self.tensor_peaks, self.tensor_heatMat_up,
             self.tensor_pafMat_up],
            feed_dict={
                self.tensor_output: [
                    np.ndarray(shape=(target_size[1] // 8,
                                      target_size[0] // 8, 57),
                               dtype=np.float16)
                ],
                self.upsample_size:
                    [target_size[1] // 4, target_size[0] // 4]
                    # [target_size[1] // 32, target_size[0] // 32]
            })
    else:
        self.persistent_sess.run(
            tf.variables_initializer([
                v for v in tf.global_variables()
                if v.name.split(':')[0] in [
                    x.decode('utf-8')
                    for x in self.persistent_sess.run(
                        tf.report_uninitialized_variables())
                ]
            ]))
        self.persistent_sess.run(
            [self.tensor_peaks, self.tensor_heatMat_up,
             self.tensor_pafMat_up],
            feed_dict={
                self.tensor_image: [
                    np.ndarray(shape=(target_size[1], target_size[0], 3),
                               dtype=np.float32)
                ],
                self.upsample_size: [target_size[1], target_size[0]]
            })
        self.persistent_sess.run(
            [self.tensor_peaks, self.tensor_heatMat_up,
             self.tensor_pafMat_up],
            feed_dict={
                self.tensor_image: [
                    np.ndarray(shape=(target_size[1], target_size[0], 3),
                               dtype=np.float32)
                ],
                self.upsample_size: [target_size[1] // 2,
                                     target_size[0] // 2]
            })
        self.persistent_sess.run(
            [self.tensor_peaks, self.tensor_heatMat_up,
             self.tensor_pafMat_up],
            feed_dict={
                self.tensor_image: [
                    np.ndarray(shape=(target_size[1], target_size[0], 3),
                               dtype=np.float32)
                ],
                self.upsample_size: [target_size[1] // 4,
                                     target_size[0] // 4]
            })
    self.is_mvnc = is_mvnc
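# The warm-up above initializes only the variables that the session reports
# as uninitialized. A minimal sketch of that idiom, assuming TF 1.x; the
# helper name is illustrative.
import tensorflow as tf

def initialize_uninitialized(sess):
    # tf.report_uninitialized_variables() returns the names (as bytes) of
    # variables that have no value in this session yet; variable names carry
    # a ':0' suffix that the reported names lack, hence the split.
    uninitialized = set(
        name.decode('utf-8')
        for name in sess.run(tf.report_uninitialized_variables()))
    to_init = [v for v in tf.global_variables()
               if v.name.split(':')[0] in uninitialized]
    if to_init:
        sess.run(tf.variables_initializer(to_init))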
def evaluate_on_train_set():
    "evaluate the network on the training set"
    # create the session in which all of the evaluation happens
    sess = tf.Session()
    # create a filename queue first
    filename_queue, examples_in_database = index_the_database_into_queue(
        FLAGS.database_path, shuffle=True)
    # create an epoch counter
    # the extra variable-initialization step is needed to recover the name
    # of the anonymous "count_up_to" variable from the graph
    batch_counter = tf.Variable(0)
    sess.run(tf.global_variables_initializer())
    batch_counter_increment = tf.assign(
        batch_counter,
        tf.Variable(0).count_up_to(
            np.round((examples_in_database * FLAGS.num_epochs) /
                     FLAGS.batch_size)))
    # the only uninitialized variable at this point is the counter above
    batch_counter_var_name = sess.run(tf.report_uninitialized_variables())
    epoch_counter = tf.div(batch_counter * FLAGS.batch_size,
                           examples_in_database)
    # create a custom shuffle queue
    ligand_files, current_epoch, label_batch, sparse_image_batch = \
        image_and_label_queue(
            batch_size=FLAGS.batch_size, pixel_size=FLAGS.pixel_size,
            side_pixels=FLAGS.side_pixels, num_threads=FLAGS.num_threads,
            filename_queue=filename_queue, epoch_counter=epoch_counter,
            train=False)
    image_batch = tf.sparse_tensor_to_dense(sparse_image_batch,
                                            validate_indices=False)
    keep_prob = tf.placeholder(tf.float32)
    y_conv = wide_conv_net(image_batch, keep_prob, FLAGS.batch_size)
    # compute softmax over raw predictions
    predictions = tf.nn.softmax(y_conv)[:, 1]
    # restore variables from the saved session
    saver = tf.train.Saver()
    saver.restore(sess, FLAGS.saved_session)
    # re-initialize the batch counter, which is not part of the checkpoint
    sess.run(
        tf.contrib.framework.get_variables_by_name(
            batch_counter_var_name[0])[0].initializer)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # create instances of classes to store predictions
    all_predictions = store_predictions()
    all_predictions_av3 = store_predictions_av3()
    # add_batch(self, ligand_file_path, batch_predictions, batch_labels)
    print "starting evaluation..."
    try:
        while not coord.should_stop():
            start = time.time()
            batch_num = sess.run([batch_counter_increment])
            my_ligand_files, my_ligand_frames, my_predictions, my_labels = \
                sess.run(
                    [ligand_files, current_epoch, predictions, label_batch],
                    feed_dict={keep_prob: 1})
            print "current_epoch:", my_ligand_frames[0], \
                "batch_num:", batch_num,
            print "\tprediction averages:", np.mean(my_predictions),
            print "\texamples per second:", "%.2f" % (
                FLAGS.batch_size / (time.time() - start))
            all_predictions.add_batch(my_ligand_files, my_ligand_frames,
                                      my_predictions)
            all_predictions_av3.add_batch(my_ligand_files, my_ligand_frames,
                                          my_predictions, my_labels)
            print "my labels:", my_labels
    except tf.errors.OutOfRangeError:
        print "exiting the loop"
    all_predictions.save()
    all_predictions_av3.save_predictions(FLAGS.predictions_file_path)
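# The counter trick above works because, immediately after
# tf.global_variables_initializer() runs, the only uninitialized variable is
# the anonymous counter created afterwards, so its graph name can be read
# back from the session. A minimal standalone sketch, assuming TF 1.x;
# variable names are illustrative.
import tensorflow as tf

sess = tf.Session()
a = tf.Variable(1, name='a')
sess.run(tf.global_variables_initializer())
b = tf.Variable(2)  # created after initialization, so still uninitialized
uninit_names = sess.run(tf.report_uninitialized_variables())
# uninit_names now holds only b's op name, e.g. [b'Variable']
target = [v for v in tf.global_variables()
          if v.name.split(':')[0] == uninit_names[0].decode('utf-8')][0]
sess.run(target.initializer)  # re-initialize just that one variable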
selected_variables = [
    v for v in tf.global_variables() if not v.name.startswith("local")
]
selected_variables_init_op = tf.variables_initializer(selected_variables)
saver = tf.train.Saver(var_list=selected_variables)
summary_writer = tf.summary.FileWriter(LOG_DIR + "__%d" % TASK_INDEX)
supervisor = tf.train.Supervisor(
    is_chief=(JOB_NAME == 'worker' and TASK_INDEX == 0),
    logdir=LOG_DIR,
    saver=saver,
    init_op=selected_variables_init_op,
    summary_writer=summary_writer,
    summary_op=None,
    ready_op=tf.report_uninitialized_variables(selected_variables),
    global_step=trainer.global_step,
    # Number of seconds between the creation of model checkpoints.
    # Defaults to 600 seconds. Pass 0 to disable checkpoints.
    save_model_secs=30)
with supervisor.managed_session(
        master=server.target,
        config=tf.ConfigProto(device_filters=[
            "/job:ps", f"/job:worker/task:{TASK_INDEX}/cpu:0"
        ])) as sess, sess.as_default():
    if PRETRAIN_MODEL_PATH:
        saver.restore(
            sess=sess,
            save_path=tf.train.latest_checkpoint(PRETRAIN_MODEL_PATH))
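# A minimal standalone sketch of gating Supervisor readiness on a subset of
# variables, assuming TF 1.x; variable names here are illustrative. As in
# the snippet above, variables whose names start with "local" are excluded
# from initialization, saving, and the readiness check.
import tensorflow as tf

global_step = tf.Variable(0, name='global_step', trainable=False)
scratch = tf.Variable(0, name='local_scratch')  # deliberately not tracked

selected = [v for v in tf.global_variables()
            if not v.name.startswith('local')]
sv = tf.train.Supervisor(
    is_chief=True,
    logdir=None,  # no checkpointing in this sketch
    init_op=tf.variables_initializer(selected),
    ready_op=tf.report_uninitialized_variables(selected))
with sv.managed_session('') as sess:
    # the session is considered ready even though 'local_scratch' was
    # never initialized, because ready_op only inspects `selected`
    print(sess.run(global_step))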