def testWaitForSessionLocalInit(self):
    """Checks that wait_for_session runs local_init_op once `v` is ready.

    `v` is initialized through a separate session connected to the same
    local server. The SessionManager is then expected to initialize `w`
    (a LOCAL_VARIABLES variable whose initial value is `v`) by running
    `local_init_op`, and to return a session in which both are initialized.
    """
    server = tf.train.Server.create_local_server()
    with tf.Graph().as_default() as graph:
        v = tf.Variable(1, name="v")
        w = tf.Variable(
            v,
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="w")
        sm = tf.train.SessionManager(
            graph=graph,
            ready_op=tf.report_uninitialized_variables(),
            ready_for_local_init_op=tf.report_uninitialized_variables(
                tf.all_variables()),
            local_init_op=w.initializer)

        # Initialize v but not w, through a separate session on the server.
        s = tf.Session(server.target, graph=graph)
        s.run(v.initializer)

        sess = sm.wait_for_session(server.target, max_wait_secs=3)
        # Both variables must be reported as initialized in the new session.
        for tensor_name in ("v:0", "w:0"):
            self.assertEqual(
                True,
                tf.is_variable_initialized(
                    sess.graph.get_tensor_by_name(tensor_name)).eval(
                        session=sess))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(1, sess.run(v))
        self.assertEqual(1, sess.run(w))
 def testPrepareSessionWithReadyForLocalInitOp(self):
   """Checks that prepare_session runs init_op and then local_init_op.

   `v` is initialized by the `init_op` passed to prepare_session; `w`
   (a LOCAL_VARIABLES variable whose initial value is `v`) is initialized
   by the manager's `local_init_op`.
   """
   with tf.Graph().as_default():
     v = tf.Variable(1, name="v")
     w = tf.Variable(
         v,
         trainable=False,
         collections=[tf.GraphKeys.LOCAL_VARIABLES],
         name="w")
     # Neither variable is initialized before the SessionManager is used.
     with self.test_session():
       self.assertEqual(False, tf.is_variable_initialized(v).eval())
       self.assertEqual(False, tf.is_variable_initialized(w).eval())
     sm2 = tf.train.SessionManager(
         ready_op=tf.report_uninitialized_variables(),
         ready_for_local_init_op=tf.report_uninitialized_variables(
             tf.all_variables()),
         local_init_op=w.initializer)
     sess = sm2.prepare_session("", init_op=v.initializer)
     for tensor_name in ("v:0", "w:0"):
       self.assertEqual(
           True,
           tf.is_variable_initialized(
               sess.graph.get_tensor_by_name(tensor_name)).eval(
                   session=sess))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(1, sess.run(v))
     self.assertEqual(1, sess.run(w))
  def testRecoverSession(self):
    """Checks that recover_session restores `v` from a saved checkpoint.

    The first graph finds no checkpoint (initialized is False), initializes
    `v` to 1 by hand and saves it. The second graph declares `v` with
    initial value 2 and must recover the saved value 1 instead.
    """
    # Create a checkpoint.
    checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
      pass  # Ignore: start from a clean directory if one was there.
    gfile.MakeDirs(checkpoint_dir)

    with tf.Graph().as_default():
      v = tf.Variable(1, name="v")
      sm = tf.train.SessionManager(ready_op=tf.report_uninitialized_variables())
      saver = tf.train.Saver({"v": v})
      sess, initialized = sm.recover_session("", saver=saver,
                                             checkpoint_dir=checkpoint_dir)
      # The directory is empty, so nothing can be recovered yet.
      self.assertFalse(initialized)
      sess.run(v.initializer)
      self.assertEqual(1, sess.run(v))
      saver.save(sess, os.path.join(checkpoint_dir,
                                    "recover_session_checkpoint"))
    # Create a new Graph and SessionManager and recover.
    with tf.Graph().as_default():
      v = tf.Variable(2, name="v")
      with self.test_session():
        self.assertEqual(False, tf.is_variable_initialized(v).eval())
      sm2 = tf.train.SessionManager(
          ready_op=tf.report_uninitialized_variables())
      saver = tf.train.Saver({"v": v})
      sess, initialized = sm2.recover_session("", saver=saver,
                                              checkpoint_dir=checkpoint_dir)
      self.assertTrue(initialized)
      self.assertEqual(
          True, tf.is_variable_initialized(
              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
      # The checkpointed value (1) wins over the declared initial value (2).
      self.assertEqual(1, sess.run(v))
Esempio n. 4
0
    def initializeOrRestore(self):
        """Restore variables from the VGG-16 and 'mt' checkpoints, initialize the rest.

        Sets `self.ckptDir`, `self.ckptPrefix` and `self.saver`. Global
        variables found in the 'vgg_16' checkpoint are restored first; of the
        remaining ones, those found in the latest 'mt' checkpoint are restored
        next. Whatever is still unrestored is initialized together with the
        local variables.
        """
        self.ckptDir = os.path.join(self.checkpoint_dir, self.dataset.name)
        self.ckptPrefix = os.path.join(self.ckptDir, self.name, self.name)
        vgg_ckpt_file = os.path.join(self.ckptDir, 'vgg_16', 'vgg_16.ckpt')
        mt_ckpt_file = layers.latest_checkpoint(os.path.join(self.ckptDir, 'mt'))
        globalVars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

        # Variables that still need an initializer; narrowed by each restore.
        varsNotInCkpt = globalVars

        if vgg_ckpt_file is not None and tf.train.checkpoint_exists(vgg_ckpt_file):
            varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(vgg_ckpt_file, globalVars)
            if len(varsInCkpt) != 0:
                restorationSaver = tf.train.Saver(varsInCkpt)
                # NOTE(review): the result of this query is discarded.
                self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
                restorationSaver.restore(self.sess, vgg_ckpt_file)

        if mt_ckpt_file is not None and tf.train.checkpoint_exists(mt_ckpt_file):
            varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(mt_ckpt_file, varsNotInCkpt)
            varsInCkpt, varsNotInCkpt = layers.replaceVarInListsByName(varsInCkpt, varsNotInCkpt, 'fc6')
            if len(varsInCkpt) != 0:
                restorationSaver = tf.train.Saver(varsInCkpt)
                # NOTE(review): the result of this query is discarded.
                self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
                restorationSaver.restore(self.sess, mt_ckpt_file)
        # No `else` here on purpose: resetting varsNotInCkpt to globalVars
        # when the 'mt' checkpoint is missing would re-initialize (and thus
        # overwrite) the variables just restored from the VGG checkpoint.

        self.saver = tf.train.Saver()
        self.sess.run(tf.group(tf.variables_initializer(varsNotInCkpt), tf.local_variables_initializer()))
  def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    """Checks recover_session when ready_for_local_init_op never reports ready.

    We use ready_for_local_init_op=tf.report_uninitialized_variables(),
    which causes recover_session to not run local_init_op, and to return
    initialized=False. `v` is still restored from the checkpoint; `w` stays
    uninitialized.
    """
    # Create a checkpoint.
    checkpoint_dir = os.path.join(
        self.get_temp_dir(),
        "recover_session_ready_for_local_init_fails_to_ready_local")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
      pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with tf.Graph().as_default():
      v = tf.Variable(1, name="v")
      sm = tf.train.SessionManager(ready_op=tf.report_uninitialized_variables())
      saver = tf.train.Saver({"v": v})
      sess, initialized = sm.recover_session(
          "", saver=saver, checkpoint_dir=checkpoint_dir)
      # Nothing has been saved yet, so recovery cannot succeed.
      self.assertFalse(initialized)
      sess.run(v.initializer)
      self.assertEqual(1, sess.run(v))
      saver.save(sess, os.path.join(checkpoint_dir,
                                    "recover_session_checkpoint"))
    # Create a new Graph and SessionManager and recover.
    with tf.Graph().as_default():
      v = tf.Variable(2, name="v")
      w = tf.Variable(
          v,
          trainable=False,
          collections=[tf.GraphKeys.LOCAL_VARIABLES],
          name="w")
      with self.test_session():
        self.assertEqual(False, tf.is_variable_initialized(v).eval())
        self.assertEqual(False, tf.is_variable_initialized(w).eval())
      sm2 = tf.train.SessionManager(
          ready_op=tf.report_uninitialized_variables(),
          ready_for_local_init_op=tf.report_uninitialized_variables(),
          local_init_op=w.initializer)
      saver = tf.train.Saver({"v": v})
      sess, initialized = sm2.recover_session(
          "", saver=saver, checkpoint_dir=checkpoint_dir)
      self.assertFalse(initialized)
      # v comes back from the checkpoint; w was never initialized.
      self.assertEqual(
          True,
          tf.is_variable_initialized(sess.graph.get_tensor_by_name("v:0")).eval(
              session=sess))
      self.assertEqual(
          False,
          tf.is_variable_initialized(sess.graph.get_tensor_by_name("w:0")).eval(
              session=sess))
      # assertEqual replaces the deprecated assertEquals alias.
      self.assertEqual(1, sess.run(v))
    def testPrepareSessionFails(self):
        """Checks prepare_session fails with no checkpoint and no init op, then succeeds.

        A checkpoint is written from a first graph and its directory is moved
        away. In a second graph, prepare_session without an init_op must raise
        RuntimeError while the directory is empty, and must succeed once the
        checkpoint directory has been moved back.
        """
        checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
        checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
        # Clean each directory independently so that a failure on the first
        # one does not skip removal of the second.
        for stale_dir in (checkpoint_dir, checkpoint_dir2):
            try:
                gfile.DeleteRecursively(stale_dir)
            except OSError:
                pass  # Ignore
        gfile.MakeDirs(checkpoint_dir)

        with tf.Graph().as_default():
            v = tf.Variable([1.0, 2.0, 3.0], name="v")
            sm = tf.train.SessionManager(ready_op=tf.report_uninitialized_variables())
            saver = tf.train.Saver({"v": v})
            sess = sm.prepare_session(
                "", init_op=tf.initialize_all_variables(), saver=saver, checkpoint_dir=checkpoint_dir
            )
            self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
            checkpoint_filename = os.path.join(checkpoint_dir, "prepare_session_checkpoint")
            saver.save(sess, checkpoint_filename)
        # Create a new Graph and SessionManager and recover.
        with tf.Graph().as_default():
            # Renames the checkpoint directory.
            os.rename(checkpoint_dir, checkpoint_dir2)
            gfile.MakeDirs(checkpoint_dir)
            v = tf.Variable([6.0, 7.0, 8.0], name="v")
            with self.test_session():
                self.assertEqual(False, tf.is_variable_initialized(v).eval())
            # Keep and use the manager built for this graph; previously it was
            # constructed and discarded, and the first graph's `sm` was reused.
            sm2 = tf.train.SessionManager(ready_op=tf.report_uninitialized_variables())
            saver = tf.train.Saver({"v": v})
            # This should fail as there's no checkpoint within 2 seconds.
            with self.assertRaisesRegexp(RuntimeError, "no init_op or init_fn was given"):
                sess = sm2.prepare_session(
                    "",
                    init_op=None,
                    saver=saver,
                    checkpoint_dir=checkpoint_dir,
                    wait_for_checkpoint=True,
                    max_wait_secs=2,
                )
            # Rename the checkpoint directory back.
            gfile.DeleteRecursively(checkpoint_dir)
            os.rename(checkpoint_dir2, checkpoint_dir)
            # This should succeed as there's checkpoint.
            sess = sm2.prepare_session(
                "", init_op=None, saver=saver, checkpoint_dir=checkpoint_dir, wait_for_checkpoint=True, max_wait_secs=2
            )
            self.assertEqual(True, tf.is_variable_initialized(sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
 def testPrepareSessionSucceedsWithInitFeedDict(self):
     """Verifies that prepare_session forwards init_feed_dict to the init op.

     `v` takes its initial value from a placeholder, so initialization can
     only succeed when the feed dict is supplied while running init_op.
     """
     with tf.Graph().as_default():
         initial_value = tf.placeholder(tf.float32, shape=(3,))
         v = tf.Variable(initial_value, name="v")
         manager = tf.train.SessionManager(
             ready_op=tf.report_uninitialized_variables())
         prepared = manager.prepare_session(
             "",
             init_op=tf.initialize_all_variables(),
             init_feed_dict={initial_value: [1.0, 2.0, 3.0]})
         fetched = prepared.run(v)
         self.assertAllClose([1.0, 2.0, 3.0], fetched)
Esempio n. 8
0
 def guarantee_initialized_variables(self, session, list_of_variables=None):
     """Initialize those of the given variables that are not yet initialized.

     Args:
         session: session used to query and to initialize the variables.
         list_of_variables: variables to check; defaults to
             `tf.all_variables()` when None.

     Returns:
         The list of variables that were uninitialized and have now been
         initialized by this call.
     """
     if list_of_variables is None:
         list_of_variables = tf.all_variables()
     list_of_variables = list(list_of_variables)
     # Look the reported names up among the existing variables. Calling
     # tf.get_variable(name) here would try to create a new variable (or
     # raise) instead of returning the existing one.
     variables_by_name = {var.op.name: var for var in list_of_variables}
     reported_names = session.run(
         tf.report_uninitialized_variables(list_of_variables))
     uninitialized_variables = []
     for name in reported_names:
         # Names are fetched as byte strings; the dict keys are text.
         if isinstance(name, bytes):
             name = name.decode('utf-8')
         uninitialized_variables.append(variables_by_name[name])
     session.run(tf.initialize_variables(uninitialized_variables))
     return uninitialized_variables
  def test_restore_fn_classification(self):
    """Checks restore_fn loads feature-extractor weights from a classification checkpoint.

    A small mock classification graph is saved, then restored into the
    detection model's graph through `self._model.restore_fn`. Afterwards no
    variable whose name contains 'FeatureExtractor' may remain uninitialized.
    """
    # Define mock tensorflow classification graph and save variables.
    test_graph_classification = tf.Graph()
    with test_graph_classification.as_default():
      image = tf.placeholder(dtype=tf.float32, shape=[1, 20, 20, 3])
      with tf.variable_scope('mock_model'):
        net = slim.conv2d(image, num_outputs=32, kernel_size=1, scope='layer1')
        slim.conv2d(net, num_outputs=3, kernel_size=1, scope='layer2')

      init_op = tf.global_variables_initializer()
      saver = tf.train.Saver()
      save_path = self.get_temp_dir()
      with self.test_session() as sess:
        sess.run(init_op)
        saved_model_path = saver.save(sess, save_path)

    # Create tensorflow detection graph and load variables from
    # classification checkpoint.
    test_graph_detection = tf.Graph()
    with test_graph_detection.as_default():
      inputs_shape = [2, 2, 2, 3]
      inputs = tf.to_float(tf.random_uniform(
          inputs_shape, minval=0, maxval=255, dtype=tf.int32))
      preprocessed_inputs = self._model.preprocess(inputs)
      prediction_dict = self._model.predict(preprocessed_inputs)
      self._model.postprocess(prediction_dict)
      restore_fn = self._model.restore_fn(saved_model_path,
                                          from_detection_checkpoint=False)
      with self.test_session() as sess:
        restore_fn(sess)
        # The fetched names are byte strings (they have no `.name`), so
        # compare against a bytes literal directly.
        for var in sess.run(tf.report_uninitialized_variables()):
          self.assertNotIn(b'FeatureExtractor', var)
 def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
   """Checks recover_session runs local_init_op even without a checkpoint.

   This test checks for backwards compatibility. In particular, we continue
   to ensure that recover_session will execute local_init_op exactly once,
   regardless of whether the session was successfully recovered.
   """
   with tf.Graph().as_default():
     w = tf.Variable(
         1,
         trainable=False,
         collections=[tf.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.test_session():
       self.assertEqual(False, tf.is_variable_initialized(w).eval())
     sm2 = tf.train.SessionManager(
         ready_op=tf.report_uninitialized_variables(),
         ready_for_local_init_op=None,
         local_init_op=w.initializer)
     # Try to recover session from None
     sess, initialized = sm2.recover_session(
         "", saver=None, checkpoint_dir=None)
     # Nothing was recovered, but recover_session still ran local_init_op,
     # so w is initialized.
     self.assertFalse(initialized)
     self.assertEqual(
         True,
         tf.is_variable_initialized(sess.graph.get_tensor_by_name("w:0")).eval(
             session=sess))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(1, sess.run(w))
Esempio n. 11
0
def _find_initializable_tensors(intializables, session):
    for_reports = []
    status_tensors = []
    boolean_tensors = []

    for v in intializables:
        if isinstance(v, (tuple, list)):
            status_tensors.append(v[0])
            boolean_tensors.append(v[1])
        # TODO(@awav): Tensorflow Iterator must have to be skipped at
        # auto-intialization unless TensorFlow issue #14633 is resolved.
        elif isinstance(v, tf.data.Iterator):
            continue
        else:
            for_reports.append(v)

    if for_reports:
        uninitialized = tf.report_uninitialized_variables(var_list=for_reports)
        def uninitialized_names():
            for uv in session.run(uninitialized):
                yield uv.decode('utf-8')

        names = set(uninitialized_names())
        for v in for_reports:
            if v.name.split(':')[0] in names:
                yield v

    if boolean_tensors:
        stats = session.run(boolean_tensors)
        length = len(stats)
        for i in range(length):
            if not stats[i]:
                yield status_tensors[i]
Esempio n. 12
0
def parameter_server():
    """Run the parameter-server task of the demo cluster.

    Declares `var` on "/job:ps/task:0", starts the "ps" server from the
    module-level `cluster`, initializes all global variables, prints the
    value of `var` five times at one-second intervals, and then blocks in
    `server.join()`.
    """
    with tf.device("/job:ps/task:0"):
        var = tf.Variable(0.0, name='var')

    server = tf.train.Server(cluster, job_name="ps", task_index=0)
    sess = tf.Session(target=server.target)
    # Single-argument print() calls behave the same on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    print("*" * 40)
    print(server.target)
    print("*" * 40)

    for _ in range(5):
        print("Parameter server: sleeping...")
        sleep(1)

    print("Parameter server: waiting for cluster connection...")
    # The fetched value is not used; per the surrounding messages this run
    # serves only as a wait for the cluster to become reachable.
    sess.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    sess.run(tf.global_variables_initializer())
    print("Parameter server: variables initialized")

    for _ in range(5):
        val = sess.run(var)
        print("Parameter server: var has value %.1f" % val)
        sleep(1.0)

    print("Parameter server: blocking...")

    server.join()
 def testPrepareSessionSucceedsWithInitFn(self):
   """Verifies that prepare_session can initialize the model through init_fn."""
   with tf.Graph().as_default():
     v = tf.Variable([125], name="v")

     def run_initializer(session):
       # Called by prepare_session with the session it created.
       return session.run(v.initializer)

     manager = tf.train.SessionManager(
         ready_op=tf.report_uninitialized_variables())
     prepared = manager.prepare_session("", init_fn=run_initializer)
     self.assertAllClose([125], prepared.run(v))
    def testWaitForSessionReturnsNoneAfterTimeout(self):
        """Verifies wait_for_session gives up when the model never becomes ready.

        Nothing initializes `v`, so the wait is expected to end with
        DeadlineExceededError (despite the "ReturnsNone" in the test name).
        """
        with tf.Graph().as_default():
            tf.Variable(1, name="v")
            readiness_report = tf.report_uninitialized_variables()
            manager = tf.train.SessionManager(
                ready_op=readiness_report, recovery_wait_secs=1)

            # max_wait_secs is larger than recovery_wait_secs so that a few
            # attempts fit in before the deadline.
            with self.assertRaises(errors.DeadlineExceededError):
                manager.wait_for_session(master="", max_wait_secs=3)
Esempio n. 15
0
 def testAssertVariablesInitialized(self):
   """Verifies the uninitialized-variables report before and after init.

   Both `v` and `w` are reported before initialization; the report is empty
   after all variables have been initialized.
   """
   with tf.Graph().as_default():
     with self.test_session() as sess:
       tf.Variable([1, 2], name="v")
       tf.Variable([3, 4], name="w")
       report_op = tf.report_uninitialized_variables()
       expected_before = np.array([b"v", b"w"])
       self.assertAllEqual(expected_before, sess.run(report_op))
       sess.run(tf.initialize_all_variables())
       remaining = sess.run(report_op)
       self.assertEqual(0, remaining.size)
  def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    """Verifies wait_for_session times out when local init can never run.

    ready_for_local_init_op is the unrestricted uninitialized-variables
    report, so `w` is never initialized and the wait must end with
    DeadlineExceededError.
    """
    with tf.Graph().as_default() as graph:
      v = tf.Variable(1, name="v")
      local_kwargs = dict(
          trainable=False,
          collections=[tf.GraphKeys.LOCAL_VARIABLES],
          name="w")
      w = tf.Variable(v, **local_kwargs)
      ready_report = tf.report_uninitialized_variables()
      local_ready_report = tf.report_uninitialized_variables()
      manager = tf.train.SessionManager(
          graph=graph,
          ready_op=ready_report,
          ready_for_local_init_op=local_ready_report,
          local_init_op=w.initializer)

      # Time-out because w fails to be initialized,
      # because of overly restrictive ready_for_local_init_op
      with self.assertRaises(tf.errors.DeadlineExceededError):
        manager.wait_for_session("", max_wait_secs=3)
 def testInitWithNoneLocalInitOpError(self):
   """Verifies the SessionManager constructor rejects an unpaired ready_for_local_init_op.

   Passing a non-None ready_for_local_init_op together with
   local_init_op=None must raise ValueError.
   """
   expected_message = ("If you pass a ready_for_local_init_op "
                       "you must also pass a local_init_op ")
   with self.assertRaisesRegexp(ValueError, expected_message):
     local_ready_report = tf.report_uninitialized_variables(
         tf.all_variables())
     tf.train.SessionManager(
         ready_for_local_init_op=local_ready_report,
         local_init_op=None)
Esempio n. 18
0
 def testVariableList(self):
   """Verifies the uninitialized report shrinks as variables are initialized.

   Initially both `v` and `w` are reported; after initializing `w` only `v`
   remains; after initializing `v` the report is empty.
   """
   with tf.Graph().as_default():
     with self.test_session() as sess:
       v = tf.Variable([1, 2], name="v")
       w = tf.Variable([3, 4], name="w")
       report_op = tf.report_uninitialized_variables()

       self.assertAllEqual(np.array([b"v", b"w"]), sess.run(report_op))

       sess.run(w.initializer)
       self.assertAllEqual(np.array([b"v"]), sess.run(report_op))

       sess.run(v.initializer)
       leftover = sess.run(report_op)
       self.assertEqual(0, leftover.size)
 def testPrepareSessionWithReadyNotReadyForLocal(self):
   """Verifies prepare_session fails when nothing makes the model ready for local init.

   No init_op is supplied, so `v` stays uninitialized and prepare_session
   is expected to raise RuntimeError.
   """
   with tf.Graph().as_default():
     v = tf.Variable(1, name="v")
     w = tf.Variable(
         v,
         name="w",
         collections=[tf.GraphKeys.LOCAL_VARIABLES],
         trainable=False)
     # Both variables start out uninitialized.
     with self.test_session():
       for variable in (v, w):
         self.assertEqual(False, tf.is_variable_initialized(variable).eval())
     ready_report = tf.report_uninitialized_variables()
     local_ready_report = tf.report_uninitialized_variables(
         tf.all_variables())
     manager = tf.train.SessionManager(
         ready_op=ready_report,
         ready_for_local_init_op=local_ready_report,
         local_init_op=w.initializer)
     expected_message = "Init operations did not make model ready for local_init"
     with self.assertRaisesRegexp(RuntimeError, expected_message):
       manager.prepare_session("", init_op=None)
 def test_restore_fn_detection(self):
   """Checks restore_fn loads feature-extractor weights from a detection checkpoint.

   All global variables are initialized and saved, then restored through
   `self._model.restore_fn`. Afterwards no variable whose name contains
   'FeatureExtractor' may remain uninitialized.
   """
   init_op = tf.global_variables_initializer()
   saver = tf_saver.Saver()
   save_path = self.get_temp_dir()
   with self.test_session() as sess:
     sess.run(init_op)
     saved_model_path = saver.save(sess, save_path)
     restore_fn = self._model.restore_fn(saved_model_path,
                                         from_detection_checkpoint=True)
     restore_fn(sess)
     # The fetched names are byte strings (they have no `.name`), so
     # compare against a bytes literal directly.
     for var in sess.run(tf.report_uninitialized_variables()):
       self.assertNotIn(b'FeatureExtractor', var)
Esempio n. 21
0
def get_distributed_session_creator(server):
    """
    Build a session creator for one task of a distributed job.

    The task counts as chief when its job name is 'worker' and its task
    index is 0. The chief prepares the session (running the global init op);
    every other task waits for the session to become ready.

    Args:
       server (tf.train.Server):

    Returns:
        tf.train.SessionCreator
    """

    task_def = server.server_def
    is_chief = task_def.job_name == 'worker' and task_def.task_index == 0

    global_init = tf.global_variables_initializer()
    local_init = tf.local_variables_initializer()
    all_ready = tf.report_uninitialized_variables()
    globals_ready = tf.report_uninitialized_variables(tf.global_variables())
    manager = tf.train.SessionManager(
        graph=tf.get_default_graph(),
        ready_op=all_ready,
        ready_for_local_init_op=globals_ready,
        local_init_op=local_init)

    # To debug a wrong variable collection, print (name, device) pairs of
    # tf.global_variables() and tf.local_variables() at this point.

    class _Creator(tf.train.SessionCreator):
        def create_session(self):
            if not is_chief:
                # INFO verbosity prints the message about uninitialized vars
                # while waiting; verbosity is set to WARN afterwards.
                tf.logging.set_verbosity(tf.logging.INFO)
                session = manager.wait_for_session(master=server.target)
                tf.logging.set_verbosity(tf.logging.WARN)
                return session
            return manager.prepare_session(
                master=server.target, init_op=global_init)

    return _Creator()
 def test_restore_map_for_detection_ckpt(self):
   """Checks restore_map yields a dict usable to restore a detection checkpoint.

   All global variables are initialized and saved, then restored with a
   Saver built from `self._model.restore_map(...)`. Afterwards no variable
   whose name contains 'FeatureExtractor' may remain uninitialized.
   """
   init_op = tf.global_variables_initializer()
   saver = tf_saver.Saver()
   save_path = self.get_temp_dir()
   with self.test_session() as sess:
     sess.run(init_op)
     saved_model_path = saver.save(sess, save_path)
     var_map = self._model.restore_map(from_detection_checkpoint=True)
     self.assertIsInstance(var_map, dict)
     saver = tf.train.Saver(var_map)
     saver.restore(sess, saved_model_path)
     # The fetched names are byte strings (they have no `.name`), so
     # compare against a bytes literal directly.
     for var in sess.run(tf.report_uninitialized_variables()):
       self.assertNotIn(b'FeatureExtractor', var)
Esempio n. 23
0
 def test_evaluate_ready_for_local_init(self):
   """Runs graph_actions.evaluate with a READY_FOR_LOCAL_INIT_OP registered.

   The graph holds a global step, a variable `v`, and a LOCAL_VARIABLES
   variable initialized from `v + 1`. The test only requires that
   evaluation of `v` for one step completes; the result is not inspected.
   """
   with tf.Graph().as_default() as g:
     with self.test_session(g):
       tf.contrib.framework.create_global_step()
       v = variables.Variable(1.0)
       # Created for its effect on the graph; the handle is not used again.
       w = variables.Variable(
           v + 1,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
       globals_report = tf.report_uninitialized_variables(
           tf.global_variables())
       ops.add_to_collection(
           ops.GraphKeys.READY_FOR_LOCAL_INIT_OP, globals_report)
       _ = learn.graph_actions.evaluate(
           g,
           output_dir=self._output_dir,
           checkpoint_path=None,
           eval_dict={'a': v},
           max_steps=1)
Esempio n. 24
0
def worker(worker_n):
    """Run worker task `worker_n` of the demo cluster.

    Declares `var` on "/job:ps/task:0", starts the worker's server from the
    module-level `cluster`, waits until no variable is reported as
    uninitialized, increments `var` five times at one-second intervals, and
    then blocks in `server.join()`.

    Args:
        worker_n: task index of this worker within the "worker" job.
    """
    with tf.device("/job:ps/task:0"):
        var = tf.Variable(0.0, name='var')

    server = tf.train.Server(cluster, job_name="worker", task_index=worker_n)
    sess = tf.Session(target=server.target)

    # Build the ops once instead of adding new nodes to the graph on every
    # loop iteration.
    uninitialized = tf.report_uninitialized_variables()
    increment = var.assign_add(1.0)

    print("Worker %d: waiting for cluster connection..." % worker_n)
    sess.run(uninitialized)
    print("Worker %d: cluster ready!" % worker_n)

    # The fetched value is an array of names; test its size explicitly
    # rather than relying on the array's truth value, which is ambiguous
    # for more than one element.
    while sess.run(uninitialized).size > 0:
        print("Worker %d: waiting for variable initialization..." % worker_n)
        sleep(1.0)

    print("Worker %d: variables initialized" % worker_n)

    for _ in range(5):
        print("Worker %d: incrementing var" % worker_n)
        sess.run(increment)
        sleep(1.0)

    print("Worker %d: blocking..." % worker_n)
    server.join()
Esempio n. 25
0
def guarantee_initialized_variables(session, variables=None):
    """Guarantee that all the specified variables are initialized.

    If a variable is already initialized, leave it alone. Otherwise, initialize it.

    If no variables are specified, checks all global and local variables in
    the default graph.

    Args:
        session: session used to query and to initialize the variables.
        variables (list[tf.Variable]): variables to check, or None.

    Returns:
        list[tf.Variable]: the variables that were uninitialized and have
        now been initialized by this call.
    """
    if variables is None:
        candidates = tf.global_variables() + tf.local_variables()
    else:
        # Resolve names against the variables actually passed in, so that
        # variables outside the global/local collections are found too.
        candidates = list(variables)
    name_to_var = {v.op.name: v for v in candidates}
    reported_names = session.run(tf.report_uninitialized_variables(candidates))
    uninitialized_variables = []
    for name in reported_names:
        # Names are fetched as byte strings, while the dict keys are text;
        # without decoding the lookup fails on Python 3.
        if isinstance(name, bytes):
            name = name.decode('utf-8')
        uninitialized_variables.append(name_to_var[name])
    init_op = tf.variables_initializer(uninitialized_variables)
    session.run(init_op)
    return uninitialized_variables
 def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
   """Verifies wait_for_session fails when local init runs before `v` is ready.

   With ready_for_local_init_op=None nothing guards local_init_op, which
   needs the uninitialized `v`; FailedPreconditionError is expected.
   """
   with tf.Graph().as_default() as graph:
     v = tf.Variable(1, name="v")
     w = tf.Variable(
         v,
         name="w",
         collections=[tf.GraphKeys.LOCAL_VARIABLES],
         trainable=False)
     ready_report = tf.report_uninitialized_variables()
     manager = tf.train.SessionManager(
         graph=graph,
         ready_op=ready_report,
         ready_for_local_init_op=None,
         local_init_op=w.initializer)
   # The wait happens outside the graph context; the manager was given the
   # graph explicitly.
   expected_message = "Attempting to use uninitialized value v"
   with self.assertRaisesRegexp(tf.errors.FailedPreconditionError,
                                expected_message):
     manager.wait_for_session("", max_wait_secs=3)
Esempio n. 27
0
    def initializeOrRestore(self):
        """Restore global variables from the latest checkpoint, initialize the rest.

        Sets `self.ckptDir`, `self.ckptPrefix` and `self.saver`. Variables
        found in the checkpoint are restored; all remaining global variables
        and the local variables are run through their initializers.
        """
        self.ckptDir = os.path.join(self.checkpoint_dir, self.dataset.name, self.name)
        self.ckptPrefix = os.path.join(self.ckptDir, self.name)
        all_globals = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        latest = layers.latest_checkpoint(self.ckptDir, "checkpoint")

        have_checkpoint = latest is not None and tf.train.checkpoint_exists(latest)
        if not have_checkpoint:
            # Nothing to restore: every global variable needs initializing.
            to_initialize = all_globals
        else:
            restorable, to_initialize = layers.scan_checkpoint_for_vars(latest, all_globals)
            if len(restorable) > 0:
                restorer = tf.train.Saver(restorable)
                # NOTE(review): the result of this query is discarded.
                self.sess.run(tf.report_uninitialized_variables(var_list=restorable))
                restorer.restore(self.sess, latest)

        self.saver = tf.train.Saver()
        init_remaining = tf.variables_initializer(to_initialize)
        init_locals = tf.local_variables_initializer()
        self.sess.run(tf.group(init_remaining, init_locals))
 def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
   """Verifies prepare_session fails when local init runs before `v` is ready.

   With ready_for_local_init_op=None and no init_op, local_init_op needs
   the uninitialized `v`; FailedPreconditionError is expected.
   """
   with tf.Graph().as_default():
     v = tf.Variable(1, name="v")
     w = tf.Variable(
         v,
         name="w",
         collections=[tf.GraphKeys.LOCAL_VARIABLES],
         trainable=False)
     # Both variables start out uninitialized.
     with self.test_session():
       for variable in (v, w):
         self.assertEqual(False, tf.is_variable_initialized(variable).eval())
     ready_report = tf.report_uninitialized_variables()
     manager = tf.train.SessionManager(
         ready_op=ready_report,
         ready_for_local_init_op=None,
         local_init_op=w.initializer)
   # The call happens outside the graph context, as in the original layout.
   expected_message = "Attempting to use uninitialized value v"
   with self.assertRaisesRegexp(tf.errors.FailedPreconditionError,
                                expected_message):
     manager.prepare_session("", init_op=None)
Esempio n. 29
0
  def test_restore_map_for_classification_ckpt(self, use_keras):
    """Checks restore_map against a checkpoint from a mock classification graph.

    A two-layer mock model (Keras or slim, depending on `use_keras`) is
    saved, then restored into the detection model's graph with a Saver built
    from `model.restore_map(fine_tune_checkpoint_type='classification')`.
    The map must be a dict that excludes the unrelated 'another_variable',
    and afterwards no variable whose name contains 'FeatureExtractor' may
    remain uninitialized.

    Args:
      use_keras: whether to build the mock model and the detection model
        with Keras layers instead of slim.
    """
    # Define mock tensorflow classification graph and save variables.
    test_graph_classification = tf.Graph()
    with test_graph_classification.as_default():
      image = tf.placeholder(dtype=tf.float32, shape=[1, 20, 20, 3])
      if use_keras:
        with tf.name_scope('mock_model'):
          layer_one = keras.Conv2D(32, kernel_size=1, name='layer1')
          net = layer_one(image)
          layer_two = keras.Conv2D(3, kernel_size=1, name='layer2')
          layer_two(net)
      else:
        with tf.variable_scope('mock_model'):
          net = slim.conv2d(image, num_outputs=32, kernel_size=1,
                            scope='layer1')
          slim.conv2d(net, num_outputs=3, kernel_size=1, scope='layer2')

      init_op = tf.global_variables_initializer()
      saver = tf.train.Saver()
      save_path = self.get_temp_dir()
      with self.test_session(graph=test_graph_classification) as sess:
        sess.run(init_op)
        saved_model_path = saver.save(sess, save_path)

    # Create tensorflow detection graph and load variables from
    # classification checkpoint.
    test_graph_detection = tf.Graph()
    with test_graph_detection.as_default():
      model, _, _, _ = self._create_model(use_keras=use_keras)
      inputs_shape = [2, 2, 2, 3]
      inputs = tf.to_float(tf.random_uniform(
          inputs_shape, minval=0, maxval=255, dtype=tf.int32))
      preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
      model.postprocess(prediction_dict, true_image_shapes)
      another_variable = tf.Variable([17.0], name='another_variable')  # pylint: disable=unused-variable
      var_map = model.restore_map(fine_tune_checkpoint_type='classification')
      self.assertNotIn('another_variable', var_map)
      self.assertIsInstance(var_map, dict)
      saver = tf.train.Saver(var_map)
      with self.test_session(graph=test_graph_detection) as sess:
        saver.restore(sess, saved_model_path)
        # The fetched names are byte strings; a text needle would raise
        # TypeError on Python 3, so search for a bytes literal.
        for var in sess.run(tf.report_uninitialized_variables()):
          self.assertNotIn(b'FeatureExtractor', var)
Esempio n. 30
0
 def test_restore_map_for_detection_ckpt(self, use_keras):
   """Checks restore_map against a checkpoint saved from the detection model.

   The model's variables are initialized and saved, then restored with a
   Saver built from `model.restore_map(fine_tune_checkpoint_type='detection',
   load_all_detection_checkpoint_vars=False)`. Afterwards no variable whose
   name contains 'FeatureExtractor' may remain uninitialized.

   Args:
     use_keras: forwarded to `self._create_model`.
   """
   model, _, _, _ = self._create_model(use_keras=use_keras)
   model.predict(tf.constant(np.array([[[[0, 0], [1, 1]], [[1, 0], [0, 1]]]],
                                      dtype=np.float32)),
                 true_image_shapes=None)
   init_op = tf.global_variables_initializer()
   saver = tf.train.Saver()
   save_path = self.get_temp_dir()
   with self.test_session() as sess:
     sess.run(init_op)
     saved_model_path = saver.save(sess, save_path)
     var_map = model.restore_map(
         fine_tune_checkpoint_type='detection',
         load_all_detection_checkpoint_vars=False)
     self.assertIsInstance(var_map, dict)
     saver = tf.train.Saver(var_map)
     saver.restore(sess, saved_model_path)
     # The fetched names are byte strings; a text needle would raise
     # TypeError on Python 3, so search for a bytes literal.
     for var in sess.run(tf.report_uninitialized_variables()):
       self.assertNotIn(b'FeatureExtractor', var)
Esempio n. 31
0
def run(args, server):
    """Build the A3C trainer and run this worker's training loop.

    Creates the environment and trainer, sets up a `tf.train.Supervisor`
    that saves every global variable whose name does not start with
    "local", and then calls `trainer.process` in a managed session until the
    supervisor asks to stop or the step limit is reached.

    Args:
        args: parsed options. This function reads `env_id`, `task`,
            `remotes`, `visualise`, `log_dir`, `checkpoint_path`,
            `checkpoint_exclude_scopes` and `ignore_missing_vars`.
        server: object whose `target` attribute the managed session
            connects to.
    """
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        """Run the initializer for every variable (saved and local alike)."""
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    def get_init_fn():
        """Choose the callable handed to the Supervisor as `init_fn`.

        Returns plain initialisation when no --checkpoint_path is given or
        when the train directory already holds a checkpoint; otherwise
        returns a function that assigns the selected variables from the
        fine-tuning checkpoint.
        """
        if args.checkpoint_path is None:
            return init_fn

        # A checkpoint that already exists in train_dir takes precedence, so
        # --checkpoint_path is ignored in that case.
        train_dir = os.path.join(args.log_dir, 'train')
        if tf.train.latest_checkpoint(train_dir):
            logger.info(
                'Ignoring --checkpoint_path because a checkpoint already exists in %s',
                train_dir)
            return init_fn

        # Read the exclusion list from the same `args` object that was just
        # checked; the original read it from an unrelated `FLAGS` global.
        exclusions = []
        if args.checkpoint_exclude_scopes:
            exclusions = [
                scope.strip()
                for scope in args.checkpoint_exclude_scopes.split(',')
            ]

        # Keep every saved variable whose op name matches no excluded prefix.
        variables_to_restore = [
            var for var in variables_to_save
            if not any(
                var.op.name.startswith(exclusion) for exclusion in exclusions)
        ]

        if tf.gfile.IsDirectory(args.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(args.checkpoint_path)
        else:
            checkpoint_path = args.checkpoint_path

        print(variables_to_restore)

        logger.info('Fine-tuning from %s', checkpoint_path)

        return tf.contrib.framework.assign_from_checkpoint_fn(
            checkpoint_path,
            variables_to_restore,
            ignore_missing_vars=args.ignore_missing_vars)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=get_init_fn(),
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. "
        +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Esempio n. 32
0
    def build_graph(self, features, labels, mode, params):
        """Build the inference graph or the multi-tower training graph.

        `labels` and `params` are discarded immediately; only `features` and
        `mode` are used.

        Args:
            features: model inputs. Passed to `GNMTModel` directly in INFER
                mode and sharded across towers via `self._shard_inputs` in
                TRAIN mode.
            labels: unused.
            mode: a `tf.contrib.learn.ModeKeys` value. Only INFER and TRAIN
                are handled.
            params: unused.

        Returns:
            A 6-tuple `(loss, vars, grads, predictions, train_op, scaffold)`.
            In INFER mode only `predictions` is set (a dict with the key
            "predictions"); in TRAIN mode `predictions` is None and the other
            five entries are set. For any other mode the method falls off the
            end and returns None.
        """
        del labels, params
        misc_utils.print_out("Running fast mode_fn")

        hparams = self.hparams

        # Create global_step
        tf.train.get_or_create_global_step()

        if mode == tf.contrib.learn.ModeKeys.INFER:
            # Doing inference only on one GPU
            inf_hparams = tf.contrib.training.HParams(**hparams.values())
            inf_hparams.set_hparam("num_gpus", 1)
            # Inference is done in fp32 and in the same way as that of dist_strategy.
            inf_hparams.set_hparam("use_fp16", False)

            misc_utils.print_out("inference hparmas:")
            misc_utils.print_hparams(inf_hparams)

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(inf_hparams)

            with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                    "tower_0"), var_mgr.create_outer_variable_scope(0):
                model = gnmt_model.GNMTModel(inf_hparams,
                                             mode=mode,
                                             features=features)
                sample_ids = model.sample_id
                # Map predicted ids back to vocabulary strings.
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # make sure outputs is of shape [batch_size, time] or [beam_width,
                # batch_size, time] when using beam search.
                if inf_hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
        elif mode == tf.contrib.learn.ModeKeys.TRAIN:
            num_towers = hparams.num_gpus
            # Shard inputs
            tower_features = self._shard_inputs(features, num_towers)
            # Create loss scale vars if necessary
            loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars(
            )

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(hparams)

            # Build per-tower fprop and bprop
            devices = var_mgr.get_devices()
            tower_gradvars = []
            tower_scopes = []
            var_scopes = []
            train_losses = []
            learning_rates = []
            batch_sizes = []
            # One optimizer per tower, appended by fprop_and_bprop in tower order.
            opts = []

            def fprop_and_bprop(tid):
                """Build the model for tower `tid` and compute its gradients.

                Appends the tower's optimizer to the enclosing `opts` list as
                a side effect.

                Returns:
                    A flat list `[train_loss, learning_rate, batch_size]`
                    followed by the tower's gradients.
                """
                model = gnmt_model.GNMTModel(hparams,
                                             mode=mode,
                                             features=tower_features[tid])
                # The model is required to expose a learning rate.
                assert model.learning_rate is not None
                # The model must not have built its own grad_norm/update
                # handles; gradients are computed and applied manually below.
                assert model.grad_norm is None
                assert model.update is None
                tower_loss = model.train_loss
                # Only check loss numerics if in fp16
                if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                    tower_loss = tf.check_numerics(
                        tower_loss, "tower_%d has Inf/NaN loss" % tid)
                # Cast to fp32, otherwise would easily overflow.
                tower_loss = tf.to_float(tower_loss)
                var_params, grads, opt = self._compute_tower_grads(
                    tower_loss,
                    var_mgr.trainable_variables_on_device(tid, tid),
                    model.learning_rate,
                    use_fp16=hparams.use_fp16,
                    loss_scale=loss_scale,
                    colocate_gradients_with_ops=hparams.
                    colocate_gradients_with_ops)
                self._print_varinfo(var_params, tid)
                # Note: the returned loss is model.train_loss, not the
                # checked/cast tower_loss used for the gradients.
                res = [model.train_loss, model.learning_rate, model.batch_size]
                res.extend(grads)
                opts.append(opt)
                return res

            def unpack_fprop_and_bprop_output(output):
                # Inverse of the flat list layout produced by fprop_and_bprop.
                train_loss = output[0]
                learning_rate = output[1]
                batch_size = output[2]
                grads = output[3:]
                return train_loss, learning_rate, batch_size, grads

            with mixed_precision_scope():
                for tid in range(num_towers):
                    # Towers wrap around the device list when there are more
                    # towers than devices.
                    with tf.device(devices[tid % len(devices)]), tf.name_scope(
                            "tower_%s" % tid) as scope:
                        tower_scopes.append(scope)
                        with var_mgr.create_outer_variable_scope(
                                tid) as var_scope:
                            var_scopes.append(var_scope)

                            outputs = maybe_xla_compile(
                                hparams, fprop_and_bprop, tid)
                            (train_loss, learning_rate, batch_size,
                             grads) = unpack_fprop_and_bprop_output(outputs)
                            train_losses.append(train_loss)
                            learning_rates.append(learning_rate)
                            batch_sizes.append(batch_size)
                            var_params = var_mgr.trainable_variables_on_device(
                                tid, tid)
                            tower_gradvars.append(list(zip(grads, var_params)))

            # Add summaries
            if hparams.show_metrics:
                tf.summary.scalar("learning_rate", learning_rates[0])
                if loss_scale:
                    tf.summary.scalar("loss_scale", loss_scale)
                    if hparams.enable_auto_loss_scale:
                        tf.summary.scalar("loss_scale_normal_steps",
                                          loss_scale_normal_steps)
            misc_utils.print_out("Finish building fprop and per-tower bprop.")
            # Aggregate gradients
            # The following compute the aggregated grads for each tower, stored in
            # opaque grad_states structure.
            apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
                tower_gradvars)
            # Gradients and parameters of the first apply-device (i == 0);
            # these are what the method returns as `grads` and `vars`.
            master_grads = None
            master_params = None
            update_ops = []
            for i, device in enumerate(apply_grads_devices):
                with tf.device(device), tf.name_scope(tower_scopes[i]):
                    # Get per-tower grads.
                    with tf.name_scope("get_gradients_to_apply"):
                        avg_gradvars = var_mgr.get_gradients_to_apply(
                            i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]

                    # gradients post-processing
                    with tf.name_scope("clip_gradients"):
                        if hparams.clip_grads:
                            clipped_grads, grad_norm = model_helper.gradient_clip(
                                avg_grads,
                                max_gradient_norm=hparams.max_gradient_norm)
                            # summary the grad on the 1st tower
                            if i == 0 and hparams.show_metrics:
                                tf.summary.scalar("grad_norm", grad_norm)
                                tf.summary.scalar(
                                    "clipped_grad_norm",
                                    tf.global_norm(clipped_grads))
                        else:
                            clipped_grads = avg_grads
                        if i == 0:
                            master_grads = clipped_grads

                    # Build apply-gradients ops
                    clipped_gradvars = list(
                        zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                    if i == 0:
                        master_params = [gv[1] for gv in avg_gradvars]
                    with tf.name_scope("append_gradient_ops"):
                        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                            enable_auto_loss_scale=hparams.
                            enable_auto_loss_scale,
                            loss_scale=loss_scale,
                            loss_scale_normal_steps=loss_scale_normal_steps,
                            inc_loss_scale_every_n=hparams.
                            fp16_inc_loss_scale_every_n,
                            is_chief=True)
                        opt = opts[i]
                        # Appends this tower's apply op(s) to update_ops.
                        var_mgr.append_apply_gradients_ops(
                            grad_states, opt, clipped_gradvars, update_ops,
                            loss_scale_params)
            misc_utils.print_out("Finish building grad aggregation.")

            assert len(update_ops) == num_towers
            train_op = tf.group(update_ops)
            # The global step is only incremented after every tower's update
            # has run; the increment itself becomes the returned train_op.
            with tf.control_dependencies([train_op]):
                global_step = tf.train.get_global_step()
                train_op = global_step.assign_add(1)

            # Compute loss on the first gpu
            # TODO(jamesqin): optimize it?
            with tf.device("gpu:0"):
                loss = misc_utils.weighted_avg(train_losses, batch_sizes)

            # Create local init_ops
            # TODO(jamesqin): handle resource variables!
            # At present if not using mirror strategy, not using resource vars.
            local_init_ops = []
            local_init_op = tf.local_variables_initializer()
            # Post-init ops are created under a control dependency on the
            # local-variable initializer.
            with tf.control_dependencies([local_init_op]):
                local_init_ops.append(var_mgr.get_post_init_ops())
            local_init_ops.extend([local_init_op, tf.tables_initializer()])

            saveable_vars = var_mgr.savable_variables()
            # Add saveables for cudnn vars in master tower.
            saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
            saveable_objects = [x for x in saveable_objects if "v0" in x.name]

            # Diagnostic dumps of the variable sets built above.
            misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
            for mv in saveable_vars:
                misc_utils.print_out(mv.name)

            misc_utils.print_out("All global trainable vars(%d): " %
                                 len(tf.trainable_variables()))
            for tv in tf.trainable_variables():
                misc_utils.print_out(tv.name)

            misc_utils.print_out("All global vars(%d): " %
                                 len(tf.global_variables()))
            for gv in tf.global_variables():
                misc_utils.print_out(gv.name)

            misc_utils.print_out("master backproped params(%d): " %
                                 len(master_params))
            for mp in master_params:
                misc_utils.print_out(mp.name)

            # Note the cudnn vars are skipped the init check. :(
            scaffold = tf.train.Scaffold(
                ready_op=tf.report_uninitialized_variables(saveable_vars),
                ready_for_local_init_op=tf.report_uninitialized_variables(
                    saveable_vars),
                local_init_op=tf.group(*local_init_ops),
                saver=tf.train.Saver(saveable_vars + saveable_objects,
                                     save_relative_paths=True))

            misc_utils.print_out("Finish building model_fn")
            # return loss, vars, grads, predictions, train_op, scaffold
            return loss, master_params, master_grads, None, train_op, scaffold
Esempio n. 33
0
def run_worker(args):
    """Starts a worker thread that learns how to play the specified Atari game.

    Builds a `tf.train.Server` for this worker, constructs the environment
    and agent, and trains inside a Supervisor-managed session until the
    supervisor requests a stop or `args.num_global_steps` is reached.

    Args:
        args: parsed options. This function reads `num_threads`,
            `worker_index`, `log_dir`, `env_name`, `action_space`, `render`,
            `num_local_steps`, `learning_rate`, `entropy_regularization`,
            `max_gradient_norm`, `discount`, `summary_update_interval` and
            `num_global_steps`.
    """

    cluster_def = get_cluster_def(args.num_threads)
    config = tf.ConfigProto(intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=2)
    server = tf.train.Server(cluster_def,
                             'thread',
                             args.worker_index,
                             config=config)

    # Configure the supervisor.
    is_chief = args.worker_index == 0
    checkpoint_dir = os.path.join(args.log_dir, 'checkpoint')
    thread_dir = os.path.join(args.log_dir,
                              'thread-{}'.format(args.worker_index))
    summary_writer = tf.summary.FileWriter(thread_dir)
    # NOTE(review): this initializer is created before the agent below is
    # constructed, so it only covers variables that already exist at this
    # point - confirm that is intended.
    global_variables_initializer = tf.global_variables_initializer()
    init_fn = lambda sess: sess.run(global_variables_initializer)

    # Initialize the model.
    env = environment.AtariWrapper(args.env_name, environment.TRAINING,
                                   args.action_space)
    player = agent.Agent(args.worker_index, env, args.render,
                         args.num_local_steps, args.learning_rate,
                         args.entropy_regularization, args.max_gradient_norm,
                         args.discount, summary_writer,
                         args.summary_update_interval)

    # Local copies of the model will not be saved.
    model_variables = [
        var for var in tf.global_variables()
        if not var.name.startswith('local')
    ]

    supervisor = tf.train.Supervisor(
        ready_op=tf.report_uninitialized_variables(model_variables),
        is_chief=is_chief,
        init_op=tf.variables_initializer(model_variables),
        logdir=checkpoint_dir,
        summary_op=None,
        saver=tf.train.Saver(model_variables),
        global_step=player.global_step,
        save_summaries_secs=30,
        save_model_secs=30,
        summary_writer=summary_writer,
        init_fn=init_fn)

    config = tf.ConfigProto(device_filters=[
        '/job:master', '/job:thread/task:{}/cpu:0'.format(args.worker_index)
    ])

    LOGGER.info('Starting worker. This may take a while.')
    # Last step count returned by player.train; stays 0 if no step ever ran.
    global_step = 0
    with supervisor.managed_session(server.target,
                                    config=config) as sess, sess.as_default():
        while (not supervisor.should_stop()
               and global_step < args.num_global_steps):
            global_step = player.train(sess)

    supervisor.stop()
    # Log the integer step count. `player.global_step` is the object handed
    # to the Supervisor as its global-step tensor and cannot be formatted
    # with %d.
    LOGGER.info('Stopped after %d global steps.', global_step)
def Worker(index, update_game_num, Synchronizer, cluster, model_path):
    """Run one worker task: build the network, wait for variables, play games.

    Starts a "worker" `tf.train.Server` for task `index`, builds a
    `MiniNetwork` plus `THREAD_NUM` agents that share one buffer, blocks
    until no variable is reported as uninitialized, and then runs
    `run_thread` for every agent (all but the last in daemon threads, the
    last one on the calling thread).

    Args:
        index: task index of this worker within `cluster`.
        update_game_num: total game count, split across `THREAD_NUM` agents.
        Synchronizer: passed through to `run_thread`.
        cluster: cluster specification for the server and the network.
        model_path: passed to the network as `ppo_save_path`.
    """
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    session_config.gpu_options.allow_growth = True

    worker = tf.train.Server(cluster,
                             job_name="worker",
                             task_index=index,
                             config=session_config)
    sess = tf.Session(target=worker.target, config=session_config)

    mini_net = MiniNetwork(sess,
                           index=index,
                           summary_writer=None,
                           rl_training=True,
                           cluster=cluster,
                           ppo_load_path=FLAGS.restore_model_path,
                           ppo_save_path=model_path,
                           freeze_head=FLAGS.freeze_head,
                           use_bn=FLAGS.use_bn,
                           use_sep_net=FLAGS.use_sep_net,
                           restore_model=FLAGS.restore_model,
                           restore_from=FLAGS.restore_from,
                           restore_to=FLAGS.restore_to)

    # Every agent shares the same network and the same buffer.
    global_buffer = Buffer()
    agents = [
        MiniAgent(agent_id=agent_id,
                  global_buffer=global_buffer,
                  net=mini_net,
                  restore_model=FLAGS.restore_model)
        for agent_id in range(THREAD_NUM)
    ]

    # The first run is only there to block until the session can execute.
    print("Worker %d: waiting for cluster connection..." % index)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % index)

    # Poll once per second until nothing is reported as uninitialized.
    while True:
        pending = sess.run(tf.report_uninitialized_variables())
        if len(pending) == 0:
            break
        print("Worker %d: waiting for variable initialization..." % index)
        time.sleep(1)
    print("Worker %d: variables initialized" % index)

    # NOTE(review): `//` already floors, so np.ceil has no rounding effect
    # here - confirm whether true division was intended.
    game_num = np.ceil(update_game_num // THREAD_NUM)

    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()
    difficulty = INITIAL_DIFF

    # All agents except the last run in daemon threads, started 3 s apart.
    threads = []
    for background_agent in agents[:-1]:
        runner = threading.Thread(target=run_thread,
                                  args=(background_agent, game_num,
                                        Synchronizer, difficulty))
        runner.daemon = True
        threads.append(runner)
        runner.start()
        time.sleep(3)

    # The last agent runs on the calling thread.
    run_thread(agents[-1], game_num, Synchronizer, difficulty)

    for runner in threads:
        runner.join()
Esempio n. 35
0
    def __init__(self, graph_path, target_size=(320, 240), tf_config=None):
        """Load a serialized graph and build the peak-extraction ops on top of it.

        Imports the GraphDef under the name 'TfPoseEstimator' into the default
        graph, opens a persistent session, adds up-sampling / smoothing /
        peak-detection ops on the network output, initializes any variables
        those ops introduced, and runs three warm-up inferences.

        Args:
            graph_path: path to a binary serialized `GraphDef` file.
            target_size: two-element sequence used for the log message and
                for sizing the warm-up input.
                NOTE(review): treated as (width, height), matching the
                "%dx%d" log format - confirm.
            tf_config: config object passed to `tf.Session`.
        """
        self.target_size = target_size

        # load graph
        logger.info('loading graph from %s(default size=%dx%d)' %
                    (graph_path, target_size[0], target_size[1]))
        with tf.gfile.GFile(graph_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        self.graph = tf.get_default_graph()
        tf.import_graph_def(graph_def, name='TfPoseEstimator')
        self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)

        self.tensor_image = self.graph.get_tensor_by_name(
            'TfPoseEstimator/image:0')
        self.tensor_output = self.graph.get_tensor_by_name(
            'TfPoseEstimator/Openpose/concat_stage7:0')
        # The first 19 channels of the output are the heat maps, the rest the
        # PAF maps (as the attribute names indicate).
        self.tensor_heatMat = self.tensor_output[:, :, :, :19]
        self.tensor_pafMat = self.tensor_output[:, :, :, 19:]
        self.upsample_size = tf.placeholder(dtype=tf.int32,
                                            shape=(2, ),
                                            name='upsample_size')
        # Reuse the slices above instead of slicing the output a second time.
        self.tensor_heatMat_up = tf.image.resize_area(
            self.tensor_heatMat,
            self.upsample_size,
            align_corners=False,
            name='upsample_heatmat')
        self.tensor_pafMat_up = tf.image.resize_area(
            self.tensor_pafMat,
            self.upsample_size,
            align_corners=False,
            name='upsample_pafmat')
        smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0)
        gaussian_heatMat = smoother.get_output()

        # A location is kept as a peak when it equals the maximum of its 3x3
        # neighbourhood; every other location is zeroed.
        max_pooled_in_tensor = tf.nn.pool(gaussian_heatMat,
                                          window_shape=(3, 3),
                                          pooling_type='MAX',
                                          padding='SAME')
        self.tensor_peaks = tf.where(
            tf.equal(gaussian_heatMat, max_pooled_in_tensor), gaussian_heatMat,
            tf.zeros_like(gaussian_heatMat))

        self.heatMat = self.pafMat = None

        # Initialize only the variables that are still uninitialized. The
        # report is fetched once up front; previously it was rebuilt and
        # re-run inside the comprehension for every global variable.
        uninitialized_names = {
            name.decode('utf-8')
            for name in self.persistent_sess.run(
                tf.report_uninitialized_variables())
        }
        self.persistent_sess.run(
            tf.variables_initializer([
                v for v in tf.global_variables()
                if v.name.split(':')[0] in uninitialized_names
            ]))

        # warm-up: run the full pipeline at full, half and quarter up-sample
        # size. The input content is irrelevant, so a zero image is fed
        # (deterministic, unlike uninitialised np.ndarray memory).
        height, width = target_size[1], target_size[0]
        warmup_fetches = [
            self.tensor_peaks, self.tensor_heatMat_up, self.tensor_pafMat_up
        ]
        warmup_sizes = (
            [height, width],
            [height // 2, width // 2],
            [height // 4, width // 4],
        )
        for upsample_size in warmup_sizes:
            self.persistent_sess.run(
                warmup_fetches,
                feed_dict={
                    self.tensor_image: [
                        np.zeros(shape=(height, width, 3), dtype=np.float32)
                    ],
                    self.upsample_size: upsample_size
                })
Esempio n. 36
0
	def __init__(self, patch_size,
				 dataset,
				 devices,
				 train_vols,
				 test_vols,
				 name=None):
		"""Build the discriminator/reconstruction training graph and its session.

		Creates a session, loads the dataset volumes on the CPU, builds the
		forward nets under the 'params' name scope, builds one loss tower per
		entry of `devices` under the 'optimize' name scope, and creates the
		train/test ops (selected at run time by `self.iteration_type`, where 0
		selects the training branch). Finally initializes the variables
		collected under 'params' and 'optimize', prints whatever is still
		reported as uninitialized, and creates the saver and merged summary op.

		Args:
			patch_size: 3-tuple (it is unpacked into three values); padded
				with a leading and trailing 1 to form `padded_patch_size`.
			dataset: object providing `image`, `human_labels`,
				`machine_labels` and `samples`.
			devices: sequence of device strings, one loss tower each.
			train_vols: values passed to `tf.constant` and sampled to pick the
				volume id when `iteration_type` is 0.
			test_vols: same as `train_vols`, used for any other
				`iteration_type`.
			name: stored as `self.name`.
		"""

		self.name=name
		self.summaries = []
		self.devices = devices
		self.patch_size = patch_size
		self.padded_patch_size = (1,) + patch_size + (1,)

		patchx,patchy,patchz = patch_size

		config = tf.ConfigProto(
			allow_soft_placement=True,
			#gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.9, allow_growth=True),
			#log_device_placement=True,
		)
		self.sess = tf.Session(config=config)
		self.run_metadata = tf.RunMetadata()

		with tf.device("/cpu:0"):
			n_volumes = len(dataset.image)
			full_labels_truth = static_constant_multivolume(self.sess, dataset.human_labels, self.padded_patch_size)
			full_labels_lies = static_constant_multivolume(self.sess, dataset.machine_labels, self.padded_patch_size)
			full_image = static_constant_multivolume(self.sess, dataset.image, self.padded_patch_size)
			samples = static_constant_multivolume(self.sess, dataset.samples, (1,3), indexing='CORNER')
		print("finished loading data")

		with tf.name_scope('params'):
			self.step=tf.Variable(0)
			discrim, reconstruct = discrim_net3.make_forward_net(patch_size,2,1)
			self.discrim = discrim

		# Scalar switch fed at run time: 0 selects the training branch.
		self.iteration_type=tf.placeholder(shape=[],dtype=tf.int32)

		with tf.name_scope('optimize'):
			loss=0
			reconstruction_loss=0
			for i,d in enumerate(devices):
				with tf.name_scope("gpu"+str(i)):
					with tf.device(d):
						vol_id = tf.cond(tf.equal(self.iteration_type,0),
								lambda: random_sample(tf.constant(train_vols)),
								lambda: random_sample(tf.constant(test_vols)),
								)
						focus=tf.concat([[0],tf.reshape(samples[vol_id,('RAND',0)],(3,)),[0]],0)
						focus=tf.Print(focus,[vol_id, focus], message="focus", summarize=10)

						# The same augmentation object is applied to every glimpse below.
						rr=augment.RandomRotationPadded()

						#1 is correct and 0 is incorrect
						lies_glimpse = rr(equal_to_centre(full_labels_lies[vol_id,focus]))
						tmp = full_labels_truth[vol_id,focus]
						truth_glimpse = rr(equal_to_centre(tmp))
						human_labels = rr(tmp)
						image_glimpse = rr(full_image[vol_id,focus])

						self.summaries.append(image_summary("lies_glimpse", lies_glimpse))
						self.summaries.append(image_summary("truth_glimpse", truth_glimpse))
						self.summaries.append(image_summary("human_labels", tf.to_float(human_labels)))

						occluded = random_occlusion(lies_glimpse)

					with tf.device("/cpu:0"):
						# 1.0 when the two glimpses differ anywhere, else 0.0.
						any_error = tf.stop_gradient(1-tf.to_float(tf.reduce_all(tf.equal(truth_glimpse, lies_glimpse))))

					with tf.device(d):
						gpu_any_error = tf.identity(any_error)
						reconstruction = reconstruct(tf.concat([occluded, image_glimpse],4))
						reconstruction_loss += tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=reconstruction, labels=truth_glimpse))

						self.summaries.append(image_summary("reconstruction", tf.nn.sigmoid(reconstruction)))
						self.summaries.append(image_summary("occluded", occluded))

						truth_discrim_tower = discrim(tf.concat([truth_glimpse,image_glimpse],4))
						# When the glimpses are identical, reuse the truth tower
						# instead of running the discriminator again. The false
						# branch returns a real list: on Python 3 `map` yields a
						# lazy iterator, which does not match the true branch's
						# structure and cannot be indexed below.
						lies_discrim_tower = tf.cond(tf.greater(gpu_any_error, 0.5),
								lambda: discrim(tf.concat([lies_glimpse,image_glimpse],4)),
								lambda: [tf.identity(t) for t in truth_discrim_tower])

					with tf.device(d):
						loss += tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reduce_sum(lies_discrim_tower[-1]), labels=any_error)
						loss += tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reduce_sum(truth_discrim_tower[-1]), labels=tf.constant(0,dtype=tf.float32))

					with tf.device("/cpu:0"):
						#any_error = has_error(lies_glimpse, human_labels)
						lies_glimpse = tf.identity(lies_glimpse)
						human_labels = tf.identity(human_labels)
						# `level` indexes the discriminator tower outputs; it was
						# previously named `i`, shadowing the device loop index.
						for level in range(4,6):
							ds_shape = static_shape(lies_discrim_tower[level])
							expander = compose(*reversed(discrim_net3.range_expanders[0:level]))

							# Sanity check: expanding this level's spatial shape
							# must give back the patch size.
							expanded_shape=slices_to_shape(expander(shape_to_slices(ds_shape[1:4])))
							assert tuple(expanded_shape) == tuple(self.patch_size)
							def get_localized_errors():
								print(ds_shape)
								x=localized_errors(lies_glimpse, human_labels, ds_shape = ds_shape, expander=expander)
								return tf.Print(x,[any_error],message="any error")

							# Localized error labels are only computed when the
							# glimpse contains an error; otherwise all zeros.
							errors = tf.cond(
									tf.greater(any_error, 0.5),
									lambda:	get_localized_errors(),
									lambda: tf.zeros(ds_shape))
							#errors = tf.Print(errors, [tf.reduce_sum(errors)])
							loss += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = lies_discrim_tower[level], labels=errors))
							loss += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = truth_discrim_tower[level], labels=tf.zeros_like(truth_discrim_tower[level])))
							self.summaries.append(image_summary("guess"+str(level), upsample_mean(tf.nn.sigmoid(lies_discrim_tower[level]), self.padded_patch_size, expander), zero_one=True))
							self.summaries.append(image_summary("truth"+str(level), upsample_mean(errors, self.padded_patch_size, expander)))


			# Average the accumulated losses over the towers.
			loss = loss/len(devices)
			reconstruction_loss = reconstruction_loss/len(devices)

			var_list = tf.get_collection(
				tf.GraphKeys.TRAINABLE_VARIABLES, scope='params')

			def train_op():
				"""Return (op, summary) for a training iteration."""
				optimizer = tf.train.AdamOptimizer(0.0001, beta1=0.95, beta2=0.9995, epsilon=0.1)
				op = optimizer.minimize(8e5*loss + reconstruction_loss, colocate_gradients_with_ops=True, var_list = var_list)

				ema_loss=EMA(decay=0.99)
				ema_loss.update(loss)

				ema_reconstruction_loss=EMA(decay=0.99)
				ema_reconstruction_loss.update(reconstruction_loss)

				# Ordering: minimize step, then step increment, then the print.
				with tf.control_dependencies([op]):
					with tf.control_dependencies([self.step.assign_add(1)]):
						op = tf.group(
								tf.Print(0, [tf.identity(self.step), loss], message="step|loss"),
								)
				quick_summary_op = tf.summary.merge([
					tf.summary.scalar("loss", loss),
					tf.summary.scalar("reconstruction_loss", reconstruction_loss),
					tf.summary.scalar("ema_reconstruction_loss", ema_reconstruction_loss.val),
					tf.summary.scalar("ema_loss", ema_loss.val),
				])
				return op, quick_summary_op
			def test_op():
				"""Return (no-op, summary) for a test iteration."""
				ema_test_loss=EMA(decay=0.9)
				ema_test_loss.update(loss)

				ema_test_reconstruction_loss=EMA(decay=0.9)
				ema_test_reconstruction_loss.update(reconstruction_loss)
				quick_summary_op = tf.summary.merge([
							tf.summary.scalar("test_loss", loss),
							tf.summary.scalar("test_reconstruction_loss", reconstruction_loss),
							tf.summary.scalar("ema_test_reconstruction_loss", ema_test_reconstruction_loss.val),
							tf.summary.scalar("ema_test_loss", ema_test_loss.val),
							])

				return tf.no_op(), quick_summary_op

			self.iter_op, self.quick_summary_op = tf.cond(tf.equal(self.iteration_type,0),
				train_op,
				test_op)
		self.sess.run(tf.variables_initializer(
			tf.get_collection(tf.GraphKeys.VARIABLES,scope='params')+
			tf.get_collection(tf.GraphKeys.VARIABLES,scope='optimize'))
			)
		# Diagnostic: print any variable the initializer above did not cover.
		print(self.sess.run( tf.report_uninitialized_variables( tf.all_variables( ))))

		summary_op = tf.summary.merge(self.summaries)

		self.saver = tf.train.Saver(var_list=var_list,keep_checkpoint_every_n_hours=2)
		self.summary_op = summary_op
def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs):
    """Run the parameter-server ("ps") task of the training cluster.

    Builds both networks and a coordinating agent on the ps task,
    initializes the network through the agent, then takes part in
    ``TRAIN_ITERS`` update rounds paced by ``Synchronizer``.

    Args:
        Synchronizer: Object whose ``wait()`` is called twice per round, once
            before and once after the network update (presumably a barrier
            shared with the worker processes -- confirm against the caller).
        cluster: Cluster definition handed to ``tf.train.Server`` and to
            ``MiniNetwork``.
        log_path: Directory given to ``tf.summary.FileWriter``.
        model_path: Passed to ``MiniNetwork`` as ``ppo_save_path``.
        procs: Not used by this function.

    Returns:
        The best win rate reported by ``agent.update_summary`` across all
        rounds, starting from ``0.``.
    """
    ps_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
    ps_config.gpu_options.allow_growth = True

    ps_server = tf.train.Server(cluster,
                                job_name="ps",
                                task_index=0,
                                config=ps_config)
    sess = tf.Session(target=ps_server.target, config=ps_config)
    summary_writer = tf.summary.FileWriter(log_path)

    # The primary network is built first; the second network is created with
    # reuse=True right after it.
    primary_net = MiniNetwork(sess=sess,
                              summary_writer=summary_writer,
                              rl_training=FLAGS.training,
                              cluster=cluster,
                              index=0,
                              device=DEVICE[0 % len(DEVICE)],
                              ppo_load_path=FLAGS.restore_model_path,
                              ppo_save_path=model_path)
    secondary_net = SecondNetwork(sess=sess,
                                  rl_training=False,
                                  reuse=True,
                                  cluster=None,
                                  index=0,
                                  load_model=True)

    agent = mini_source_agent.MiniSourceAgent(
        index=-1,
        net=primary_net,
        sec_net=secondary_net,
        restore_model=FLAGS.restore_model,
        rl_training=FLAGS.training)

    # Running any op against the server target blocks until the cluster is
    # reachable; the result itself is discarded.
    print("Parameter server: waiting for cluster connection...")
    sess.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    agent.init_network()
    print("Parameter server: variables initialized")

    best_win_rate = 0.
    update_counter = 0
    while update_counter < TRAIN_ITERS:
        agent.reset_old_network()

        # First rendezvous: the update begins.
        Synchronizer.wait()
        logging("Update Network!")
        # TODO count the time , compare cpu and gpu
        time.sleep(1)

        # Second rendezvous: the update has finished.
        Synchronizer.wait()
        logging("Update Network finished!")

        steps, win_rate = agent.update_summary(update_counter)
        logging("Steps: %d, win rate: %f" % (steps, win_rate))

        # Save whenever the win rate matches or beats the best seen so far.
        if win_rate >= best_win_rate:
            agent.save_model()
            best_win_rate = win_rate
        update_counter += 1

    return best_win_rate
def Worker(index, update_game_num, Synchronizer, cluster, model_path):
    """Run one worker task: build the networks and play games on threads.

    Starts a ``tf.train.Server`` for worker ``index``, builds the networks and
    ``THREAD_NUM`` agents sharing one ``Buffer``, waits until no variable is
    reported as uninitialized, and then runs ``run_thread`` for every agent:
    ``THREAD_NUM - 1`` agents on daemon threads and the last agent on the
    calling thread.

    Args:
        index: Task index of this worker inside ``cluster``; also selects the
            device via ``DEVICE[index % len(DEVICE)]``.
        update_game_num: Total number of games to split across the threads.
        Synchronizer: Passed through to ``run_thread`` unchanged.
        cluster: Cluster definition handed to ``tf.train.Server`` and to
            ``MiniNetwork``.
        model_path: Passed to ``MiniNetwork`` as ``ppo_save_path``.
    """
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    worker = tf.train.Server(cluster,
                             job_name="worker",
                             task_index=index,
                             config=config)
    sess = tf.Session(target=worker.target, config=config)

    Net = MiniNetwork(sess=sess,
                      summary_writer=None,
                      rl_training=FLAGS.training,
                      cluster=cluster,
                      index=index,
                      device=DEVICE[index % len(DEVICE)],
                      ppo_load_path=FLAGS.restore_model_path,
                      ppo_save_path=model_path)
    Sec_Net = SecondNetwork(sess=sess,
                            rl_training=False,
                            reuse=True,
                            cluster=None,
                            index=index,
                            load_model=True)

    # All agents of this worker share one buffer and the same two networks.
    global_buffer = Buffer()
    agents = [
        mini_source_agent.MiniSourceAgent(
            index=i,
            global_buffer=global_buffer,
            net=Net,
            sec_net=Sec_Net,
            restore_model=FLAGS.restore_model,
            rl_training=FLAGS.training,
            strategy_agent=None)
        for i in range(THREAD_NUM)
    ]

    # Running any op against the server target blocks until the cluster is
    # reachable; the result itself is discarded.
    print("Worker %d: waiting for cluster connection..." % index)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % index)

    # Poll until no variable is reported as uninitialized any more.
    while len(sess.run(tf.report_uninitialized_variables())):
        print("Worker %d: waiting for variable initialization..." % index)
        time.sleep(1)
    print("Worker %d: variables initialized" % index)

    # Games per thread, rounded UP so that the threads together play at least
    # update_game_num games.  The previous `update_game_num // THREAD_NUM`
    # floored before np.ceil ran, which made the ceil a no-op and silently
    # dropped the remainder.
    game_num = np.ceil(float(update_game_num) / THREAD_NUM)

    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()

    # Run threads: every agent but the last gets its own daemon thread,
    # started 3 seconds apart.
    threads = []
    for i in range(THREAD_NUM - 1):
        t = threading.Thread(target=run_thread,
                             args=(agents[i], game_num, Synchronizer,
                                   FLAGS.difficulty))
        threads.append(t)
        t.daemon = True
        t.start()
        time.sleep(3)

    # The last agent runs on the calling thread.
    run_thread(agents[-1], game_num, Synchronizer, FLAGS.difficulty)

    for t in threads:
        t.join()
Esempio n. 39
0
def run(args, server):
    """Train one A3C worker inside a ``tf.train.Supervisor`` managed session.

    One environment is created per ``"_"``-separated name in ``args.env_id``.
    All parameters are initialized and the ``global0`` variables (except
    ``global_step``) are then restored from a hard-coded pre-trained
    checkpoint. Training loops until the supervisor asks to stop or the
    target task's global step reaches the step budget.

    Args:
        args: Parsed options; this function reads ``env_id``, ``worker_id``,
            ``remotes`` and ``log_dir``.
        server: Object whose ``target`` is used to open the managed session.
    """
    # NOTE (lkx): client and remotes do not matter for non-VNC / flash games.
    target_task = 1  # int(args.target_task)
    envs = [
        create_env(name, client_id=str(args.worker_id), remotes=args.remotes)
        for name in args.env_id.split("_")
    ]

    trainer = A3C(envs, int(args.worker_id), target_task)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = []
    for var in tf.all_variables():
        if not var.name.startswith("local"):
            variables_to_save.append(var)
    init_op = tf.initialize_variables(variables_to_save)
    init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    # Only the "global0" weights are restored from the pre-trained model;
    # global_step is left out.  (Adam_2 and 3 are created by the distillation
    # train op.)
    variables_to_restore = []
    for var in tf.all_variables():
        if var.name.startswith("global0") and "global_step" not in var.name:
            variables_to_restore.append(var)
    pre_train_saver = FastSaver(variables_to_restore)

    def init_fn(session):
        # Runs after init_op: initialize everything, then overwrite the
        # restorable subset from the pre-trained checkpoint.
        logger.info("Initializing all parameters.")
        session.run(init_all_op)
        pre_train_saver.restore(
            session, "../expResults/20170125_09-26/train/model.ckpt-4986751")

    # Device filters are keyed by the worker id.
    worker_cpu = "/job:worker/task:{}/cpu:0".format(args.worker_id)
    config = tf.ConfigProto(device_filters=["/job:ps", worker_cpu])

    logdir = os.path.join(args.log_dir, 'train')
    summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.worker_id)
    logger.info("Events directory: %s_%s", logdir, args.worker_id)

    # ready_op lists the names of the still-uninitialized saved variables.
    ready_op = tf.report_uninitialized_variables(variables_to_save)
    sv = tf.train.Supervisor(
        is_chief=(args.worker_id == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=ready_op,
        global_step=trainer.global_step[target_task],
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = 20000000  #10000000

    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        # Run the sync op of every task, then the logits sync, before
        # starting the trainer.
        for task_idx in range(len(envs)):
            sess.run(trainer.sync[task_idx])
        sess.run(trainer.sync_logits)
        trainer.start(sess, summary_writer)

        step_tensor = trainer.global_step[target_task]
        global_step = sess.run(step_tensor)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(step_tensor)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Esempio n. 40
0
def train(args, server, cluster, env, queue_shapes,
          trajectory_queue_size):
    """Run the training loop of one task under a ``tf.train.Supervisor``.

    Task 0 trains the policy via ``agent.train_policy``; every other task
    starts a worker thread and then repeatedly runs ``agent.sync``. The loop
    ends when the supervisor asks to stop or the policy step reaches the step
    budget.

    Args:
        args: Parsed options; this function reads ``task`` and ``load_path``.
        server: Object whose ``target`` is used to open the managed session.
        cluster: Passed through to ``Agent``.
        env: Passed through to ``Agent``.
        queue_shapes: Passed through to ``Agent``.
        trajectory_queue_size: Passed through to ``Agent``.
    """
    agent = Agent(args, server, cluster, env, queue_shapes,
                  trajectory_queue_size)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = []
    for var in tf.global_variables():
        if not var.name.startswith("local"):
            variables_to_save.append(var)

    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = ut.tf.FastSaver(variables_to_save)

    trainable_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    logger.info('Trainable vars:')
    for var in trainable_vars:
        logger.info('  %s %s', var.name, var.get_shape())

    def init_fn(session):
        logger.info("Initializing all parameters.")
        session.run(init_all_op)

    # Every task sees the parameter server and its own CPU; task 0
    # additionally sees its own GPU (listed before the CPU).
    worker_prefix = f"/job:worker/task:{args.task}"
    device_filters = ["/job:ps"]
    if args.task == 0:
        device_filters.append(f"{worker_prefix}/gpu:0")
    device_filters.append(f"{worker_prefix}/cpu:0")

    config = tf.ConfigProto(device_filters=device_filters,
                            allow_soft_placement=True)
    logger.info("Events directory: %s_%s", args.load_path, args.task)

    summary_writer = tf.summary.FileWriter(f"{args.load_path}_{args.task}")
    agent.summary_writer = summary_writer

    sv = tf.train.Supervisor(
        is_chief=args.task == 0,
        logdir=str(args.load_path),
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=agent.policy_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_policy_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting"
        " to connect to the parameter server. One common cause is that"
        " the parameter server DNS name isn't resolving yet, or is misspecified.")

    with sv.managed_session(server.target, config=config) as sess, \
            sess.as_default():
        is_trainer = args.task == 0

        # Tasks >= 1: sync once, then start the worker thread.
        if args.task >= 1:
            sess.run(agent.sync)
            agent.start_worker_thread(sess, summary_writer)

        policy_step = sess.run(agent.policy_step)
        logger.info("Starting training at step=%d", policy_step)

        while not sv.should_stop() and (
                not num_policy_steps or policy_step < num_policy_steps):
            if is_trainer:
                agent.train_policy(sess)
            else:
                sess.run(agent.sync)
            policy_step = sess.run(agent.policy_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', policy_step)
Esempio n. 41
0
# Tutorial-style walk through TF1 variable collections, placement and
# initialization.  NOTE(review): `sess` and `my_variable` are not defined in
# this excerpt -- they are expected to exist already.

# Show the contents of a custom collection and of the two built-in
# variable collections.
print(tf.get_collection("my_collection_name"))
print(tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES))
print(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

# Placing a variable in a given device
with tf.device("/device:CPU:0"):
    v = tf.get_variable("v", [1])

# Initialize all global variables
sess.run(tf.global_variables_initializer())

# One can also initialize variables individually
sess.run(my_variable.initializer)

# Find the variables that have not been initialized
print(sess.run(tf.report_uninitialized_variables()))

# Correct way to initialize a variable that depends on another variable:
# w's initial value is built from x.initialized_value() rather than from x
# itself.
x = tf.get_variable("x", shape=(), initializer=tf.zeros_initializer())
w = tf.get_variable("w", initializer=x.initialized_value() + 1)

# We can now use the variable as a normal tensor
l = w + 1

# We can assign values to already defined variables.  Building the assign op
# changes nothing by itself; the value only changes when the op is run.
assignment = x.assign_add(1)
# x and w were created after the first initializer run above, so the
# initializer is run again here.
sess.run(tf.global_variables_initializer())
sess.run(x)            # read x before the increment
sess.run(assignment)   # apply the +1
sess.run(x)            # read x after the increment
Esempio n. 42
0
    def initialize(self):
        """Bring every session variable to an initialized state.

        When ``self.do_restore`` is set, a checkpoint is chosen
        (``self.from_ckpt`` first, otherwise the record found via
        ``self.load_rec()`` / ``self.load_data``), the matching variables are
        restored with a ``tf.train.Saver`` and the remaining variables in
        ``self.var_list`` are initialized. When nothing is restored, the
        global and local initializers are run instead. In both cases the
        post-init ops of ``self.var_manager`` are run if a manager is set.

        Raises:
            AssertionError: if variables are still uninitialized after a
                restore.
        """
        ckpt_filename = None
        if self.do_restore:
            # First, determine which checkpoint to use.
            if self.from_ckpt is not None:
                # Use a cached checkpoint file.
                ckpt_filename = self.from_ckpt
                log.info('Restoring variables from checkpoint %s ...'
                         % ckpt_filename)
            else:
                # Otherwise, use a database checkpoint (fetched on demand).
                if self.load_data is None:
                    self.load_rec()
                if self.load_data is not None:
                    rec, ckpt_filename = self.load_data
                    log.info('Restoring variables from record %s (step %d)...'
                             % (str(rec['_id']), rec['step']))
                # else: no db checkpoint to load, ckpt_filename stays None.

            if ckpt_filename is not None:
                # Determine which vars should be restored from the checkpoint.
                restore_vars = self.get_restore_vars(ckpt_filename)
                restore_names = list(restore_vars.keys())
                # Remap the actually restored names to the new ones.
                if self.load_param_dict:
                    for old_name, new_name in self.load_param_dict.items():
                        if old_name in restore_names:
                            restore_names.remove(old_name)
                            restore_names.append(new_name)

                # Actually load the vars.
                log.info('Restored Vars (in ckpt, in graph):\n' +
                         str(restore_names))
                restorer = tf.train.Saver(restore_vars)
                restorer.restore(self.sess, ckpt_filename)
                log.info('... done restoring.')

                # Run post init_ops if needed.
                if self.var_manager:
                    post_init = tf.group(
                        *self.var_manager.get_post_init_ops())
                    self.sess.run(post_init)

                # Collect everything that was not restored.  Names ending in
                # an optimizer suffix are left out of the log line but are
                # still initialized.
                unrestored_vars = []
                unrestored_var_names = []
                for name, var in self.var_list.items():
                    if name in restore_names:
                        continue
                    unrestored_vars.append(var)
                    if not any(name.endswith(s) for s in OPTIMIZER_NAMES):
                        unrestored_var_names.append(name)
                log.info('Unrestored Vars (in graph, not in ckpt):\n' +
                         str(unrestored_var_names))

                # Initialize the variables that were not restored.
                self.sess.run(tf.variables_initializer(unrestored_vars))
                still_uninitialized = self.sess.run(
                    tf.report_uninitialized_variables())
                assert len(still_uninitialized) == 0, still_uninitialized

        # A restore source exists: nothing more to do.
        if self.do_restore and (self.load_data is not None
                                or self.from_ckpt is not None):
            return

        # Fresh start: initialize global, then local variables.
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(tf.local_variables_initializer())
        if self.var_manager:
            self.sess.run(tf.group(*self.var_manager.get_post_init_ops()))
Esempio n. 43
0
    def __init__(self,
                 graph_path,
                 target_size=(320, 240),
                 tf_config=None,
                 trt_bool=False):
        """Load a frozen pose-estimation graph and build the post-processing ops.

        Args:
            graph_path: Path of the serialized ``GraphDef`` file to load.
            target_size: ``(width, height)`` used for the warm-up images; the
                warm-up feeds arrays of shape ``(height, width, 3)``.
            tf_config: Optional config passed to ``tf.Session``.
            trt_bool: When ``True``, the graph is first converted with
                ``trt.create_inference_graph`` and the smoother is built with
                an explicit channel argument of 19.
        """
        self.target_size = target_size

        # load graph
        logger.info('loading graph from %s(default size=%dx%d)' %
                    (graph_path, target_size[0], target_size[1]))
        with tf.gfile.GFile(graph_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        if trt_bool is True:
            output_nodes = ["Openpose/concat_stage7"]
            graph_def = trt.create_inference_graph(
                graph_def,
                output_nodes,
                max_batch_size=1,
                max_workspace_size_bytes=1 << 20,
                precision_mode="FP16",
                # precision_mode="INT8",
                minimum_segment_size=3,
                is_dynamic_op=True,
                maximum_cached_engines=int(1e3),
                use_calibration=True,
            )

        self.graph = tf.get_default_graph()
        tf.import_graph_def(graph_def, name='TfPoseEstimator')
        self.persistent_sess = tf.Session(graph=self.graph, config=tf_config)

        # Dump every node name of the default graph (debug aid).
        for ts in [n.name for n in tf.get_default_graph().as_graph_def().node]:
            print(ts)

        self.tensor_image = self.graph.get_tensor_by_name(
            'TfPoseEstimator/image:0')
        self.tensor_output = self.graph.get_tensor_by_name(
            'TfPoseEstimator/Openpose/concat_stage7:0')
        # The first 19 channels of the output are exposed as the heat map,
        # the remaining channels as the PAF map.
        self.tensor_heatMat = self.tensor_output[:, :, :, :19]
        self.tensor_pafMat = self.tensor_output[:, :, :, 19:]
        self.upsample_size = tf.placeholder(dtype=tf.int32,
                                            shape=(2, ),
                                            name='upsample_size')
        self.tensor_heatMat_up = tf.image.resize_area(
            self.tensor_output[:, :, :, :19],
            self.upsample_size,
            align_corners=False,
            name='upsample_heatmat')
        self.tensor_pafMat_up = tf.image.resize_area(
            self.tensor_output[:, :, :, 19:],
            self.upsample_size,
            align_corners=False,
            name='upsample_pafmat')
        if trt_bool is True:
            smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0, 19)
        else:
            smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0)
        gaussian_heatMat = smoother.get_output()

        # Keep only the values that equal the maximum of their 3x3
        # neighbourhood; everything else becomes zero.
        max_pooled_in_tensor = tf.nn.pool(gaussian_heatMat,
                                          window_shape=(3, 3),
                                          pooling_type='MAX',
                                          padding='SAME')
        self.tensor_peaks = tf.where(
            tf.equal(gaussian_heatMat, max_pooled_in_tensor), gaussian_heatMat,
            tf.zeros_like(gaussian_heatMat))

        self.heatMat = self.pafMat = None

        # warm-up
        # Query the uninitialized variable names ONCE.  Previously the
        # session call sat inside the comprehension condition and was
        # re-executed for every global variable.
        uninitialized_names = {
            name.decode('utf-8') for name in self.persistent_sess.run(
                tf.report_uninitialized_variables())
        }
        self.persistent_sess.run(
            tf.variables_initializer([
                v for v in tf.global_variables()
                if v.name.split(':')[0] in uninitialized_names
            ]))

        # Run the full pipeline at full, half and quarter upsample size.  A
        # zero image is fed so the warm-up never computes on uninitialized
        # memory (np.ndarray(shape=...) leaves its contents undefined).
        warmup_image = np.zeros(shape=(target_size[1], target_size[0], 3),
                                dtype=np.float32)
        for scale in (1, 2, 4):
            self.persistent_sess.run(
                [
                    self.tensor_peaks, self.tensor_heatMat_up,
                    self.tensor_pafMat_up
                ],
                feed_dict={
                    self.tensor_image: [warmup_image],
                    self.upsample_size:
                    [target_size[1] // scale, target_size[0] // scale]
                })

        # logs
        if self.tensor_image.dtype == tf.quint8:
            logger.info('quantization mode enabled.')
Esempio n. 44
0
# accuracy
        with tf.name_scope('Accuracy'):
            correct_prediction = tf.equal(tf.argmax(y_conv, 1),
                                          tf.argmax(y_, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        local_init_op = opt.local_step_init_op
        if is_chief:
            local_init_op = opt.chief_init_op

        ready_for_local_init_op = opt.ready_for_local_init_op
        # Initial token and chief queue runners required by the sync_replicas mode
        chief_queue_runner = opt.get_chief_queue_runner()
        sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        variables_check_op = tf.report_uninitialized_variables()
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False,
                                     device_filters=[
                                         "/job:ps",
                                         "/job:worker/task:%d" %
                                         FLAGS.task_index
                                     ])
    sv = tf.train.Supervisor(is_chief=is_chief,
                             init_op=init_op,
                             local_init_op=local_init_op,
                             ready_for_local_init_op=ready_for_local_init_op,
                             global_step=global_step)
    server_grpc_url = "grpc://" + workers[FLAGS.task_index]
    state = False
    with sv.prepare_or_wait_for_session(server_grpc_url,
def main():
    """Build the classification network and run one validation pass.

    Parses the command line, builds ``OrangeClassNet``, loads the data split,
    computes or loads the mean image, then evaluates the validation set in
    batches. Validation summaries are written to a timestamped log directory
    and the per-sample logits are pickled for plotting. The training and
    checkpoint-saving code is currently disabled (kept below as string
    literals).

    Raises:
        RuntimeError: if ``--load`` is given and variables remain
            uninitialized after restoring the checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data', help='data file')
    parser.add_argument('--load', help='model to load')
    parser.add_argument('--epochs',
                        type=int,
                        default=10000,
                        help='number of epochs to train for')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--resample',
                        action='store_true',
                        help='resample data')
    parser.add_argument('--gpus', help='gpu to use')
    parser.add_argument('--num_images',
                        type=int,
                        default=2,
                        help='number of input images')
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='batch size')
    # Help text fixed: it used to be a copy of the --batch_size help.
    parser.add_argument('--learning_rate',
                        type=float,
                        default=5e-3,
                        help='learning rate')
    parser.add_argument('--num_pts',
                        type=int,
                        default=1,
                        help='number of output waypoints')
    parser.add_argument('--capacity',
                        type=float,
                        default=1,
                        help='network capacity')
    parser.add_argument('--cam_coord',
                        type=float,
                        default=-1,
                        help='use focal length coordinates')
    parser.add_argument('--min',
                        type=tuple,
                        default=(0, -0.5, -0.5),
                        help='minimum xyz ')
    parser.add_argument('--max',
                        type=tuple,
                        default=(1, 0.5, 0.5),
                        help='maximum xyz')
    parser.add_argument('--bins',
                        type=int,
                        default=100,
                        help='number of bins per coordinate')
    parser.add_argument('--dense',
                        type=int,
                        default=0,
                        help='number of additional dense layers')
    args = parser.parse_args()
    # NOTE: the parsed --min/--max values are always overridden here.
    args.min = [(0, -0.5, -0.1), (0, -1, -0.15), (0, -1.5, -0.2),
                (0, -2, -0.3), (0, -3, -0.5)]
    args.max = [(1, 0.5, 0.1), (2, 1, 0.15), (4, 1.5, 0.2), (6, 2, 0.3),
                (7, 0.3, 0.5)]

    if (args.gpus is not None):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    # Model and optimization params
    val_perc = 0.3
    #g_depths = [64, 64, 64]
    #f_depths = [64, 64, 64]
    batch_size = args.batch_size  #512#1024#64
    num_epochs = args.epochs
    learning_rate = args.learning_rate  #50#e-1
    learn_rate_decay = 100 / num_epochs
    save_variables_divider = 10
    log_path = './model/logs'
    save_path = createStampedFolder(os.path.join(log_path, 'variable_log'))

    ######################

    # Make model
    print('Building model')
    model = OrangeClassNet(args.capacity, args.num_images, args.num_pts,
                           args.cam_coord, args.min, args.max, args.bins,
                           args.dense)

    # Load in Data
    train_indices, val_indices, num_list, traj_data, val_data = parseDirData(
        args.data, args.seed, args.resample, val_perc, args.num_pts)
    num_train_samples = train_indices.shape[0]
    num_val_samples = val_indices.shape[0]

    # Persist the validation split.  The context manager closes the file; the
    # handle used to be left open for the rest of the run.
    with open('val_data.pickle', 'wb') as fopen:
        #print(val_data)
        pickle.dump(val_data, fopen, pickle.HIGHEST_PROTOCOL)
    # exit(0)
    # Train model
    print('Training...')
    print('Training Samples: ' + str(num_train_samples))
    print('Validation Samples: ' + str(num_val_samples))
    data_loc = copy.deepcopy(args.data)
    data_loc_name = data_loc.strip("..").strip(".").strip("/").replace(
        "/", "_")
    mean_img_loc = data_loc + "../mean_imgv2_" + data_loc_name + '.npy'
    print(mean_img_loc)
    # The mean image is cached next to the data and only computed when the
    # cache file is missing.
    if not (os.path.exists(mean_img_loc)):
        print('mean image file not found')
        mean_image = compute_mean_image(train_indices, data_loc, model)
        np.save(mean_img_loc, mean_image)
    else:
        print('mean image file found')
        mean_image = np.load(mean_img_loc)
    # mean_image = np.zeros((model.h, model.w, 3))

    val_inputs, val_outputs_x, val_outputs_y, val_outputs_z = loadData(
        val_indices, num_list, data_loc, model, traj_data)

    print('Validation Loaded')
    #train_path = addTimestamp(os.path.join(log_path, 'train_'))
    val_path = addTimestamp(os.path.join(log_path, 'validation_'))
    plot_data_path = addTimestamp(os.path.join(log_path, 'plot_data_'))
    #train_writer = tf.summary.FileWriter(train_path, graph=tf.get_default_graph())
    val_writer = tf.summary.FileWriter(val_path, graph=tf.get_default_graph())
    os.makedirs(plot_data_path)

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()
    feed_dict = {}  #model.keep_prob: 0.9}
    print('Writers Set Up')

    with tf.Session() as sess:  # Load model if specified
        if args.load:
            saver.restore(sess, tf.train.latest_checkpoint(args.load))
            # Refuse to continue if the checkpoint left variables
            # uninitialized.
            uninit_vars_op = tf.report_uninitialized_variables()
            uninit_vars = sess.run(uninit_vars_op)
            uninit_vars_op.mark_used()
            if uninit_vars.size != 0:
                print(uninit_vars)  #, sep=',')
                sess.close()
                raise RuntimeError('Uninitialized variables present')
        else:
            sess.run(init)
        print('Session')
        iters = 0
        plotting_data = dict()
        plotting_data['idx'] = range(len(val_indices))
        #print(plotting_data['idx'])
        #exit(0)
        plotting_data['truth'] = [
            val_outputs_x[plotting_data['idx']],
            val_outputs_y[plotting_data['idx']],
            val_outputs_z[plotting_data['idx']]
        ]
        plotting_data['data'] = list()
        plotting_data['foc_l'] = args.cam_coord
        plotting_data['min'] = model.min
        plotting_data['max'] = model.max
        plotting_data['bins'] = model.bins
        for ii in plotting_data['idx']:
            plotting_data['data'].append([])
        #print(plotting_data)
        #for epoch in range(num_epochs):
        for epoch in range(1):
            print('Epoch: ', epoch)
            batch_idx = 0
            # Decay learning rate
            new_learn_rate = np.exp(-epoch * learn_rate_decay) * learning_rate
            print('Learning Rate Set to: ' + str(new_learn_rate))
            # The assign op must be executed in the session to take effect;
            # building it without running it (as before) changed nothing.
            sess.run(model.learning_fac.assign(new_learn_rate))
            """
      while batch_idx < num_train_samples:
        end_idx = min(batch_idx + batch_size, num_train_samples)
        train_inputs, train_outputs_x, train_outputs_y, train_outputs_z = loadData(train_indices[batch_idx:end_idx],num_list,data_loc, model,traj_data)
        feed_dict[model.image_input] = train_inputs
        feed_dict[model.waypoint_output_x] = train_outputs_x
        feed_dict[model.waypoint_output_y] = train_outputs_y
        feed_dict[model.waypoint_output_z] = train_outputs_z
        #sess.run([model.train_summary_op, model.train_step], feed_dict=feed_dict)
        sess.run(model.train_step, feed_dict=feed_dict)
        batch_idx = batch_idx + batch_size
        iters = iters + 1
        if iters % 20 == 0:
          summary, logits = sess.run([model.train_summ,model.logits], feed_dict=feed_dict)
          accuracy = acc_metric(logits,train_outputs_x,train_outputs_y,train_outputs_z, model)
          print('Training Accuracy: ' + str(accuracy))
          train_writer.add_summary(summary, iters)
        #Clear references to data:
        train_inputs = train_outputs = feed_dict[model.image_input] = feed_dict[model.waypoint_output_x] = feed_dict[model.waypoint_output_y] = feed_dict[model.waypoint_output_z] = None
      """
            val_batch_idx = 0
            num_validation = len(val_indices)
            #val_summary = 0
            val_cost = np.zeros((1, ))
            resnet_output = np.zeros(
                (args.num_pts, 3, 0, model.bins))  # 2nd arg for num_waypoints
            raw_losses = np.zeros((3, ))
            accuracy = []

            while val_batch_idx < num_validation:
                val_batch_endx = min(val_batch_idx + batch_size,
                                     num_validation)
                val_dict = {
                    model.image_input:
                    val_inputs[val_batch_idx:val_batch_endx],
                    model.waypoint_output[0]:
                    val_outputs_x[val_batch_idx:val_batch_endx],
                    model.waypoint_output[1]:
                    val_outputs_y[val_batch_idx:val_batch_endx],
                    model.waypoint_output[2]:
                    val_outputs_z[val_batch_idx:val_batch_endx]
                }

                val_summary_temp, val_cost_temp, resnet_output_temp, raw_losses_temp = sess.run(
                    [
                        model.val_summ, model.objective, model.logits,
                        model.losses
                    ],
                    feed_dict=val_dict)
                val_writer.add_summary(val_summary_temp, iters)

                # Sample-weighted running means: the accumulated value is
                # weighted by the samples seen so far, the new batch by its
                # own size.
                seen_weight = float(val_batch_idx) / val_batch_endx
                batch_weight = (float(val_batch_endx - val_batch_idx) /
                                val_batch_endx)

                #val_summary_temp
                val_cost = np.multiply(val_cost, seen_weight) + np.multiply(
                    val_cost_temp, batch_weight)
                resnet_output_temp = np.array(resnet_output_temp)
                accuracy.append(
                    acc_metric(resnet_output_temp,
                               val_dict[model.waypoint_output[0]],
                               val_dict[model.waypoint_output[1]],
                               val_dict[model.waypoint_output[2]], model))
                resnet_output = np.concatenate(
                    (resnet_output, resnet_output_temp), axis=2)
                # Fixed: the first term used raw_losses_temp instead of the
                # accumulator, so the "average" always equalled the last
                # batch's losses.
                raw_losses = np.multiply(
                    raw_losses, seen_weight) + np.multiply(
                        np.array(raw_losses_temp), batch_weight)

                val_batch_idx = val_batch_endx

            accuracy = np.mean(accuracy, axis=0)
            print('Validation Summary = ', val_cost)
            print('Accuracy = ', accuracy)
            resnet_output = np.array(resnet_output)
            print(raw_losses)
            print(resnet_output.shape)
            # Append this epoch's logits to each validation sample's history.
            for ii in plotting_data['idx']:
                plotting_data['data'][ii].append(resnet_output[:, :, ii, :])
            with open(plot_data_path + '/data.pickle', 'wb') as f:
                pickle.dump(plotting_data, f, pickle.HIGHEST_PROTOCOL)

            #val_writer.add_summary(val_summary, iters)

            #train_writer.flush()
            val_writer.flush()
            # Save variables
            """
      if ((epoch + 1) % save_variables_divider == 0 or (epoch == 0) or (epoch == num_epochs - 1)):
          print("Saving variables")
          if epoch == 0:
            print("For epoch 0")
            saver.save(sess, os.path.join(save_path, 'variables'), epoch)
          else:
            print("For epoch ", epoch)
            saver.save(sess, os.path.join(save_path, 'variables'), epoch, write_meta_graph=False)
      # Re-shuffle data after each epoch
      rand_idx = np.random.permutation(num_train_samples)
      train_indices = train_indices[rand_idx]
      """
    #train_writer.flush()
    val_writer.flush()
    print("Done")
Esempio n. 46
0
    inputs = model.inputs[:2]
    dense = model.get_layer('NSP-Dense').output
    outputs = keras.layers.Dense(units=2, activation='softmax')(dense)
    model = keras.models.Model(inputs, outputs)

    model.compile(
        RAdam(lr=LR),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )

print(model.summary())

# Initialize only the variables that are still uninitialized in the Keras
# session, so weights that already hold values are not reset.
sess = K.get_session()
uninitialized_variables = {
    i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())
}
init_op = tf.variables_initializer([
    v for v in tf.global_variables()
    if v.name.split(':')[0] in uninitialized_variables
])
sess.run(init_op)

# Stop once val_loss has not improved for 2 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
# The model is compiled with metrics=['sparse_categorical_accuracy'], so the
# validation metric is logged as 'val_sparse_categorical_accuracy'.  The old
# monitor 'val_acc' is never logged, which makes save_best_only skip every
# save.
mc = ModelCheckpoint('best_model.h5',
                     monitor='val_sparse_categorical_accuracy',
                     mode='max',
                     verbose=1,
                     save_best_only=True)

history = model.fit(train_x,
                    train_y,
def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs):
    """Run the parameter-server side of training, then terminate ``procs``.

    Starts a TensorFlow server as task 0 of the "ps" job, builds a
    ``MiniNetwork``/``MiniAgent`` pair on a session attached to that server,
    and then runs update rounds paced by ``Synchronizer`` until
    ``TRAIN_ITERS`` rounds have completed.

    Args:
        Synchronizer: Object whose ``wait()`` is called twice per round.
            Presumably a barrier shared with the worker processes -- confirm
            against the caller.
        cluster: Cluster description handed to ``tf.train.Server`` and to
            ``MiniNetwork``.
        log_path: Directory given to ``tf.summary.FileWriter``.
        model_path: Passed to ``MiniNetwork`` as ``ppo_save_path``.
        procs: Iterable of process-like objects; ``terminate()`` is called on
            each one after the last round.
    """
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    server = tf.train.Server(cluster,
                             job_name="ps",
                             task_index=0,
                             config=config)
    #config.gpu_options.per_process_gpu_memory_fraction = 0.2
    sess = tf.Session(target=server.target, config=config)
    summary_writer = tf.summary.FileWriter(log_path)
    mini_net = MiniNetwork(sess,
                           index=0,
                           summary_writer=summary_writer,
                           rl_training=True,
                           cluster=cluster,
                           ppo_load_path=FLAGS.restore_model_path,
                           ppo_save_path=model_path,
                           freeze_head=FLAGS.freeze_head,
                           use_bn=FLAGS.use_bn,
                           use_sep_net=FLAGS.use_sep_net,
                           restore_model=FLAGS.restore_model,
                           restore_from=FLAGS.restore_from,
                           restore_to=FLAGS.restore_to)

    agent = MiniAgent(agent_id=-1,
                      global_buffer=Buffer(),
                      net=mini_net,
                      restore_model=FLAGS.restore_model)

    print("Parameter server: waiting for cluster connection...")
    # The result is discarded; judging by the surrounding messages this call
    # is only used to block until the cluster is reachable.
    sess.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    agent.init_network()
    print("Parameter server: variables initialized")

    last_win_rate = 0.

    update_counter = 0
    while update_counter < TRAIN_ITERS:
        agent.reset_old_network()

        # wait for update
        Synchronizer.wait()
        logging("Update Network!")
        # TODO count the time , compare cpu and gpu
        time.sleep(1)

        # update finish
        Synchronizer.wait()
        logging("Update Network finished!")

        steps, win_rate = agent.update_summary(update_counter)
        logging("Steps: %d, win rate: %f" % (steps, win_rate))

        update_counter += 1
        # NOTE(review): the comparison is against the previous round's win
        # rate (last_win_rate is overwritten every round below), not the best
        # value seen so far -- confirm this is intended.
        if win_rate >= last_win_rate:
            agent.save_model()

        last_win_rate = win_rate
    # Training is over: shut down every process handed in by the caller.
    for p in procs:
        print('Process terminate')
        p.terminate()
Esempio n. 48
0
def run_tester(args, server):
    """Evaluate epoch checkpoints as they appear and keep the best one.

    Builds the agent selected by ``args.alg`` and then polls ``args.log`` for
    checkpoints named ``e<epoch>``. Each checkpoint found is restored and
    evaluated, the result is appended to ``eval.csv``, and whenever the reward
    beats the best seen so far the session is re-saved as ``best``.

    Args:
        args: Parsed options. Reads ``alg``, ``gpu``, ``task``, ``log``,
            ``seed``, ``eps_eval``, ``eval_num``, ``eval_epoch``,
            ``eval_freq`` and ``max_step``.
        server: Object whose ``target`` is passed to the managed session.

    Raises:
        ValueError: If ``args.alg`` is not 'A3C', 'Q' or 'VPN'.
    """
    env = new_env(args)
    env.reset()
    env.max_history = args.eval_num
    if args.alg == 'A3C':
        agent = A3C(env, args)
    elif args.alg == 'Q':
        agent = Q(env, args)
    elif args.alg == 'VPN':
        agent = VPN(env, args)
    else:
        raise ValueError('Invalid algorithm: ' + args.alg)

    device = 'gpu' if args.gpu > 0 else 'cpu'
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15)
    config = tf.ConfigProto(
        device_filters=[
            "/job:ps", "/job:worker/task:{}/{}:0".format(args.task, device)
        ],
        gpu_options=gpu_options,
        allow_soft_placement=True)

    # Checkpoints hold everything except "global*" and "local/target/*".
    variables_to_save = [
        v for v in tf.global_variables()
        if not v.name.startswith(("global", "local/target/"))
    ]
    global_variables = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]

    init_op = tf.variables_initializer(global_variables)
    init_all_op = tf.global_variables_initializer()

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())
    logger.info("Num parameters: %d", agent.local_network.num_param)

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    saver = FastSaver(variables_to_save, max_to_keep=0)
    sv = tf.train.Supervisor(
        is_chief=False,
        global_step=agent.global_step,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        ready_op=tf.report_uninitialized_variables(global_variables),
        saver=saver,
        save_model_secs=0,
        save_summaries_secs=0)

    # Both paths are loop-invariant, so build them once.
    eval_log_path = os.path.join(args.log, "eval.csv")
    best_path = os.path.join(args.log, 'best')

    best_reward = -10000
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        epoch = args.eval_epoch
        while args.eval_freq * epoch <= args.max_step:
            path = os.path.join(args.log, "e%d" % epoch)
            if not os.path.exists(path + ".index"):
                # Checkpoint for this epoch is not there yet; poll again.
                time.sleep(10)
                continue
            logger.info("Start evaluation (Epoch %d)", epoch)
            saver.restore(sess, path)
            # Re-seed before every evaluation so epochs are comparable.
            np.random.seed(args.seed)
            reward = evaluate(env,
                              agent.local_network,
                              args.eval_num,
                              eps=args.eps_eval)

            # The context manager closes the handle even if the print or
            # write fails; the manual open()/close() pair leaked it.
            with open(eval_log_path, "a") as logfile:
                print("Epoch: %d, Reward: %.2f" % (epoch, reward))
                logfile.write("%d, %.3f\n" % (epoch, reward))
            if reward > best_reward:
                best_reward = reward
                sv.saver.save(sess, best_path)
                print("Saved to: %s" % best_path)

            epoch += 1

    logger.info('tester stopped.')
Esempio n. 49
0
def run(args, server):
    """Run the test phase of an A3C worker under a ``tf.train.Supervisor``.

    Builds the environment and the ``A3C`` trainer, sets up the saver, the
    summary writer and the supervisor, and then, inside a managed session,
    repeatedly calls ``trainer.inc_global_step`` until ``num_test_steps``
    global steps have elapsed or the supervisor asks to stop. The training
    loop itself is disabled: it sits inside a string literal in the body.

    Args:
        args: Parsed options. Reads ``env_id``, ``task``, ``remotes``,
            ``num_trials``, ``visualise``, ``learning_rate``, ``meta`` and
            ``log_dir``.
        server: Object whose ``target`` is passed to the managed session.
    """
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes,
                     num_trials=args.num_trials)

    # num_global_steps is handed to the A3C constructor (and is also named
    # in the disabled training block); num_test_steps bounds the test loop.
    num_global_steps = 10000000
    num_test_steps = 1000000
    trainer = A3C(env, args.task, args.visualise, args.learning_rate,
                  args.meta, args.remotes, args.num_trials, num_global_steps)

    # Logging, checkpoints and TensorBoard

    # (Original Comment) Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        # Same setup using the older API names.
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        # Passed to the Supervisor as init_fn; runs the initializer for all
        # variables, including the "local" ones excluded from init_op.
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    # The tf.train.Supervisor provides a set of services that help implement
    # a robust training process.
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)
    # The training phase below is disabled: it is wrapped in a string literal
    # and therefore never executes.
    '''
    # beginning of the training
    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync) # copy weights from the parameter server to the local model
        trainer.start(sess, summary_writer) # lance l'execution de la methode "_run" du TheadRunner "trainer.runner" (object A3C du fichier A3C), qui genere des partial rollouts et les mets dans la queue
        global_step = sess.run(trainer.global_step) # will check in the tmp folder if there is some previously interrupted training to be continued, otherwise start from sratch and initialize the global_step counter at 0
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess) # (original comment) grabs a rollout in the queue and update the parameters of the server
            global_step = sess.run(trainer.global_step)

    # End of the training, asks for all the services to stop.
    sv.stop()
    logger.info('Training finished ; reached %s steps. worker stopped.', global_step)
    time.sleep(5)
    '''

    # Beginning of the test phase
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        # Remember where the counter started so the loop can measure how many
        # steps this test phase has added.
        initial_global_step = global_step = sess.run(trainer.global_step)
        logger.info("Starting tests at step=%d", global_step)
        while not sv.should_stop() and (
                not num_test_steps or
            (global_step - initial_global_step) < num_test_steps):
            trainer.inc_global_step(sess)
            global_step = sess.run(trainer.global_step)
    logger.info('Tests finished ; reached %s steps. worker stopped.',
                global_step)
    sv.stop()
Esempio n. 50
0
def run(args, server):
    env = new_env(args)
    if args.alg == 'A3C':
        trainer = A3C(env, args)
    elif args.alg == 'Q':
        trainer = Q(env, args)
    elif args.alg == 'VPN':
        env_off = new_env(args)
        env_off.verbose = 0
        env_off.reset()
        trainer = VPN(env, args, env_off=env_off)
    else:
        raise ValueError('Invalid algorithm: ' + args.alg)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if \
                not v.name.startswith("global") and not v.name.startswith("local/target/")]
    global_variables = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]

    init_op = tf.variables_initializer(global_variables)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save, max_to_keep=0)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())
    logger.info("Num parameters: %d", trainer.local_network.num_param)

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    device = 'gpu' if args.gpu > 0 else 'cpu'
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15)
    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/{}:0".format(args.task, device)
    ],
                            gpu_options=gpu_options,
                            allow_soft_placement=True)
    logdir = os.path.join(args.log, 'train')
    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(global_variables),
        global_step=trainer.global_step,
        save_model_secs=0,
        save_summaries_secs=30)

    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. "
        +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        epoch = -1
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not args.max_step
                                        or global_step < args.max_step):
            if args.task == 0 and int(global_step / args.eval_freq) > epoch:
                epoch = int(global_step / args.eval_freq)
                filename = os.path.join(args.log, 'e%d' % (epoch))
                sv.saver.save(sess, filename)
                sv.saver.save(sess, os.path.join(args.log, 'latest'))
                print("Saved to: %s" % filename)
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

        if args.task == 0 and int(global_step / args.eval_freq) > epoch:
            epoch = int(global_step / args.eval_freq)
            filename = os.path.join(args.log, 'e%d' % (epoch))
            sv.saver.save(sess, filename)
            sv.saver.save(sess, os.path.join(args.log, 'latest'))
            print("Saved to: %s" % filename)
    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Esempio n. 51
0
def run(args):
    """Build an ``EditableGAN``, optionally train it, then run its test pass.

    Sets up a saver, a summary writer under ``./logs/<model_name>`` and a
    ``tf.train.Supervisor``, writes ``args.description`` to
    ``description.txt`` in the log directory, and runs ``model.train`` (only
    when ``args.train`` is truthy) followed by ``model.test``. Both write
    their output under ``results/<model_name>``.

    Args:
        args: Parsed options. Reads ``gpu``, ``load_model``, ``model_name``,
            ``description`` and ``train``.
    """
    logger.info('Read data:')

    logger.info('Build graph:')
    model = EditableGAN(args)

    print('######################## GPU ALLOCATION ########################')
    print(args.gpu)
    print('######################## GPU ALLOCATION ########################')

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    variables_to_save = tf.global_variables()
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(var_list=variables_to_save, max_to_keep=5)

    logger.info('GLOBAL vars:')
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 tf.get_variable_scope().name)
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    # Reuse the loaded model's name when one is given; otherwise build a
    # timestamped name.
    if args.load_model != '':
        model_name = args.load_model
    else:
        model_name = '{}_{}'.format("GAN", datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + args.model_name)

    logdir = './logs'
    makedirs(logdir)
    logdir = os.path.join(logdir, model_name)
    logger.info('Events directory: %s', logdir)
    summary_writer = tf.summary.FileWriter(logdir)

    def init_fn(sess):
        logger.info('Initializing all parameters.')
        sess.run(init_all_op)

    sv = tf.train.Supervisor(is_chief=True,
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=model.global_step,
                             save_model_secs=1200,
                             save_summaries_secs=30)

    # The context manager closes the file even if the write raises; the
    # manual open()/close() pair leaked the handle in that case.
    with open(os.path.join(logdir, 'description.txt'), 'w') as f:
        f.write('Description : \n' + args.description)

    # Training and testing share one results directory; compute it once.
    base_dir = os.path.join('results', model_name)

    if args.train:
        logger.info("Starting training session.")
        with sv.managed_session() as sess:
            makedirs(base_dir)
            model.train(sess, summary_writer, base_dir)

    logger.info("Starting testing session.")
    with sv.managed_session() as sess:
        makedirs(base_dir)
        model.test(sess, base_dir)
Esempio n. 52
0
def run(args, server):
    """Drive one A3C worker until the global step limit is reached.

    Creates the environment and trainer, wires up a ``tf.train.Supervisor``
    (saver, summary writer, init ops) and then keeps calling
    ``trainer.process`` inside a managed session until either the supervisor
    asks to stop or ``global_step`` reaches the hard-coded step limit.

    Args:
        args: Parsed options. Reads ``env_id``, ``task``, ``remotes``,
            ``visualise`` and ``log_dir``; the whole object is also passed to
            ``A3C``.
        server: Object whose ``target`` is passed to the managed session.
    """
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args)

    def is_checkpointed(variable):
        # Variable names that start with "local" are not saved in checkpoints.
        return not variable.name.startswith("local")

    if use_tf12_api:
        variables_to_save = list(filter(is_checkpointed, tf.global_variables()))
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = list(filter(is_checkpointed, tf.all_variables()))
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for variable in trainable_vars:
        logger.info('  %s %s', variable.name, variable.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    worker_device = "/job:worker/task:{}/cpu:0".format(args.task)
    config = tf.ConfigProto(device_filters=["/job:ps", worker_device])
    logdir = os.path.join(args.log_dir, 'train')

    # Pick whichever writer class the installed TensorFlow provides.
    writer_factory = (tf.summary.FileWriter
                      if use_tf12_api else tf.train.SummaryWriter)
    summary_writer = writer_factory(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop():
            # A falsy limit means "no limit"; otherwise stop once it is hit.
            if num_global_steps and global_step >= num_global_steps:
                break
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Esempio n. 53
0
def tf_assert_all_init(sess):
    """Assert that ``sess`` reports no uninitialized variables.

    Args:
        sess: Session on which ``tf.report_uninitialized_variables()`` is run.

    Raises:
        AssertionError: If the report is non-empty; the message includes the
            reported variables.
    """
    pending = sess.run(tf.report_uninitialized_variables())
    assert not len(pending), (
        'Expected all variables to have been initialized, but these have not been: %s'
        % pending)
def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs):
    """Run the parameter-server loop and report the win rates it observed.

    Starts a TensorFlow server as task 0 of the "ps" job, builds the
    ``MiniNetwork`` and its ``MiniSourceAgent`` on a session attached to that
    server, and then runs update rounds paced by ``Synchronizer`` until
    ``TRAIN_ITERS`` rounds have completed. The model is saved whenever a
    round's win rate is at least the best seen so far, and the latest policy
    is saved after every round.

    Args:
        Synchronizer: Object whose ``wait()`` is called twice per round.
        cluster: Cluster description handed to ``tf.train.Server`` and to
            ``MiniNetwork``.
        log_path: Directory given to ``tf.summary.FileWriter``.
        model_path: Passed to ``MiniNetwork`` as ``ppo_save_path``.
        procs: Not used by this function.

    Returns:
        Tuple ``(max_win_rate, latest_win_rate)``.
    """
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    ps_server = tf.train.Server(cluster,
                                job_name="ps",
                                task_index=0,
                                config=session_config)
    sess = tf.Session(target=ps_server.target, config=session_config)
    writer = tf.summary.FileWriter(log_path)
    network = MiniNetwork(sess=sess,
                          summary_writer=writer,
                          rl_training=FLAGS.training,
                          cluster=cluster,
                          index=0,
                          device=DEVICE[0 % len(DEVICE)],
                          ppo_load_path=FLAGS.restore_model_path,
                          ppo_save_path=model_path,
                          ob_space_add=FLAGS.ob_space_add,
                          act_space_add=FLAGS.act_space_add,
                          freeze_head=FLAGS.freeze_head,
                          use_bn=FLAGS.use_bn,
                          use_sep_net=FLAGS.use_sep_net,
                          restore_model=FLAGS.restore_model,
                          restore_from=FLAGS.restore_from,
                          restore_to=FLAGS.restore_to,
                          load_latest=FLAGS.load_latest,
                          add_image=FLAGS.add_image,
                          partial_restore=FLAGS.partial_restore,
                          weighted_sum_type=FLAGS.weighted_sum_type,
                          initial_type=FLAGS.initial_type)

    agent = mini_source_agent.MiniSourceAgent(
        index=-1,
        net=network,
        restore_model=FLAGS.restore_model,
        rl_training=FLAGS.training,
        ob_space_add=FLAGS.ob_space_add)

    print("Parameter server: waiting for cluster connection...")
    # The returned report is not used; only the call itself matters here.
    sess.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    agent.init_network()
    print("Parameter server: variables initialized")

    max_win_rate = latest_win_rate = 0.
    update_counter = 0

    while update_counter < TRAIN_ITERS:
        agent.reset_old_network()

        # First wait: the update round begins.
        Synchronizer.wait()
        logging("Update Network!")
        # TODO count the time , compare cpu and gpu
        time.sleep(1)

        # Second wait: the update round has finished.
        Synchronizer.wait()
        logging("Update Network finished!")

        steps, win_rate = agent.update_summary(update_counter)
        logging("Steps: %d, win rate: %f" % (steps, win_rate))
        update_counter += 1

        # Save the model on a new (or tied) best win rate.
        if win_rate >= max_win_rate:
            agent.save_model()
            max_win_rate = win_rate

        latest_win_rate = win_rate
        agent.net.save_latest_policy()

    return max_win_rate, latest_win_rate
Esempio n. 55
0
def run(args, server):
    """Train one A3C worker, optionally starting from a pretrained checkpoint.

    Creates the environment and trainer, has task 0 write a ``log.txt``
    with the run's constants, sets up the savers, summary writer and
    ``tf.train.Supervisor``, and then keeps calling ``trainer.process``
    inside a managed session until the supervisor asks to stop or
    ``global_step`` reaches ``constants['MAX_GLOBAL_STEPS']``.

    Args:
        args: Parsed options. Reads ``env_id``, ``task``, ``remotes``,
            ``visualise``, ``unsup``, ``log_dir`` and ``pretrain``.
        server: Object whose ``target`` is passed to the managed session.
    """
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args.unsup)

    # logging
    # Only task 0 writes the run description, so the file is written once.
    if args.task == 0:
        with open(args.log_dir + '/log.txt', 'w') as fid:
            for key, val in constants.items():
                fid.write('%s: %s\n' % (str(key), str(val)))
            fid.write('input observation: %s\n' %
                      str(env.observation_space.shape))
            fid.write('env name: %s\n' % str(env.spec.id))
            fid.write('unsup method type: %s\n' % str(args.unsup))

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        # Same setup using the older API names.
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)
    # A second saver, limited to the non-"local" trainable variables, is only
    # created when a pretrained checkpoint directory was given.
    if args.pretrain is not None:
        variables_to_restore = [
            v for v in tf.trainable_variables()
            if not v.name.startswith("local")
        ]
        pretrain_saver = FastSaver(variables_to_restore)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        # Passed to the Supervisor as init_fn: initialize everything first,
        # then overwrite the restorable variables from the pretrained
        # checkpoint when one was requested.
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)
        if args.pretrain is not None:
            # NOTE(review): latest_checkpoint's result is passed straight to
            # restore(); confirm what happens when args.pretrain contains no
            # checkpoint.
            pretrain = tf.train.latest_checkpoint(args.pretrain)
            logger.info("==> Restoring from given pretrained checkpoint.")
            logger.info("    Pretraining address: %s", pretrain)
            pretrain_saver.restore(ses, pretrain)
            logger.info("==> Done restoring model! Restored %d variables.",
                        len(variables_to_restore))

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    num_global_steps = constants['MAX_GLOBAL_STEPS']

    logger.info(
        "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. "
        +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified."
    )
    with sv.managed_session(server.target,
                            config=config) as sess, sess.as_default():
        # Workaround for FailedPreconditionError
        # see: https://github.com/openai/universe-starter-agent/issues/44 and 31
        sess.run(trainer.sync)

        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at gobal_step=%d", global_step)
        # A falsy num_global_steps disables the step limit.
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Esempio n. 56
0
def main(_):
    """Fine-tune one pruned-network configuration and evaluate it periodically.

    Steps, in the order they happen below:
      1. Parse the candidate kept percentages from FLAGS and verify each one
         has an entry in the dict built from FLAGS.checkpoint_path.
      2. Build the configuration list ('sample' or 'special') and select the
         entry at FLAGS.start_configuration_index + rank (rank is fixed to 0).
      3. Build the training and evaluation graphs of the pruned network.
      4. Restore variables (either from the per-percentage checkpoints or,
         when continuing, from the latest checkpoint in train_dir),
         initialize whatever is still uninitialized, then run the training
         loop with periodic traces, summaries, evaluation and checkpoints.

    Progress lines are printed and also appended to
    <train_dir>/train_details.txt.

    Args:
        _: unused positional argument (presumably argv supplied by
            tf.app.run -- confirm against the caller).

    Raises:
        ValueError: if --dataset_dir is not supplied, or if
            FLAGS.configuration_type is neither 'sample' nor 'special'.
    """
    tic = time.time()
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')
    # init
    net_name_scope_pruned = FLAGS.net_name_scope_pruned
    net_name_scope_checkpoint = FLAGS.net_name_scope_checkpoint
    indexed_prune_scopes_for_units = valid_indexed_prune_scopes_for_units
    kept_percentages_dict = get_kept_percentages_dict_from_path(
        FLAGS.checkpoint_path)
    kept_percentages = sorted(map(float, FLAGS.kept_percentages.split(',')))

    # check networks with the kps are pre-trained.
    for kp in kept_percentages:
        if kp not in kept_percentages_dict:
            # NOTE(review): `Error` is not defined in this function; confirm
            # it is importable at module level, otherwise this line raises
            # NameError instead of the intended exception.
            raise Error('kept_percentage=' + str(kp) + ' not in folder:' +
                        FLAGS.checkpoint_path)

    num_options = len(kept_percentages)
    num_units = len(indexed_prune_scopes_for_units)
    print('num_options=%d, num_blocks=%d' % (num_options, num_units))
    print('HG: total number of configurations=%d' % (num_options**num_units))

    if FLAGS.configuration_type == 'sample':
        configs = get_sampled_configurations(num_units, num_options,
                                             FLAGS.total_num_configurations)
    elif FLAGS.configuration_type == 'special':
        configs = get_special_configurations(num_units, num_options)
    else:
        # An unknown type used to fall through and fail below on an unbound
        # `configs`; fail here with an explicit message instead.
        raise ValueError('Unknown configuration_type: ' +
                         str(FLAGS.configuration_type))
    num_configurations = len(configs)

    #Getting MPI rank integer
    # comm = MPI.COMM_WORLD
    # rank = comm.Get_rank()
    # if rank >= num_configurations:
    # print("ERROR: rank(%d) > num_configurations(%d)" %(rank, num_configurations))
    # return
    rank = 0
    FLAGS.configuration_index = FLAGS.start_configuration_index + rank
    config = configs[FLAGS.configuration_index]
    print('HG: kept_percentages=%s, num_configs=%d, start_config_index=%d, rank=%d, config_index=%d' \
           %(str(kept_percentages), num_configurations, FLAGS.start_configuration_index,  rank, FLAGS.configuration_index))

    # prepare for training with the specific config
    indexed_prune_scopes, kept_percentage = config_to_indexed_prune_scopes(
        config, indexed_prune_scopes_for_units, kept_percentages)

    # prepare file system
    results_dir = os.path.join(
        FLAGS.train_dir, "id" +
        str(FLAGS.configuration_index))  #+'_'+str(FLAGS.max_number_of_steps))
    train_dir = os.path.join(results_dir, 'train')
    if (not FLAGS.continue_training) or (
            not tf.train.latest_checkpoint(train_dir)):
        prepare_file_system(train_dir)

    def write_detailed_info(info):
        """Append one line of text to train_details.txt in train_dir."""
        with open(os.path.join(train_dir, 'train_details.txt'), 'a') as f:
            f.write(info + '\n')

    info = 'train_dir: ' + train_dir + '\n'
    info += 'options:' + str(kept_percentages) + '\n'
    info += 'configuration: ' + str(config) + '\n'
    info += 'indexed_prune_scopes: ' + str(indexed_prune_scopes) + '\n'
    info += 'kept_percentage: ' + str(kept_percentage)
    print(info)
    write_detailed_info(info)

    with tf.Graph().as_default():

        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.train_dataset_name,
                                              FLAGS.dataset_dir)
        test_dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                                   FLAGS.test_dataset_name,
                                                   FLAGS.dataset_dir)

        batch_queue = train_inputs(dataset, deploy_config, FLAGS)
        test_images, test_labels = test_inputs(test_dataset, deploy_config,
                                               FLAGS)
        images, labels = batch_queue.dequeue()

        ######################
        # Select the network#
        ######################
        network_fn_pruned = nets_factory.get_network_fn_pruned(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay)

        ####################
        # Define the model #
        ####################
        prune_info = indexed_prune_scopes_to_prune_info(
            indexed_prune_scopes, kept_percentage)
        logits_train, _ = network_fn_pruned(images,
                                            prune_info=prune_info,
                                            is_training=True,
                                            is_local_train=False,
                                            reuse_variables=False,
                                            scope=net_name_scope_pruned)

        # The evaluation tower shares the training tower's variables.
        logits_eval, _ = network_fn_pruned(test_images,
                                           prune_info=prune_info,
                                           is_training=False,
                                           is_local_train=False,
                                           reuse_variables=True,
                                           scope=net_name_scope_pruned)
        cross_entropy = add_cross_entropy(logits_train, labels)
        correct_prediction = add_correct_prediction(logits_eval, test_labels)

        #############################
        # Specify the loss functions #
        #############################
        collection_name = 'subgraph_losses'
        tf.add_to_collection(collection_name, cross_entropy)
        # get regularization loss
        regularization_losses = get_regularization_losses_within_scopes()
        print_list('regularization_losses', regularization_losses)
        # total loss and its summary
        total_loss = tf.add_n(tf.get_collection(collection_name),
                              name='total_loss')
        for l in tf.get_collection(collection_name) + [total_loss]:
            tf.summary.scalar(l.op.name + '/summary', l)

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.variables_device()):
            global_step = tf.Variable(0, trainable=False, name='global_step')
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = configure_learning_rate(dataset.num_samples,
                                                    global_step, FLAGS)
            optimizer = configure_optimizer(learning_rate, FLAGS)
            tf.summary.scalar('learning_rate', learning_rate)

        #############################
        # Add train operation       #
        #############################
        variables_to_train = get_trainable_variables_within_scopes()
        train_op = add_train_op(optimizer,
                                total_loss,
                                global_step,
                                var_list=variables_to_train)
        print_list("variables_to_train", variables_to_train)

        # Gather update_ops: the updates for the batch_norm variables created by network_fn_pruned.
        update_ops = get_update_ops_within_scopes()
        print_list("update_ops", update_ops)

        # Fetching train_tensor runs the train op and the update ops first,
        # then returns the total loss.
        update_ops.append(train_op)
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # add summary op
        summary_op = tf.summary.merge_all()

        print("HG: trainable_variables=", len(tf.trainable_variables()))
        print("HG: model_variables=", len(tf.model_variables()))
        print("HG: global_variables=", len(tf.global_variables()))
        # print_list('model_variables but not trainable variables', list(set(tf.model_variables()).difference(tf.trainable_variables())))
        # print_list('global_variables but not model variables', list(set(tf.global_variables()).difference(tf.model_variables())))

        # get train scopes for each kept_percentage
        train_scopes_dict = {}
        for scope_index in xrange(len(indexed_prune_scopes)):
            indexed_prune_scope = indexed_prune_scopes[scope_index]
            scope_kept_percentage = kept_percentage[scope_index]
            if scope_kept_percentage not in train_scopes_dict:
                train_scopes_dict[scope_kept_percentage] = []
            train_scope = get_train_scope_for_local_train(indexed_prune_scope)
            train_scopes_dict[scope_kept_percentage].append(train_scope)

        # De-duplicate and order the scopes collected for each percentage.
        for key, train_scopes in train_scopes_dict.items():
            train_scopes_dict[key] = sorted(set(train_scopes))
        #print_list("train_scopes", train_scopes)
        print('HG: train_scopes_dict:')
        pprint(train_scopes_dict)

        sess_config = tf.ConfigProto(intra_op_parallelism_threads=16,
                                     inter_op_parallelism_threads=16)
        with tf.Session(config=sess_config) as sess:
            ##########################
            # prepare for filewriter #
            ##########################
            train_writer = tf.summary.FileWriter(train_dir, sess.graph)

            # if restart the training or there is no checkpoint in the train_dir
            if (not FLAGS.continue_training) or (
                    not tf.train.latest_checkpoint(train_dir)):
                #################################################
                # Restore  pruned model variable values. #
                #################################################
                all_variables_to_train = []
                for scope_kept_percentage, train_scopes in train_scopes_dict.items(
                ):
                    print('HG: kept_percentage', scope_kept_percentage)
                    checkpoint_path = os.path.join(
                        FLAGS.checkpoint_path,
                        kept_percentages_dict[scope_kept_percentage][0],
                        'train')
                    #    'model.ckpt-'+str(FLAGS.local_train_steps))

                    # Map checkpoint names (scope suffixed with
                    # "_p<kept_percentage>") to the variables in this graph.
                    variables_to_train = {
                        re.sub(
                            net_name_scope_pruned, net_name_scope_pruned +
                            "_p" + str(scope_kept_percentage), v.op.name): v
                        for v in get_model_variables_within_scopes(
                            train_scopes)
                    }
                    print_list("restore pruned model variables",
                               variables_to_train.values())
                    load_checkpoint(sess,
                                    checkpoint_path,
                                    var_list=variables_to_train)
                    all_variables_to_train.extend(variables_to_train.values())

                #################################################
                # Restore  original  model variable values. #
                #################################################
                # Build the exclusion set once instead of once per variable
                # inside the comprehension filter.
                pruned_variable_set = set(all_variables_to_train)
                variables_to_restore = {
                    re.sub(net_name_scope_pruned, net_name_scope_checkpoint,
                           v.op.name): v
                    for v in get_model_variables_within_scopes()
                    if v not in pruned_variable_set
                }
                print_list("restore original model variables",
                           variables_to_restore.values())
                # NOTE(review): `checkpoint_path` here is whatever the last
                # iteration of the loop above left behind (and is unbound if
                # train_scopes_dict is empty) -- confirm this is the intended
                # source for the remaining variables.
                load_checkpoint(sess,
                                checkpoint_path,
                                var_list=variables_to_restore)

            else:
                ###########################################
                ## Restore all variables from checkpoint ##
                ###########################################
                variables_to_restore = get_global_variables_within_scopes()
                load_checkpoint(sess, train_dir, var_list=variables_to_restore)

            #################################################
            # init uninitialized global variable. #
            #################################################
            variables_to_init = get_global_variables_within_scopes(
                sess.run(tf.report_uninitialized_variables()))
            print_list("init unitialized variables", variables_to_init)
            sess.run(tf.variables_initializer(variables_to_init))

            init_global_step_value = sess.run(global_step)
            print('initial global step: ', init_global_step_value)
            if init_global_step_value >= FLAGS.max_number_of_steps:
                print('Exit: init_global_step_value (%d) >= FLAGS.max_number_of_steps (%d)' \
                    %(init_global_step_value, FLAGS.max_number_of_steps))
                # Release the event file before leaving early.
                train_writer.close()
                return

            ###########################
            # Record CPU usage  #
            ###########################
            # mpstat_output_filename = os.path.join(train_dir, "cpu-usage.log")
            # os.system("mpstat -P ALL 1 > " + mpstat_output_filename + " 2>&1 &")

            ###########################
            # Kicks off the training. #
            ###########################
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
            print('HG: # of threads=', len(threads))

            # duration/duration_cnt are reset at every log line;
            # train_time/train_only_cnt accumulate over the whole run.
            duration = 0
            duration_cnt = 0
            train_time = 0
            train_only_cnt = 0

            print("start to train at:", datetime.now())
            for i in range(init_global_step_value,
                           FLAGS.max_number_of_steps + 1):
                #train_step = i+FLAGS.local_train_steps
                train_step = i
                # run optional meta data, or summary, while run train tensor
                if i > init_global_step_value:
                    #if i < FLAGS.max_number_of_steps:

                    # run metadata and train
                    if i % FLAGS.runmeta_every_n_steps == FLAGS.runmeta_every_n_steps - 1:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()

                        loss_value = sess.run(train_tensor,
                                              options=run_options,
                                              run_metadata=run_metadata)
                        train_writer.add_run_metadata(run_metadata,
                                                      'step%d-train' % i)

                        # Create the Timeline object, and write it to a json file
                        fetched_timeline = timeline.Timeline(
                            run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format(
                        )
                        with open(
                                os.path.join(train_dir,
                                             'timeline_' + str(i) + '.json'),
                                'w') as f:
                            f.write(chrome_trace)

                    # record summary and train
                    elif i % FLAGS.summary_every_n_steps == 0:
                        train_summary, loss_value = sess.run(
                            [summary_op, train_tensor])
                        train_writer.add_summary(train_summary, train_step)

                    # train only
                    else:
                        # Only these plain steps are timed, so tracing and
                        # summary overhead does not skew the throughput.
                        start_time = time.time()
                        loss_value = sess.run(train_tensor)
                        train_only_cnt += 1
                        train_time += time.time() - start_time
                        duration_cnt += 1
                        duration += time.time() - start_time

                    if i % FLAGS.log_every_n_steps == 0 and duration_cnt > 0:
                        log_frequency = duration_cnt
                        examples_per_sec = log_frequency * FLAGS.batch_size / duration
                        sec_per_batch = float(duration / log_frequency)
                        summary = tf.Summary()
                        summary.value.add(tag='examples_per_sec',
                                          simple_value=examples_per_sec)
                        summary.value.add(tag='sec_per_batch',
                                          simple_value=sec_per_batch)
                        train_writer.add_summary(summary, train_step)
                        format_str = (
                            '%s: step %d, loss = %.3f (%.1f examples/sec; %.3f sec/batch)'
                        )
                        # Format once so the console and the details file
                        # carry the same timestamp.
                        info = format_str % (datetime.now(), i, loss_value,
                                             examples_per_sec, sec_per_batch)
                        print(info)
                        duration = 0
                        duration_cnt = 0
                        write_detailed_info(info)
                else:
                    # First iteration: evaluate the loss and summaries only,
                    # without running the train op.
                    train_summary, loss_value = sess.run(
                        [summary_op,
                         total_loss])  #loss_value = sess.run(total_loss)
                    train_writer.add_summary(train_summary, train_step)
                    format_str = ('%s: step %d, loss = %.3f')
                    info = format_str % (datetime.now(), i, loss_value)
                    print(info)
                    write_detailed_info(info)

                # record the evaluation accuracy
                is_last_step = (i == FLAGS.max_number_of_steps)
                if i % FLAGS.evaluate_every_n_steps == 0 or is_last_step:

                    test_accuracy, run_metadata = evaluate_accuracy(
                        sess,
                        coord,
                        test_dataset.num_samples,
                        test_images,
                        test_labels,
                        test_images,
                        test_labels,
                        correct_prediction,
                        FLAGS.test_batch_size,
                        run_meta=False)
                    summary = tf.Summary()
                    summary.value.add(tag='accuracy',
                                      simple_value=test_accuracy)
                    train_writer.add_summary(summary, train_step)

                    info = ('%s: step %d, test_accuracy = %.6f') % (
                        datetime.now(), train_step, test_accuracy)
                    print(info)
                    write_detailed_info(info)

                    ###########################
                    # Save model parameters . #
                    ###########################
                    save_path = saver.save(
                        sess, os.path.join(train_dir, 'model.ckpt-' + str(i)))
                    print("HG: Model saved in file: %s" % save_path)

            coord.request_stop()
            coord.join(threads)
            # Flush pending events and release the event file.
            train_writer.close()
            total_time = time.time() - tic
            #            train_time = train_time*(FLAGS.max_number_of_steps - init_global_step_value)/train_only_cnt
            #            info = "HG: training time(min): %.1f, total time(min): %.1f \n" %( train_time/60.0,  total_time/60.0)
            # When every executed step was a trace or summary step, nothing
            # was timed; report NaN instead of dividing by zero.
            if train_only_cnt > 0:
                train_speed = train_time * 1.0 / train_only_cnt
            else:
                train_speed = float('nan')
            # Extrapolate the measured per-step time to the full step budget.
            train_time = train_speed * (
                FLAGS.max_number_of_steps
            )  #- init_global_step_value) #/train_only_cnt
            info = "HG: training speed(sec/batch): %.6f\n" % (train_speed)
            info += "HG: training time(min): %.1f, total time(min): %.1f" % (
                train_time / 60.0, total_time / 60.0)
            print(info)
            write_detailed_info(info)
Esempio n. 57
0
    def setup(self,
              master,
              is_chief,
              global_step,
              ckpt_dir,
              summary_ops,
              global_vars=None,
              local_vars=None,
              save_var_list=None,
              save_steps=None,
              job_name="worker",
              task_index=0,
              async_mode=True):
        """Build the saver, scaffold, hooks and monitored session.

        Stores the results on `self.saver`, `self.scaffold`,
        `self.do_summary` and `self.session`. The lists inside `summary_ops`
        are replaced in place by a merged summary op (or None when empty).

        Arguments:
            master (obj): target passed to the monitored session.
            is_chief (bool): whether this process is the chief worker.
            global_step (obj): global step variable, handed to the summary hook.
            ckpt_dir (str): checkpoint directory; when falsy, no hooks are
                created and nothing is restored.
            summary_ops (dict): maps a flag to a list of summary ops.
            global_vars (list): variables initialized by the init op; when
                None, all global variables are initialized.
            local_vars (list): when given, they are paired with `global_vars`
                and the local init op copies each global value into its
                local counterpart.
            save_var_list (list): variables handed to the saver; falls back
                to `global_vars` when None.
            save_steps (int): step interval for the hooks; when falsy the
                summary hook uses 100 and the checkpoint hook uses 300.
            job_name (str): job name in distributed mode.
            task_index (int): task index in distributed mode.
            async_mode (bool): when True a ConfigProto is built, with device
                filters only for the "learner" job.
        """

        # --- op that initializes the shared variables ---
        if global_vars is None:
            # single-machine
            init_op = tf.global_variables_initializer()
        else:
            logger.info("in executor:")
            for var in global_vars:
                logger.info("{}".format(var))
            init_op = tf.variables_initializer(global_vars)

        # --- readiness checks and local initialization ---
        if local_vars is not None:
            pair_global_vars, pair_local_vars = self.get_variable_pairs(
                global_vars, local_vars)
            for shared_var, replica_var in zip(pair_global_vars,
                                               pair_local_vars):
                logger.info("{}, {}".format(shared_var, replica_var))
            # Each local variable is overwritten with its paired global one.
            copy_ops = [
                tf.assign(replica_var, shared_var)
                for replica_var, shared_var in zip(pair_local_vars,
                                                   pair_global_vars)
            ]
            local_init_op = tf.group(*copy_ops)
            ready_op = tf.report_uninitialized_variables(
                global_vars + list(pair_local_vars))
        else:
            local_init_op = None
            ready_op = tf.report_uninitialized_variables(global_vars)
        ready_for_local_init_op = tf.report_uninitialized_variables(
            global_vars)

        # --- saver ---
        vars_to_save = (save_var_list
                        if save_var_list is not None else global_vars)
        self.saver = tf.train.Saver(
            var_list=vars_to_save,
            reshape=False,
            sharded=False,
            max_to_keep=10,
            keep_checkpoint_every_n_hours=10000.0,
            name=None,
            restore_sequentially=False,
            saver_def=None,
            builder=None,
            defer_build=False,
            allow_empty=True,
            write_version=tf.train.SaverDef.V2,
            pad_step_number=False,
            save_relative_paths=True)

        def init_fn(scaffold, session):
            """Restore the latest checkpoint from `ckpt_dir`, if there is one."""
            if not ckpt_dir:
                return
            latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir=ckpt_dir,
                                                     latest_filename=None)
            if latest_ckpt is None:
                return
            logger.info('begin to restore model from {}'.format(latest_ckpt))
            scaffold.saver.restore(sess=session, save_path=latest_ckpt)

        # --- scaffold ---
        self.scaffold = tf.train.Scaffold(
            init_op=init_op,
            init_feed_dict=None,
            init_fn=init_fn,
            ready_op=ready_op,
            ready_for_local_init_op=ready_for_local_init_op,
            local_init_op=local_init_op,
            summary_op=None,
            saver=self.saver,
            copy_from_scaffold=None)

        # --- summaries: collapse each list into one merged op (or None) ---
        self.do_summary = False
        for flag, op_list in summary_ops.items():
            summary_ops[flag] = (tf.summary.merge(op_list)
                                 if len(op_list) > 0 else None)

        # --- hooks (only when a checkpoint directory is configured) ---
        chief_only_hooks = []
        hooks = []
        if ckpt_dir:
            # Actors and all other jobs write summaries to separate folders.
            summary_subdir = ("actor_summary"
                              if job_name == "actor" else "worker_summary")
            summary_hook = easy_rl.utils.hooks.UpdateSummarySaverHook(
                self,
                global_step,
                job_name,
                task_index,
                save_steps=(save_steps or 100),
                output_dir=os.path.join(ckpt_dir, summary_subdir),
                summary_op=summary_ops)
            saver_hook = tf.train.CheckpointSaverHook(
                checkpoint_dir=ckpt_dir,
                save_steps=(save_steps or 300),
                scaffold=self.scaffold,
                checkpoint_basename='model.ckpt')
            chief_only_hooks.append(saver_hook)
            hooks.append(summary_hook)

        # --- session config: filter devices for asynchronous training ---
        config_proto = None
        if async_mode:
            device_filters = None
            if job_name == "learner":
                own_device = '/job:{job_name}/task:{task_index}'.format(
                    job_name=job_name, task_index=task_index)
                device_filters = ['/job:ps', '/job:memory', own_device]
            config_proto = tf.ConfigProto(device_filters=device_filters)

        self.session = tf.train.MonitoredTrainingSession(
            master=master,
            is_chief=is_chief,
            checkpoint_dir=None,
            scaffold=self.scaffold,
            chief_only_hooks=chief_only_hooks,
            hooks=hooks,
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=config_proto)
Esempio n. 58
0
    def __init__(self,
                 blob,
                 target_size=(224, 224),
                 tf_config=None,
                 is_mvnc=False):  # earlier target_size=(320,240)
        """Load the pose network, build the post-processing ops and warm up.

        Args:
            blob: in MVNC mode, the graph handed to `device.AllocateGraph`;
                otherwise a serialized GraphDef that is parsed and imported
                under the 'TfPoseEstimator' name scope.
            target_size: pair of ints; the warm-up image is fed with shape
                (target_size[1], target_size[0], 3), so it presumably means
                (width, height) -- confirm against callers.
            tf_config: passed as `config` to `tf.Session`.
            is_mvnc: when True, the network output comes from an MVNC device
                and is fed through the 'vectmap' placeholder instead of
                being computed by the imported graph.
        """
        if is_mvnc:
            if mvnc is None:
                print(
                    "Please install MVNC libraries to use --is-mvnc option...")
                quit(-1)
            # NOTE(review): `devices` is not defined in this method; confirm
            # it exists at module level.
            self.device = mvnc.Device(devices[0])
            self.device.openDevice()
            self.obj = self.device.AllocateGraph(blob)
            self.graph = tf.get_default_graph()
            self.persistent_sess = tf.Session(graph=self.graph,
                                              config=tf_config)
            self.tensor_image = None
            # NOTE(review): this placeholder is declared with
            # target_size[0] // 8 before target_size[1] // 8, while the
            # warm-up below feeds target_size[1] // 8 first. The two only
            # agree for square targets -- confirm the intended axis order.
            self.tensor_output = tf.placeholder(tf.float16,
                                                shape=(1, target_size[0] // 8,
                                                       target_size[1] // 8,
                                                       57),
                                                name='vectmap')  #57?
        else:
            self.device = None
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(blob)
            self.graph = tf.get_default_graph()
            tf.import_graph_def(graph_def, name='TfPoseEstimator')
            self.obj = self.persistent_sess = tf.Session(graph=self.graph,
                                                         config=tf_config)
            # Try the primary tensor name first and fall back to the
            # alternative name when the graph does not contain it.
            try:
                self.tensor_image = self.graph.get_tensor_by_name(
                    'TfPoseEstimator/image:0')
            except KeyError:
                self.tensor_image = self.graph.get_tensor_by_name(
                    'TfPoseEstimator/split:0')
            try:
                self.tensor_output = self.graph.get_tensor_by_name(
                    'Openpose/concat_stage7:0')
            except KeyError:
                self.tensor_output = self.graph.get_tensor_by_name(
                    'TfPoseEstimator/Openpose/concat_stage7:0')

        # for op in self.graph.get_operations():
        #     print(op.name)

        # for ts in [n.name for n in tf.get_default_graph().as_graph_def().node]:
        #     print(ts)

        # The first 19 output channels are exposed as the heat map and the
        # remaining ones as the PAF map.
        self.tensor_heatMat = self.tensor_output[:, :, :, :19]
        self.tensor_pafMat = self.tensor_output[:, :, :, 19:]
        self.upsample_size = tf.placeholder(dtype=tf.int32,
                                            shape=(2, ),
                                            name='upsample_size')
        self.tensor_heatMat_up = tf.image.resize_area(
            self.tensor_output[:, :, :, :19],
            self.upsample_size,
            align_corners=False,
            name='upsample_heatmat')
        self.tensor_pafMat_up = tf.image.resize_area(
            self.tensor_output[:, :, :, 19:],
            self.upsample_size,
            align_corners=False,
            name='upsample_pafmat')
        smoother = Smoother({'data': self.tensor_heatMat_up}, 25, 3.0)
        gaussian_heatMat = smoother.get_output()

        # Keep only values that equal the maximum of their 3x3 neighborhood;
        # every other position becomes zero.
        max_pooled_in_tensor = tf.nn.pool(gaussian_heatMat,
                                          window_shape=(3, 3),
                                          pooling_type='MAX',
                                          padding='SAME')
        self.tensor_peaks = tf.where(
            tf.equal(gaussian_heatMat, max_pooled_in_tensor), gaussian_heatMat,
            tf.zeros_like(gaussian_heatMat))

        self.heatMat = self.pafMat = None

        # Initialize whatever is still uninitialized. The report op is built
        # and run once here; evaluating it inside the comprehension filter
        # would create and run a new op for every global variable.
        uninitialized_names = {
            name.decode('utf-8') for name in self.persistent_sess.run(
                tf.report_uninitialized_variables())
        }
        self.persistent_sess.run(
            tf.variables_initializer([
                v for v in tf.global_variables()
                if v.name.split(':')[0] in uninitialized_names
            ]))

        # warm-up
        # MVNC mode feeds a fake network output; otherwise a fake image is
        # pushed through the imported graph. Zeros are used so the warm-up
        # never processes uninitialized memory.
        if is_mvnc:
            warmup_tensor = self.tensor_output
            warmup_input = np.zeros(shape=(target_size[1] // 8,
                                           target_size[0] // 8, 57),
                                    dtype=np.float16)
        else:
            warmup_tensor = self.tensor_image
            warmup_input = np.zeros(shape=(target_size[1], target_size[0], 3),
                                    dtype=np.float32)

        # Run once at full size and once each at 1/2 and 1/4 of it.
        for divisor in (1, 2, 4):
            self.persistent_sess.run(
                [
                    self.tensor_peaks, self.tensor_heatMat_up,
                    self.tensor_pafMat_up
                ],
                feed_dict={
                    warmup_tensor: [warmup_input],
                    self.upsample_size: [
                        target_size[1] // divisor, target_size[0] // divisor
                    ]
                })

        self.is_mvnc = is_mvnc
Esempio n. 59
0
def evaluate_on_train_set():
    "train a network"

    # create session which all the evaluation happens in
    sess = tf.Session()

    # create a filename queue first
    filename_queue, examples_in_database = index_the_database_into_queue(
        FLAGS.database_path, shuffle=True)

    # create an epoch counter
    # there is an additional step with variable initialization in order to get the name of "count up to" in the graph
    batch_counter = tf.Variable(0)
    sess.run(tf.global_variables_initializer())
    batch_counter_increment = tf.assign(
        batch_counter,
        tf.Variable(0).count_up_to(
            np.round(
                (examples_in_database * FLAGS.num_epochs) / FLAGS.batch_size)))

    batch_counter_var_name = sess.run(tf.report_uninitialized_variables())

    epoch_counter = tf.div(batch_counter * FLAGS.batch_size,
                           examples_in_database)

    # create a custom shuffle queue
    ligand_files, current_epoch, label_batch, sparse_image_batch = image_and_label_queue(
        batch_size=FLAGS.batch_size,
        pixel_size=FLAGS.pixel_size,
        side_pixels=FLAGS.side_pixels,
        num_threads=FLAGS.num_threads,
        filename_queue=filename_queue,
        epoch_counter=epoch_counter,
        train=False)

    image_batch = tf.sparse_tensor_to_dense(sparse_image_batch,
                                            validate_indices=False)

    keep_prob = tf.placeholder(tf.float32)
    y_conv = wide_conv_net(image_batch, keep_prob, FLAGS.batch_size)

    # compute softmax over raw predictions
    predictions = tf.nn.softmax(y_conv)[:, 1]

    # restore variables from sleep
    saver = tf.train.Saver()
    saver.restore(sess, FLAGS.saved_session)

    # use
    sess.run(
        tf.contrib.framework.get_variables_by_name(
            batch_counter_var_name[0])[0].initializer)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # create an instance of a class to store predictions
    all_predictios = store_predictions()
    all_predictions_av3 = store_predictions_av3()

    # add_batch(self, ligand_file_path, batch_predictions, batch_labels)

    print "starting evalution..."

    try:
        while True or not coord.should_stop():
            start = time.time()
            batch_num = sess.run([batch_counter_increment])
            my_ligand_files, my_ligand_frames, my_predictions, my_labels = sess.run(
                [ligand_files, current_epoch, predictions, label_batch],
                feed_dict={keep_prob: 1})
            print "current_epoch:", my_ligand_frames[
                0], "batch_num:", batch_num,
            print "\tprediction averages:", np.mean(my_predictions),
            print "\texamples per second:", "%.2f" % (FLAGS.batch_size /
                                                      (time.time() - start))

            all_predictios.add_batch(my_ligand_files, my_ligand_frames,
                                     my_predictions)
            # add_batch(self, ligand_file_path, batch_predictions, batch_labels)
            all_predictions_av3.add_batch(my_ligand_files, my_ligand_frames,
                                          my_predictions, my_labels)

            print "my labels:", my_labels

    except tf.errors.OutOfRangeError:
        print "exiting the loop"

    all_predictios.save()
    all_predictions_av3.save_predictions(FLAGS.predictions_file_path)
Esempio n. 60
0
    selected_variables = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    selected_variables_init_op = tf.variables_initializer(selected_variables)

    saver = tf.train.Saver(var_list=selected_variables)
    summary_writer = tf.summary.FileWriter(LOG_DIR + "__%d" % TASK_INDEX)

    supervisor = tf.train.Supervisor(
        is_chief=(JOB_NAME == 'worker' and TASK_INDEX == 0),
        logdir=LOG_DIR,
        saver=saver,
        init_op=selected_variables_init_op,
        summary_writer=summary_writer,
        summary_op=None,
        ready_op=tf.report_uninitialized_variables(selected_variables),
        global_step=trainer.global_step,
        save_model_secs=
        30  # Number of seconds between the creation of model checkpoints. Defaults to 600 seconds. Pass 0 to disable checkpoints.
    )
    with supervisor.managed_session(
            master=server.target,
            config=tf.ConfigProto(device_filters=[
                "/job:ps", f"/job:worker/task:{TASK_INDEX}/cpu:0"
            ])) as sess, sess.as_default():

        if PRETRAIN_MODEL_PATH:
            saver.restore(
                sess=sess,
                save_path=tf.train.latest_checkpoint(PRETRAIN_MODEL_PATH))