Example 1

def testWaitForSessionLocalInit(self):
    server = server_lib.Server.create_local_server()
    with ops.Graph().as_default() as graph:
      v = variables.Variable(1, name="v")
      w = variables.Variable(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="w")
      sm = session_manager.SessionManager(
          graph=graph,
          ready_op=variables.report_uninitialized_variables(),
          ready_for_local_init_op=variables.report_uninitialized_variables(
              variables.global_variables()),
          local_init_op=w.initializer)

      # Initialize v but not w
      s = session_lib.Session(server.target, graph=graph)
      s.run(v.initializer)

      sess = sm.wait_for_session(server.target, max_wait_secs=3)
      self.assertEqual(
          True,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
      self.assertEqual(
          True,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
      self.assertEqual(1, sess.run(v))
      self.assertEqual(1, sess.run(w))
Example 2

 def testPrepareSessionWithReadyForLocalInitOp(self):
   with ops.Graph().as_default():
     v = variables.Variable(1, name="v")
     w = variables.Variable(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.test_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=variables.report_uninitialized_variables(
             variables.global_variables()),
         local_init_op=w.initializer)
     sess = sm2.prepare_session("", init_op=v.initializer)
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
     self.assertEqual(1, sess.run(v))
     self.assertEqual(1, sess.run(w))
Example 3

  def testPrepareSessionFails(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
    checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
      gfile.DeleteRecursively(checkpoint_dir2)
    except errors.OpError:
      pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with ops.Graph().as_default():
      v = variables.Variable([1.0, 2.0, 3.0], name="v")
      sm = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables())
      saver = saver_lib.Saver({"v": v})
      sess = sm.prepare_session(
          "",
          init_op=variables.global_variables_initializer(),
          saver=saver,
          checkpoint_dir=checkpoint_dir)
      self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
      checkpoint_filename = os.path.join(checkpoint_dir,
                                         "prepare_session_checkpoint")
      saver.save(sess, checkpoint_filename)
    # Create a new Graph and SessionManager and recover.
    with ops.Graph().as_default():
      # Renames the checkpoint directory.
      os.rename(checkpoint_dir, checkpoint_dir2)
      gfile.MakeDirs(checkpoint_dir)
      v = variables.Variable([6.0, 7.0, 8.0], name="v")
      with self.cached_session():
        self.assertEqual(False, variables.is_variable_initialized(v).eval())
      sm = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables())
      saver = saver_lib.Saver({"v": v})
      # This should fail as there's no checkpoint within 2 seconds.
      with self.assertRaisesRegexp(
          RuntimeError, "no init_op or init_fn or local_init_op was given"):
        sess = sm.prepare_session(
            "",
            init_op=None,
            saver=saver,
            checkpoint_dir=checkpoint_dir,
            wait_for_checkpoint=True,
            max_wait_secs=2)
      # Rename the checkpoint directory back.
      gfile.DeleteRecursively(checkpoint_dir)
      os.rename(checkpoint_dir2, checkpoint_dir)
      # This should succeed as there's a checkpoint.
      sess = sm.prepare_session(
          "",
          init_op=None,
          saver=saver,
          checkpoint_dir=checkpoint_dir,
          wait_for_checkpoint=True,
          max_wait_secs=2)
      self.assertEqual(
          True,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
Example 4

  def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
    # which causes recover_session to not run local_init_op, and to return
    # initialized=False

    # Create a checkpoint.
    checkpoint_dir = os.path.join(
        self.get_temp_dir(),
        "recover_session_ready_for_local_init_fails_to_ready_local")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
      pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with ops.Graph().as_default():
      v = variables.Variable(1, name="v")
      sm = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables())
      saver = saver_lib.Saver({"v": v})
      sess, initialized = sm.recover_session(
          "", saver=saver, checkpoint_dir=checkpoint_dir)
      self.assertFalse(initialized)
      sess.run(v.initializer)
      self.assertEqual(1, sess.run(v))
      saver.save(sess,
                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
    # Create a new Graph and SessionManager and recover.
    with ops.Graph().as_default():
      v = variables.Variable(2, name="v")
      w = variables.Variable(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="w")
      with self.cached_session():
        self.assertEqual(False, variables.is_variable_initialized(v).eval())
        self.assertEqual(False, variables.is_variable_initialized(w).eval())
      sm2 = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables(),
          ready_for_local_init_op=variables.report_uninitialized_variables(),
          local_init_op=w.initializer)
      saver = saver_lib.Saver({"v": v})
      sess, initialized = sm2.recover_session(
          "", saver=saver, checkpoint_dir=checkpoint_dir)
      self.assertFalse(initialized)
      self.assertEqual(
          True,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
      self.assertEqual(
          False,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
      self.assertEqual(1, sess.run(v))
Example 5

 def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
   # This test checks for backwards compatibility.
   # In particular, we continue to ensure that recover_session will execute
   # local_init_op exactly once, regardless of whether the session was
   # successfully recovered.
   with ops.Graph().as_default():
     w = variables.Variable(
         1,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=None,
         local_init_op=w.initializer)
     # Try to recover session from None
     sess, initialized = sm2.recover_session(
         "", saver=None, checkpoint_dir=None)
     # Succeeds because recover_session still runs local_init_op
     self.assertFalse(initialized)
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
     self.assertEqual(1, sess.run(w))
Example 6

  def testRecoverSession(self):
    # Create a checkpoint.
    checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
      pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    with ops.Graph().as_default():
      v = variables.Variable(1, name="v")
      sm = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables())
      saver = saver_lib.Saver({"v": v})
      sess, initialized = sm.recover_session(
          "", saver=saver, checkpoint_dir=checkpoint_dir)
      self.assertFalse(initialized)
      sess.run(v.initializer)
      self.assertEqual(1, sess.run(v))
      saver.save(sess,
                 os.path.join(checkpoint_dir, "recover_session_checkpoint"))
    self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
    self._test_recovered_variable(
        checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
            checkpoint_dir))
    # Cannot set both checkpoint_dir and checkpoint_filename_with_path.
    with self.assertRaises(ValueError):
      self._test_recovered_variable(
          checkpoint_dir=checkpoint_dir,
          checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
              checkpoint_dir))
Example 7
  def _init_ready_op(self,
                     ready_op=USE_DEFAULT,
                     ready_for_local_init_op=USE_DEFAULT):
    """Initializes ready_op.

    Args:
      ready_op: `Tensor` to check if the model is initialized.
        If it's set to USE_DEFAULT, creates an op that checks all
        the variables are initialized.
      ready_for_local_init_op: `Tensor` to check if the model is ready to run
        local_init_op.
        If it's set to USE_DEFAULT, creates an op that checks all
        the global variables are initialized.
    """
    if ready_op is Supervisor.USE_DEFAULT:
      ready_op = self._get_first_op_from_collection(ops.GraphKeys.READY_OP)
      if ready_op is None:
        ready_op = variables.report_uninitialized_variables()
        ops.add_to_collection(ops.GraphKeys.READY_OP, ready_op)
    self._ready_op = ready_op

    # ready_for_local_init_op defaults to None for backward compatibility
    if ready_for_local_init_op is Supervisor.USE_DEFAULT:
      ready_for_local_init_op = self._get_first_op_from_collection(
          ops.GraphKeys.READY_FOR_LOCAL_INIT_OP)
    self._ready_for_local_init_op = ready_for_local_init_op
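
A note on the fallback chain above: with USE_DEFAULT, the Supervisor first looks for an existing op in the READY_OP graph collection and only builds a fresh report_uninitialized_variables() op if none is found. A minimal sketch of pre-seeding that collection (hypothetical usage written against the public TF 1.x API, not part of the snippet above):

import tensorflow as tf

g = tf.Graph()
with g.as_default():
    v = tf.Variable(1, name="v")
    # Pre-seed the collection; _init_ready_op will find and reuse this op
    # instead of building a new report_uninitialized_variables() op.
    custom_ready_op = tf.report_uninitialized_variables()
    tf.add_to_collection(tf.GraphKeys.READY_OP, custom_ready_op)
    sv = tf.train.Supervisor(graph=g, logdir=None)
    assert sv.ready_op is custom_ready_op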
Example 8
 def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
     # This test checks for backwards compatibility.
     # In particular, we continue to ensure that recover_session will execute
     # local_init_op exactly once, regardless of whether the session was
     # successfully recovered.
     with ops.Graph().as_default():
         w = variables.VariableV1(
             1,
             trainable=False,
             collections=[ops.GraphKeys.LOCAL_VARIABLES],
             name="w")
         with self.cached_session():
             self.assertEqual(False,
                              variables.is_variable_initialized(w).eval())
         sm2 = session_manager.SessionManager(
             ready_op=variables.report_uninitialized_variables(),
             ready_for_local_init_op=None,
             local_init_op=w.initializer)
         # Try to recover session from None
         sess, initialized = sm2.recover_session("",
                                                 saver=None,
                                                 checkpoint_dir=None)
         # Succeeds because recover_session still runs local_init_op
         self.assertFalse(initialized)
         self.assertEqual(
             True,
             variables.is_variable_initialized(
                 sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
         self.assertEqual(1, sess.run(w))
Example 9
    def testRecoverSession(self):
        # Create a checkpoint.
        checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
        try:
            gfile.DeleteRecursively(checkpoint_dir)
        except errors.OpError:
            pass  # Ignore
        gfile.MakeDirs(checkpoint_dir)

        with ops.Graph().as_default():
            v = variables.VariableV1(1, name="v")
            sm = session_manager.SessionManager(
                ready_op=variables.report_uninitialized_variables())
            saver = saver_lib.Saver({"v": v})
            sess, initialized = sm.recover_session(
                "", saver=saver, checkpoint_dir=checkpoint_dir)
            self.assertFalse(initialized)
            sess.run(v.initializer)
            self.assertEqual(1, sess.run(v))
            saver.save(
                sess, os.path.join(checkpoint_dir,
                                   "recover_session_checkpoint"))
        self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
        self._test_recovered_variable(
            checkpoint_filename_with_path=checkpoint_management.
            latest_checkpoint(checkpoint_dir))
        # Cannot set both checkpoint_dir and checkpoint_filename_with_path.
        with self.assertRaises(ValueError):
            self._test_recovered_variable(
                checkpoint_dir=checkpoint_dir,
                checkpoint_filename_with_path=checkpoint_management.
                latest_checkpoint(checkpoint_dir))
Example 10
    def _init_ready_op(self,
                       ready_op=USE_DEFAULT,
                       ready_for_local_init_op=USE_DEFAULT):
        """Initializes ready_op.

    Args:
      ready_op: `Tensor` to check if the model is initialized.
        If it's set to USE_DEFAULT, creates an op that checks all
        the variables are initialized.
      ready_for_local_init_op: `Tensor` to check if the model is ready to run
        local_init_op.
        If it's set to USE_DEFAULT, creates an op that checks all
        the global variables are initialized.
    """
        if ready_op is Supervisor.USE_DEFAULT:
            ready_op = self._get_first_op_from_collection(
                ops.GraphKeys.READY_OP)
            if ready_op is None:
                ready_op = variables.report_uninitialized_variables()
                ops.add_to_collection(ops.GraphKeys.READY_OP, ready_op)
        self._ready_op = ready_op

        # ready_for_local_init_op defaults to None for backward compatibility
        if ready_for_local_init_op is Supervisor.USE_DEFAULT:
            ready_for_local_init_op = self._get_first_op_from_collection(
                ops.GraphKeys.READY_FOR_LOCAL_INIT_OP)
        self._ready_for_local_init_op = ready_for_local_init_op
Example 11
        def get_session(is_chief):
            g = ops.Graph()
            with g.as_default():
                with ops.device("/job:localhost"):
                    v = variables.VariableV1(
                        1,
                        name="default_ready_for_local_init_op_v_" + str(uid))
                    vadd = v.assign_add(1)
                    w = variables.VariableV1(
                        v,
                        trainable=False,
                        collections=[ops.GraphKeys.LOCAL_VARIABLES],
                        name="default_ready_for_local_init_op_w_" + str(uid))
                    ready_for_local_init_op = variables.report_uninitialized_variables(
                        variables.global_variables())
            sv = supervisor.Supervisor(
                logdir=logdir,
                is_chief=is_chief,
                graph=g,
                recovery_wait_secs=1,
                init_op=v.initializer,
                ready_for_local_init_op=ready_for_local_init_op)
            sess = sv.prepare_or_wait_for_session(server.target)

            return sv, sess, v, vadd, w
Example 12
    def testWaitForSessionLocalInit(self):
        server = server_lib.Server.create_local_server()
        with ops.Graph().as_default() as graph:
            v = variables.VariableV1(1, name="v")
            w = variables.VariableV1(
                v,
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                name="w")
            sm = session_manager.SessionManager(
                graph=graph,
                ready_op=variables.report_uninitialized_variables(),
                ready_for_local_init_op=variables.
                report_uninitialized_variables(variables.global_variables()),
                local_init_op=w.initializer)

            # Initialize v but not w
            s = session_lib.Session(server.target, graph=graph)
            s.run(v.initializer)

            sess = sm.wait_for_session(server.target, max_wait_secs=3)
            self.assertEqual(
                True,
                variables.is_variable_initialized(
                    sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
            self.assertEqual(
                True,
                variables.is_variable_initialized(
                    sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
            self.assertEqual(1, sess.run(v))
            self.assertEqual(1, sess.run(w))
Example 13
 def testPrepareSessionSucceedsWithInitFn(self):
     with ops.Graph().as_default():
         v = variables.Variable([125], name="v")
         sm = session_manager.SessionManager(
             ready_op=variables.report_uninitialized_variables())
         sess = sm.prepare_session(
             "", init_fn=lambda sess: sess.run(v.initializer))
         self.assertAllClose([125], sess.run(v))
Example 14
 def testPrepareSessionSucceedsWithInitFn(self):
   with ops.Graph().as_default():
     v = variables.Variable([125], name="v")
     sm = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     sess = sm.prepare_session(
         "", init_fn=lambda sess: sess.run(v.initializer))
     self.assertAllClose([125], sess.run(v))
Example 15
 def testPrepareSessionSucceeds(self):
     with ops.Graph().as_default():
         v = variables.Variable([1.0, 2.0, 3.0], name="v")
         sm = session_manager.SessionManager(
             ready_op=variables.report_uninitialized_variables())
         sess = sm.prepare_session(
             "", init_op=variables.global_variables_initializer())
         self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
Example 16
 def testPrepareSessionSucceeds(self):
   with ops.Graph().as_default():
     v = variables.Variable([1.0, 2.0, 3.0], name="v")
     sm = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     sess = sm.prepare_session(
         "", init_op=variables.global_variables_initializer())
     self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
Example 17
 def testAssertVariablesInitialized(self):
   with ops.Graph().as_default(), self.cached_session() as sess:
     v = variables.Variable([1, 2], name="v")
     w = variables.Variable([3, 4], name="w")
     _ = v, w
     uninited = variables.report_uninitialized_variables()
     self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
     self.evaluate(variables.global_variables_initializer())
     self.assertEqual(0, self.evaluate(uninited).size)
Example 18
  def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    with ops.Graph().as_default() as graph:
      v = variables.Variable(1, name="v")
      w = variables.Variable(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="w")
      sm = session_manager.SessionManager(
          graph=graph,
          ready_op=variables.report_uninitialized_variables(),
          ready_for_local_init_op=variables.report_uninitialized_variables(),
          local_init_op=w.initializer)

      with self.assertRaises(errors_impl.DeadlineExceededError):
        # Times out because w fails to be initialized due to the
        # overly restrictive ready_for_local_init_op.
        sm.wait_for_session("", max_wait_secs=3)
Example 19

 def testAssertVariablesInitialized(self):
   with ops.Graph().as_default(), self.cached_session() as sess:
     v = variables.Variable([1, 2], name="v")
     w = variables.Variable([3, 4], name="w")
     _ = v, w
     uninited = variables.report_uninitialized_variables()
     self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
     variables.global_variables_initializer().run()
     self.assertEqual(0, self.evaluate(uninited).size)
Example 20
  def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
    with ops.Graph().as_default() as graph:
      v = variables.VariableV1(1, name="v")
      w = variables.VariableV1(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="w")
      sm = session_manager.SessionManager(
          graph=graph,
          ready_op=variables.report_uninitialized_variables(),
          ready_for_local_init_op=variables.report_uninitialized_variables(),
          local_init_op=w.initializer)

      with self.assertRaises(errors_impl.DeadlineExceededError):
        # Times out because w fails to be initialized due to the
        # overly restrictive ready_for_local_init_op.
        sm.wait_for_session("", max_wait_secs=3)
Example 21
  def testWaitForSessionReturnsNoneAfterTimeout(self):
    with ops.Graph().as_default():
      variables.Variable(1, name="v")
      sm = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables(),
          recovery_wait_secs=1)

      # Set max_wait_secs to allow us to try a few times.
      with self.assertRaises(errors.DeadlineExceededError):
        sm.wait_for_session(master="", max_wait_secs=3)
Example 22
    def testWaitForSessionReturnsNoneAfterTimeout(self):
        with ops.Graph().as_default():
            variables.Variable(1, name="v")
            sm = session_manager.SessionManager(
                ready_op=variables.report_uninitialized_variables(),
                recovery_wait_secs=1)

            # Set max_wait_secs to allow us to try a few times.
            with self.assertRaises(errors.DeadlineExceededError):
                sm.wait_for_session(master="", max_wait_secs=3)
Example 23
 def testInitWithNoneLocalInitOpError(self):
   # Creating a SessionManager with a None local_init_op but
   # non-None ready_for_local_init_op raises ValueError
   with self.assertRaisesRegexp(ValueError,
                                "If you pass a ready_for_local_init_op "
                                "you must also pass a local_init_op "):
     session_manager.SessionManager(
         ready_for_local_init_op=variables.report_uninitialized_variables(
             variables.global_variables()),
         local_init_op=None)
Example 24
 def testInitWithNoneLocalInitOpError(self):
   # Creating a SessionManager with a None local_init_op but
   # non-None ready_for_local_init_op raises ValueError
   with self.assertRaisesRegex(
       ValueError, "If you pass a ready_for_local_init_op "
       "you must also pass a local_init_op "):
     session_manager.SessionManager(
         ready_for_local_init_op=variables.report_uninitialized_variables(
             variables.global_variables()),
         local_init_op=None)
Example 25
 def testVariableList(self):
   with ops.Graph().as_default(), self.cached_session() as sess:
     v = variables.VariableV1([1, 2], name="v")
     w = variables.VariableV1([3, 4], name="w")
     uninited = variables.report_uninitialized_variables()
     self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
     self.evaluate(w.initializer)
     self.assertAllEqual(np.array([b"v"]), self.evaluate(uninited))
     v.initializer.run()
     self.assertEqual(0, self.evaluate(uninited).size)
Example 27
 def testPrepareSessionSucceedsWithLocalInitFeedDict(self):
     with ops.Graph().as_default():
         p = array_ops.placeholder(dtypes.float32, shape=(3, ))
         v = variables.VariableV1(
             p, name="v", collections=[ops.GraphKeys.LOCAL_VARIABLES])
         sm = session_manager.SessionManager(
             local_init_op=v.initializer,
             local_init_feed_dict={p: [1.0, 2.0, 3.0]},
             ready_op=variables.report_uninitialized_variables())
         sess = sm.prepare_session("")
         self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
Example 28
 def testPrepareSessionWithReadyNotReadyForLocal(self):
   with ops.Graph().as_default():
     v = variables.VariableV1(1, name="v")
     w = variables.VariableV1(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=variables.report_uninitialized_variables(
             variables.global_variables()),
         local_init_op=w.initializer)
     with self.assertRaisesRegex(
         RuntimeError,
         "Init operations did not make model ready for local_init"):
       sm2.prepare_session("", init_op=None)
Example 29
 def testPrepareSessionSucceedsWithInitFeedDict(self):
     with ops.Graph().as_default():
         p = array_ops.placeholder(dtypes.float32, shape=(3, ))
         v = variables.Variable(p, name="v")
         sm = session_manager.SessionManager(
             ready_op=variables.report_uninitialized_variables())
         sess = sm.prepare_session(
             "",
             init_op=variables.global_variables_initializer(),
             init_feed_dict={p: [1.0, 2.0, 3.0]})
         self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
Example 30
 def testPrepareSessionWithReadyNotReadyForLocal(self):
   with ops.Graph().as_default():
     v = variables.Variable(1, name="v")
     w = variables.Variable(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=variables.report_uninitialized_variables(
             variables.global_variables()),
         local_init_op=w.initializer)
     with self.assertRaisesRegexp(
         RuntimeError,
         "Init operations did not make model ready for local_init"):
       sm2.prepare_session("", init_op=None)
Example 31
 def testPrepareSessionSucceedsWithInitFeedDict(self):
   with ops.Graph().as_default():
     p = array_ops.placeholder(dtypes.float32, shape=(3,))
     v = variables.Variable(p, name="v")
     sm = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     sess = sm.prepare_session(
         "",
         init_op=variables.global_variables_initializer(),
         init_feed_dict={p: [1.0, 2.0, 3.0]})
     self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
Example 32
 def testPrepareSessionWithReadyForLocalInitOp(self):
   with ops.Graph().as_default():
     v = variables.VariableV1(1, name="v")
     w = variables.VariableV1(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     x = variables.VariableV1(
         3 * v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="x")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
       self.assertEqual(False, variables.is_variable_initialized(x).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=variables.report_uninitialized_variables(
             variables.global_variables()),
         local_init_op=[w.initializer, x.initializer])
     sess = sm2.prepare_session("", init_op=v.initializer)
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("x:0")).eval(session=sess))
     self.assertEqual(1, sess.run(v))
     self.assertEqual(1, sess.run(w))
     self.assertEqual(3, sess.run(x))
Example 33
  def _init_ready_op(self, ready_op=USE_DEFAULT):
    """Initializes ready_op.

    Args:
      ready_op: `Tensor` to check if the model is initialized.
        If it's set to USE_DEFAULT, creates an op that checks all
        the variables are initialized.
    """
    if ready_op is Supervisor.USE_DEFAULT:
      ready_op = self._get_first_op_from_collection(ops.GraphKeys.READY_OP)
      if ready_op is None:
        ready_op = variables.report_uninitialized_variables()
        ops.add_to_collection(ops.GraphKeys.READY_OP, ready_op)
    self._ready_op = ready_op
Example 35
    def _between_graph_worker_fn(self, strategy):
        context = distribute_coordinator_context.get_current_worker_context()
        self.assertTrue(context is not None)
        with self._test_session(target=context.master_target) as sess:
            with ops.device("/job:ps/task:0"):
                # TODO(yuefengz): investigate why not using resource variable will make
                # the test flaky.
                x = variable_scope.get_variable("x",
                                                initializer=10.0,
                                                use_resource=True)
            with ops.device("/job:ps/task:1"):
                y = variable_scope.get_variable("y",
                                                initializer=20.0,
                                                use_resource=True)

            x_add = x.assign_add(2.0)
            y_sub = y.assign_sub(2.0)
            train_op = control_flow_ops.group([x_add, y_sub])

            if context.is_chief:
                variables.global_variables_initializer().run()

            # Synchronize workers after initialization.
            if context.has_barrier:
                context.wait_for_other_workers()
            else:
                while True:
                    uninit_vars = sess.run(
                        variables.report_uninitialized_variables())
                    # pylint: disable=g-explicit-length-test
                    if len(uninit_vars) == 0:
                        break

            sess.run(train_op)

            # Synchronize workers after one step to make sure they all have finished
            # training.
            if context.has_barrier:
                context.wait_for_other_workers()
            else:
                self._barrier.wait()

            x_val, y_val = sess.run([x, y])

            self.assertEqual(x_val, 16.0)
            self.assertEqual(y_val, 14.0)
            if x_val == 16.0 and y_val == 14.0:
                with self._lock:
                    self._result_correct += 1
Example 36
 def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default() as graph:
         v = variables.Variable(1, name="v")
         w = variables.Variable(v,
                                trainable=False,
                                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                                name="w")
         sm = session_manager.SessionManager(
             graph=graph,
             ready_op=variables.report_uninitialized_variables(),
             ready_for_local_init_op=None,
             local_init_op=w.initializer)
     with self.assertRaisesRegexp(errors_impl.DeadlineExceededError,
                                  "Session was not ready after waiting.*"):
         sm.wait_for_session("", max_wait_secs=3)
Example 37
 def test_evaluate_ready_for_local_init(self):
     with ops.Graph().as_default() as g, self.test_session(g):
         variables_lib.create_global_step()
         v = variables.Variable(1.0)
         variables.Variable(v + 1,
                            collections=[ops.GraphKeys.LOCAL_VARIABLES],
                            trainable=False)
         ready_for_local_init_op = variables.report_uninitialized_variables(
             variables.global_variables())
         ops.add_to_collection(ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
                               ready_for_local_init_op)
         _ = learn.graph_actions.evaluate(g,
                                          output_dir=self._output_dir,
                                          checkpoint_path=None,
                                          eval_dict={'a': v},
                                          max_steps=1)
Example 38
 def testPrepareSessionDidNotInitLocalVariableList(self):
   with ops.Graph().as_default():
     v = variables.Variable(1, name="v")
     w = variables.Variable(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     with self.assertRaisesRegexp(RuntimeError,
                                  "Init operations did not make model ready"):
       sm2.prepare_session("", init_op=[v.initializer])
Example 39
 def testPrepareSessionDidNotInitLocalVariableList(self):
   with ops.Graph().as_default():
     v = variables.VariableV1(1, name="v")
     w = variables.VariableV1(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     with self.assertRaisesRegex(RuntimeError,
                                 "Init operations did not make model ready"):
       sm2.prepare_session("", init_op=[v.initializer])
Example 40
 def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
   with ops.Graph().as_default() as graph:
     v = variables.Variable(1, name="v")
     w = variables.Variable(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     sm = session_manager.SessionManager(
         graph=graph,
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=None,
         local_init_op=w.initializer)
   with self.assertRaisesRegexp(errors_impl.DeadlineExceededError,
                                "Session was not ready after waiting.*"):
     sm.wait_for_session("", max_wait_secs=3)
Example 41
 def test_evaluate_ready_for_local_init(self):
   with ops.Graph().as_default() as g, self.test_session(g):
     variables_lib.create_global_step()
     v = variables.Variable(1.0)
     w = variables.Variable(
         v + 1, collections=[ops.GraphKeys.LOCAL_VARIABLES], trainable=False)
     ready_for_local_init_op = variables.report_uninitialized_variables(
         variables.global_variables())
     ops.add_to_collection(ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
                           ready_for_local_init_op)
     _ = learn.graph_actions.evaluate(
         g,
         output_dir=self._output_dir,
         checkpoint_path=None,
         eval_dict={'a': v},
         max_steps=1)
Example 42
 def testPrepareSessionWithCyclicInitializer(self):
   # Regression test. Previously Variable._build_initializer_expr would enter
   # into an infinite recursion when the variable's initial_value involved
   # cyclic dependencies.
   with ops.Graph().as_default():
     i = control_flow_ops.while_loop(lambda i: i < 1, lambda i: i + 1, [0])
     v = variables.VariableV1(array_ops.identity(i), name="v")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
     sm = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     sess = sm.prepare_session("", init_op=v.initializer)
     self.assertEqual(1, sess.run(v))
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
Example 43

  def _between_graph_worker_fn(self, strategy):
    context = distribute_coordinator_context.get_current_worker_context()
    self.assertTrue(context is not None)
    with self._test_session(target=context.master_target) as sess:
      with ops.device("/job:ps/task:0"):
        # TODO(yuefengz): investigate why not using resource variable will make
        # the test flaky.
        x = variable_scope.get_variable(
            "x", initializer=10.0, use_resource=True)
      with ops.device("/job:ps/task:1"):
        y = variable_scope.get_variable(
            "y", initializer=20.0, use_resource=True)

      x_add = x.assign_add(2.0)
      y_sub = y.assign_sub(2.0)
      train_op = control_flow_ops.group([x_add, y_sub])

      if context.is_chief:
        self.evaluate(variables.global_variables_initializer())

      # Synchronize workers after initialization.
      if context.has_barrier:
        context.wait_for_other_workers()
      else:
        while True:
          uninit_vars = sess.run(variables.report_uninitialized_variables())
          # pylint: disable=g-explicit-length-test
          if len(uninit_vars) == 0:
            break

      sess.run(train_op)

      # Synchronize workers after one step to make sure they all have finished
      # training.
      if context.has_barrier:
        context.wait_for_other_workers()
      else:
        self._barrier.wait()

      x_val, y_val = sess.run([x, y])

      self.assertEqual(x_val, 16.0)
      self.assertEqual(y_val, 14.0)
      if x_val == 16.0 and y_val == 14.0:
        with self._lock:
          self._result_correct += 1
Example 44
 def testPrepareSessionWithCyclicInitializer(self):
   # Regression test. Previously Variable._build_initializer_expr would enter
   # into an infinite recursion when the variable's initial_value involved
   # cyclic dependencies.
   with ops.Graph().as_default():
     i = control_flow_ops.while_loop(lambda i: i < 1, lambda i: i + 1, [0])
     v = variables.Variable(array_ops.identity(i), name="v")
     with self.cached_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
     sm = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     sess = sm.prepare_session("", init_op=v.initializer)
     self.assertEqual(1, sess.run(v))
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
Example 45

 def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
   with ops.Graph().as_default():
     v = variables.Variable(1, name="v")
     w = variables.Variable(
         v,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         name="w")
     with self.test_session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
       self.assertEqual(False, variables.is_variable_initialized(w).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables(),
         ready_for_local_init_op=None,
         local_init_op=w.initializer)
   with self.assertRaisesRegexp(errors_impl.FailedPreconditionError,
                                "Attempting to use uninitialized value v"):
     sm2.prepare_session("", init_op=None)
Example 46
    def testRecoverSessionFailsStillRunsLocalInitOp(self):
        # Create a checkpoint.
        checkpoint_dir = os.path.join(
            self.get_temp_dir(),
            "recover_session_ready_for_local_init_fails_stil_run")
        try:
            gfile.DeleteRecursively(checkpoint_dir)
        except errors.OpError:
            pass  # Ignore
        gfile.MakeDirs(checkpoint_dir)

        # Create a new Graph and SessionManager and recover.
        with ops.Graph().as_default():
            v = variables.VariableV1(2, name="v")
            w = variables.VariableV1(
                1,
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                name="w")
            with self.cached_session():
                self.assertEqual(False,
                                 variables.is_variable_initialized(v).eval())
                self.assertEqual(False,
                                 variables.is_variable_initialized(w).eval())
            sm2 = session_manager.SessionManager(
                ready_op=variables.report_uninitialized_variables(),
                ready_for_local_init_op=None,
                local_init_op=w.initializer)
            saver = saver_lib.Saver({"v": v})
            sess, initialized = sm2.recover_session(
                "",
                saver=saver,
                checkpoint_dir=checkpoint_dir,
                wait_for_checkpoint=False)
            self.assertFalse(initialized)
            self.assertEqual(
                False,
                variables.is_variable_initialized(
                    sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
            self.assertEqual(
                True,
                variables.is_variable_initialized(
                    sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
            self.assertEqual(1, sess.run(w))
Example 47
  def testRecoverSessionFailsStillRunsLocalInitOp(self):
    # Create a checkpoint.
    checkpoint_dir = os.path.join(
        self.get_temp_dir(),
        "recover_session_ready_for_local_init_fails_stil_run")
    try:
      gfile.DeleteRecursively(checkpoint_dir)
    except errors.OpError:
      pass  # Ignore
    gfile.MakeDirs(checkpoint_dir)

    # Create a new Graph and SessionManager and recover.
    with ops.Graph().as_default():
      v = variables.Variable(2, name="v")
      w = variables.Variable(
          1,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="w")
      with self.cached_session():
        self.assertEqual(False, variables.is_variable_initialized(v).eval())
        self.assertEqual(False, variables.is_variable_initialized(w).eval())
      sm2 = session_manager.SessionManager(
          ready_op=variables.report_uninitialized_variables(),
          ready_for_local_init_op=None,
          local_init_op=w.initializer)
      saver = saver_lib.Saver({"v": v})
      sess, initialized = sm2.recover_session(
          "",
          saver=saver,
          checkpoint_dir=checkpoint_dir,
          wait_for_checkpoint=False)
      self.assertFalse(initialized)
      self.assertEqual(
          False,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
      self.assertEqual(
          True,
          variables.is_variable_initialized(
              sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
      self.assertEqual(1, sess.run(w))
Example 48
 def _test_recovered_variable(self,
                              checkpoint_dir=None,
                              checkpoint_filename_with_path=None):
   # Create a new Graph and SessionManager and recover from a checkpoint.
   with ops.Graph().as_default():
     v = variables.Variable(2, name="v")
     with session_lib.Session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     saver = saver_lib.Saver({"v": v})
     sess, initialized = sm2.recover_session(
         "",
         saver=saver,
         checkpoint_dir=checkpoint_dir,
         checkpoint_filename_with_path=checkpoint_filename_with_path)
     self.assertTrue(initialized)
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
     self.assertEqual(1, sess.run(v))
Example 49
 def _test_recovered_variable(self,
                              checkpoint_dir=None,
                              checkpoint_filename_with_path=None):
   # Create a new Graph and SessionManager and recover from a checkpoint.
   with ops.Graph().as_default():
     v = variables.VariableV1(2, name="v")
     with session_lib.Session():
       self.assertEqual(False, variables.is_variable_initialized(v).eval())
     sm2 = session_manager.SessionManager(
         ready_op=variables.report_uninitialized_variables())
     saver = saver_lib.Saver({"v": v})
     sess, initialized = sm2.recover_session(
         "",
         saver=saver,
         checkpoint_dir=checkpoint_dir,
         checkpoint_filename_with_path=checkpoint_filename_with_path)
     self.assertTrue(initialized)
     self.assertEqual(
         True,
         variables.is_variable_initialized(
             sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
     self.assertEqual(1, sess.run(v))
Example 50
    def get_session(is_chief):
      g = ops.Graph()
      with g.as_default():
        with ops.device("/job:local"):
          v = variables.VariableV1(
              1.0, name="ready_for_local_init_op_restore_v_" + str(uid))
          vadd = v.assign_add(1)
          w = variables.VariableV1(
              v,
              trainable=False,
              collections=[ops.GraphKeys.LOCAL_VARIABLES],
              name="ready_for_local_init_op_restore_w_" + str(uid))
          ready_for_local_init_op = variables.report_uninitialized_variables(
              variables.global_variables())
      sv = supervisor.Supervisor(
          logdir=logdir,
          is_chief=is_chief,
          graph=g,
          recovery_wait_secs=1,
          ready_for_local_init_op=ready_for_local_init_op)
      sess = sv.prepare_or_wait_for_session(server.target)

      return sv, sess, v, vadd, w
Example 51
 def testZeroSizeVarInitialized(self):
   with ops.Graph().as_default(), self.cached_session() as sess:
     v = variables.Variable(array_ops.zeros([0, 2]), name="v")
     uninited = variables.report_uninitialized_variables()
     v.initializer.run()  # not strictly necessary
     self.assertEqual(0, self.evaluate(uninited).size)
Example 52
    def _apply_model_average(self, lvars_and_gvars, name=None):
        """Apply local weights to global variables.

    This contains most of the synchronization implementation.

    Args:
      lvars_and_gvars: List of (local_vars, global_vars) pairs.
      name: Optional name for the returned operation. Defaults to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If lvars_and_gvars is empty.
    """
        if not lvars_and_gvars:
            raise ValueError("Must supply at least one variable")

        train_ops = []
        aggregated_lvars = []

        model_reassign_ops = []

        global_vars = [g for v, g in lvars_and_gvars if v is not None]

        # local_anchor op will be placed on this worker task by default.
        local_anchor = control_flow_ops.no_op()
        # Colocating local_step variable prevents it being placed on the PS.
        with ops.colocate_with(local_anchor):
            self._local_step = variables.Variable(
                initial_value=0,
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                dtype=self._global_step.dtype.base_dtype,
                name="%s_local_step" % self._name)

        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   self._global_step)
        chief_init_ops = [self.local_step_init_op]
        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.global_variables())

        with ops.name_scope(None, self._name):
            for lvar, gvar in lvars_and_gvars:
                lvar = ops.convert_to_tensor(lvar)
                with ops.device(gvar.device):
                    # Dense variables.
                    if lvar is None:
                        aggregated_lvars.append(None)  # pass-through.
                        continue
                    elif isinstance(lvar, ops.Tensor):
                        lvar_accum = data_flow_ops.ConditionalAccumulator(
                            lvar.dtype,
                            shape=gvar.get_shape(),
                            shared_name=gvar.name + "/lvar_accum")
                        train_ops.append(
                            lvar_accum.apply_grad(lvar,
                                                  local_step=self._local_step))
                        aggregated_lvars.append(
                            lvar_accum.take_grad(self._replicas_to_aggregate))
                    else:
                        if not isinstance(lvar, ops.IndexedSlices):
                            raise ValueError("Unknown model variable type!")
                        lvar_accum = data_flow_ops.SparseConditionalAccumulator(
                            lvar.dtype,
                            shape=(),
                            shared_name=gvar.name + "/model_variable_accum")
                        train_ops.append(
                            lvar_accum.apply_indexed_slices_grad(
                                lvar, local_step=self._local_step))
                        aggregated_lvars.append(
                            lvar_accum.take_indexed_slices_grad(
                                self._replicas_to_aggregate))

                    self._accumulator_list.append((lvar_accum, gvar.device))

            # sync_op will be assigned to the same device as the global step.
            with ops.device(self._global_step.device), ops.name_scope(""):
                for avg_var, gvar in zip(aggregated_lvars, global_vars):
                    model_reassign_ops.append(state_ops.assign(gvar, avg_var))
                model_reassign_ops.append(
                    state_ops.assign_add(self._global_step, 1))
                update_op = control_flow_ops.group(*(model_reassign_ops))

            # Create token queue.
            with ops.device(self._global_step.device), ops.name_scope(""):
                sync_token_queue = (data_flow_ops.FIFOQueue(
                    -1,
                    self._global_step.dtype.base_dtype,
                    shapes=(),
                    name="sync_token_q",
                    shared_name="sync_token_q"))
                self._sync_token_queue = sync_token_queue

                # dummy_queue is passed to the queue runner. Don't use the real
                # queues, because the queue runner doesn't automatically reopen
                # them once it has closed queues on the PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    name="dummy_queue",
                    shared_name="dummy_queue"))

            with ops.device(self._global_step.device), ops.name_scope(""):
                # Replicas have to wait until they can get a token from the token queue.
                with ops.control_dependencies(train_ops):
                    token = sync_token_queue.dequeue()
                train_op = state_ops.assign(self._local_step, token)

                with ops.control_dependencies([update_op]):
                    # Sync_op needs to insert tokens to the token queue at the end of the
                    # step so the replicas can fetch them to start the next step.
                    tokens = array_ops.fill([self._tokens_per_step],
                                            self._global_step)
                    sync_op = sync_token_queue.enqueue_many((tokens, ))

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])
            for accum, dev in self._accumulator_list:
                with ops.device(dev):
                    chief_init_ops.append(
                        accum.set_global_step(self._global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._average_applied = True
            return train_op
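
The heart of the synchronization above is the token queue: the chief enqueues tokens_per_step tokens after the averaged update is applied, and each replica blocks on a dequeue before starting its next batch. A minimal, self-contained sketch of that handoff (TF 1.x graph mode assumed; names here are illustrative, not the real queue names):

import tensorflow as tf

global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
tokens_per_step = 2
sync_token_queue = tf.FIFOQueue(-1, tf.int64, shapes=(),
                                shared_name="sync_token_q_demo")

# Chief: after applying the averaged update, release one token per replica.
sync_op = sync_token_queue.enqueue_many(
    (tf.fill([tokens_per_step], global_step),))
# Replica: block until a token arrives, then adopt it as the local step.
token = sync_token_queue.dequeue()

with tf.Session() as sess:
    sess.run(global_step.initializer)
    sess.run(sync_op)
    print(sess.run(token))  # 0: this replica may start the next batch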
Example 53

 def testNoVars(self):
   with ops.Graph().as_default(), self.cached_session() as sess:
     uninited = variables.report_uninitialized_variables()
     self.assertEqual(0, self.evaluate(uninited).size)
Example 54
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    data_flow_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(
                    sync_optimizer,
                    sync_replicas_optimizer.SyncReplicasOptimizer):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = sync_optimizer.chief_init_op
                    else:
                        local_init_op = sync_optimizer.local_step_init_op
                ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            if not isinstance(sync_optimizer,
                              (sync_replicas_optimizer.SyncReplicasOptimizer)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                )

            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxsize))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
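# A hedged usage sketch for the train() loop above, assuming a TF 1.x
# environment where tf.contrib.slim is available; the model, logdir, and
# step counts below are hypothetical.
import tensorflow as tf

slim = tf.contrib.slim

w = tf.Variable(2.0, name="w")
loss = tf.square(w - 1.0)
optimizer = tf.train.GradientDescentOptimizer(0.1)

# create_train_op wraps the loss and optimizer into the train_op expected
# by train(): each run computes the loss and applies the gradients.
train_op = slim.learning.create_train_op(loss, optimizer)

# Runs until number_of_steps, checkpointing and writing summaries under
# logdir, and returns the final loss value.
final_loss = slim.learning.train(
    train_op,
    logdir='/tmp/slim_train_demo',  # hypothetical log directory
    number_of_steps=100,
    log_every_n_steps=10)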
 def default_ready_for_local_init_op():
   return variables.report_uninitialized_variables(
       variables.global_variables())
 def default_ready_op():
   return array_ops.concat([
       variables.report_uninitialized_variables(),
       resources.report_uninitialized_resources()
   ], 0)
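# A hedged sketch (not part of the original snippets) of how these defaults
# are interpreted: SessionManager treats an empty report as "model ready".
# Imports follow the internal-module style used throughout these snippets.
from tensorflow.python.client import session as session_lib
from tensorflow.python.framework import ops
from tensorflow.python.ops import variables

with ops.Graph().as_default() as graph:
    v = variables.Variable(1.0, name="v")
    ready_op = variables.report_uninitialized_variables()
    with session_lib.Session(graph=graph) as sess:
        print(sess.run(ready_op))       # [b'v']: not ready yet
        sess.run(v.initializer)
        print(sess.run(ready_op).size)  # 0: empty report means ready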
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Defaults to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If `grads_and_vars` is empty.
      ValueError: If `global_step` is not provided, since staleness cannot
        be checked without it.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []

        # local_anchor op will be placed on this worker task by default.
        local_anchor = control_flow_ops.no_op()
        # Colocating the local_step variable prevents it from being placed on the PS.
        distribution_strategy = (
            distribution_strategy_context.get_distribution_strategy())
        with distribution_strategy.colocate_vars_with(local_anchor):
            self._local_step = variable_scope.variable(
                initial_value=0,
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                dtype=global_step.dtype.base_dtype,
                name="sync_rep_local_step")

        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step)
        chief_init_ops = [self.local_step_init_op]
        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.global_variables())

        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                with ops.device(var.device):
                    # Dense gradients.
                    if grad is None:
                        aggregated_grad.append(None)  # pass-through.
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                        train_ops.append(
                            grad_accum.apply_grad(grad,
                                                  local_step=self._local_step))
                        aggregated_grad.append(
                            grad_accum.take_grad(self._replicas_to_aggregate))
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")
                        train_ops.append(
                            grad_accum.apply_indexed_slices_grad(
                                grad, local_step=self._local_step))
                        aggregated_grad.append(
                            grad_accum.take_indexed_slices_grad(
                                self._replicas_to_aggregate))

                    self._accumulator_list.append((grad_accum, var.device))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)

            # Create token queue.
            with ops.device(global_step.device), ops.name_scope(""):
                sync_token_queue = (data_flow_ops.FIFOQueue(
                    -1,
                    global_step.dtype.base_dtype,
                    shapes=(),
                    name="sync_token_q",
                    shared_name="sync_token_q"))
                self._sync_token_queue = sync_token_queue

                # dummy_queue is passed to the queue runner instead of the real
                # queues: the queue runner does not automatically reopen a queue
                # once it has closed one on a PS device.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    name="dummy_queue",
                    shared_name="dummy_queue"))

            with ops.device(global_step.device), ops.name_scope(""):
                # Replicas have to wait until they can get a token from the token queue.
                with ops.control_dependencies(train_ops):
                    token = sync_token_queue.dequeue()
                train_op = state_ops.assign(self._local_step, token)

                with ops.control_dependencies([update_op]):
                    # Sync_op needs to insert tokens to the token queue at the end of the
                    # step so the replicas can fetch them to start the next step.
                    tokens = array_ops.fill([self._tokens_per_step],
                                            global_step)
                    sync_op = sync_token_queue.enqueue_many((tokens, ))

                if self._variable_averages is not None:
                    with ops.control_dependencies([sync_op
                                                   ]), ops.name_scope(""):
                        sync_op = self._variable_averages.apply(
                            self._variables_to_average)

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])
            for accum, dev in self._accumulator_list:
                with ops.device(dev):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True
            return train_op
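# A hedged usage sketch for apply_gradients() above, following the documented
# SyncReplicasOptimizer flow (TF1). A degenerate single-worker configuration
# (replicas_to_aggregate=1) keeps it runnable in one process; the model and
# step limit are hypothetical.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

w = tf.Variable(2.0, name="w")
loss = tf.square(w - 1.0)
global_step = tf.train.get_or_create_global_step()

opt = tf.train.SyncReplicasOptimizer(
    tf.train.GradientDescentOptimizer(0.1),
    replicas_to_aggregate=1,
    total_num_replicas=1)
train_op = opt.apply_gradients(opt.compute_gradients(loss),
                               global_step=global_step)

# make_session_run_hook starts the chief queue runner and runs the
# init-tokens op -- the objects built at the end of apply_gradients() above.
hooks = [opt.make_session_run_hook(is_chief=True),
         tf.train.StopAtStepHook(last_step=5)]
with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)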
 def testZeroSizeVarInitialized(self):
   with ops.Graph().as_default(), self.cached_session() as sess:
     v = variables.Variable(array_ops.zeros([0, 2]), name="v")
     uninited = variables.report_uninitialized_variables()
     v.initializer.run()  # not strictly necessary
     self.assertEqual(0, self.evaluate(uninited).size)