Example #1
  def add_meta_graph(self, tags, signature_def_map=None,
                     assets_collection=None):
    """Adds the current meta graph to the SavedModel.

    Creates a Saver in the current scope and uses the Saver to export the meta
    graph def. Invoking this API requires the `add_meta_graph_and_variables()`
    API to have been invoked before.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta graph
          def.
      assets_collection: Assets collection to be saved with SavedModel. Note
          that this collection should be a subset of the assets saved as part of
          the first meta graph in the SavedModel.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
          yet.
    """
    if not self._has_saved_variables:
      raise AssertionError(
          "Variables and assets have not been saved yet. "
          "Please invoke `add_meta_graph_and_variables()` first.")

    # Save asset files, if any.
    self._save_assets(assets_collection)

    saver = tf_saver.Saver(variables.all_variables())
    meta_graph_def = saver.export_meta_graph()

    # Tag the meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
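Both builder methods shown in these examples come from the TF 1.x SavedModel API. Below is a minimal usage sketch of the intended call order, assuming the public `tf.saved_model.builder` interface (`tf.compat.v1` under TF 2.x); the export path and tags are illustrative:

```python
# Minimal sketch: variables are saved exactly once, with the first meta graph;
# later meta graphs reuse them via add_meta_graph().
import tensorflow.compat.v1 as tf

export_dir = "/tmp/saved_model_demo"  # hypothetical export directory
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

graph = tf.Graph()
with graph.as_default():
    v = tf.get_variable("v", shape=[1], initializer=tf.zeros_initializer())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # First meta graph: must be added together with the variables.
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.TRAINING])
    # Subsequent meta graphs: the variables are already on disk.
    builder.add_meta_graph([tf.saved_model.tag_constants.SERVING])
builder.save()
```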
Example #2
def _get_saver():
  """Lazy init and return saver."""
  saver = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
  if saver is None and variables.all_variables():
    saver = tf_saver.Saver()
    ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
  return saver
Example #3
    def add_meta_graph(self,
                       tags,
                       signature_def_map=None,
                       assets_collection=None):
        """Adds the current meta graph to the SavedModel.

    Creates a Saver in the current scope and uses the Saver to export the meta
    graph def. Invoking this API requires the `add_meta_graph_and_variables()`
    API to have been invoked before.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta graph
          def.
      assets_collection: Assets collection to be saved with SavedModel. Note
          that this collection should be a subset of the assets saved as part of
          the first meta graph in the SavedModel.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
          yet.
    """
        if not self._has_saved_variables:
            raise AssertionError(
                "Variables and assets have not been saved yet. "
                "Please invoke `add_meta_graph_and_variables()` first.")

        # Save asset files, if any.
        self._save_assets(assets_collection)

        saver = tf_saver.Saver(variables.all_variables())
        meta_graph_def = saver.export_meta_graph()

        # Tag the meta graph def and add it to the SavedModel.
        self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
Example #4
def _get_saver():
  """Lazy init and return saver."""
  saver = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
  if saver is None and variables.all_variables():
    saver = tf_saver.Saver()
    ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
  return saver
Example #5
  def add_meta_graph_and_variables(self,
                                   sess,
                                   tags,
                                   signature_def_map=None,
                                   assets_collection=None,
                                   legacy_init_op=None):
    """Adds the current meta graph to the SavedModel and saves variables.

    Creates a Saver to save the variables from the provided session. Exports the
    corresponding meta graph def. This function assumes that the variables to be
    saved have been initialized. For a given `SavedModelBuilder`, this API must
    be called exactly once and for the first meta graph to save. For subsequent
    meta graph defs to be added, the `add_meta_graph()` API must be used.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon a
        load.
    """
    if self._has_saved_variables:
      raise AssertionError("Variables and assets have already been saved. "
                           "Please invoke `add_meta_graph()` instead.")

    # Save asset files and write them to disk, if any.
    self._save_and_write_assets(assets_collection)

    # Create the variables sub-directory, if it does not exist.
    variables_dir = os.path.join(
        compat.as_text(self._export_dir),
        compat.as_text(constants.VARIABLES_DIRECTORY))
    if not file_io.file_exists(variables_dir):
      file_io.recursive_create_dir(variables_dir)

    variables_path = os.path.join(
        compat.as_text(variables_dir),
        compat.as_text(constants.VARIABLES_FILENAME))

    # Add legacy init op to the SavedModel.
    self._maybe_add_legacy_init_op(legacy_init_op)

    # Save the variables and export meta graph def.
    saver = tf_saver.Saver(
        variables.all_variables(),
        sharded=True,
        write_version=saver_pb2.SaverDef.V2)
    saver.save(sess, variables_path, write_meta_graph=False)
    meta_graph_def = saver.export_meta_graph()

    # Tag the meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)

    # Mark this instance of SavedModel as having saved variables, such that
    # subsequent attempts to save variables will fail.
    self._has_saved_variables = True
Example #6
    def add_meta_graph_and_variables(self,
                                     sess,
                                     tags,
                                     signature_def_map=None,
                                     assets_collection=None,
                                     legacy_init_op=None):
        """Adds the current meta graph to the SavedModel and saves variables.

    Creates a Saver to save the variables from the provided session. Exports the
    corresponding meta graph def. This function assumes that the variables to be
    saved have been initialized. For a given `SavedModelBuilder`, this API must
    be called exactly once and for the first meta graph to save. For subsequent
    meta graph defs to be added, the `add_meta_graph()` API must be used.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon a
        load.
    """
        if self._has_saved_variables:
            raise AssertionError(
                "Variables and assets have already been saved. "
                "Please invoke `add_meta_graph()` instead.")

        # Save asset files and write them to disk, if any.
        self._save_and_write_assets(assets_collection)

        # Create the variables sub-directory, if it does not exist.
        variables_dir = os.path.join(
            compat.as_text(self._export_dir),
            compat.as_text(constants.VARIABLES_DIRECTORY))
        if not file_io.file_exists(variables_dir):
            file_io.recursive_create_dir(variables_dir)

        variables_path = os.path.join(
            compat.as_text(variables_dir),
            compat.as_text(constants.VARIABLES_FILENAME))

        # Add legacy init op to the SavedModel.
        self._maybe_add_legacy_init_op(legacy_init_op)

        # Save the variables and export meta graph def.
        saver = tf_saver.Saver(variables.all_variables(),
                               sharded=True,
                               write_version=saver_pb2.SaverDef.V2)
        saver.save(sess, variables_path, write_meta_graph=False)
        meta_graph_def = saver.export_meta_graph()

        # Tag the meta graph def and add it to the SavedModel.
        self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)

        # Mark this instance of SavedModel as having saved variables, such that
        # subsequent attempts to save variables will fail.
        self._has_saved_variables = True
Example #7
  def get_variable_names(self):
    """Returns list of all variable names in this model.

    Returns:
      List of names.
    """
    with self._graph.as_default():
      return [v.name for v in variables.all_variables()]
Example #8
    def get_variable_names(self):
        """Returns list of all variable names in this model.

    Returns:
      List of names.
    """
        with self._graph.as_default():
            return [v.name for v in variables.all_variables()]
Example #9
def _get_saver():
  saver = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
  if saver is not None:
    if saver:
      saver = saver[0]
    else:
      saver = None
  if saver is None and variables.all_variables():
    saver = tf_saver.Saver()
    ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
  return saver
Example #10
def _get_saver():
  """Lazy init and return saver."""
  saver = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
  if saver is not None:
    if saver:
      saver = saver[0]
    else:
      saver = None
  if saver is None and variables.all_variables():
    saver = tf_saver.Saver(write_version=saver_pb2.SaverDef.V1)
    ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
  return saver
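The `_get_saver` variants above all implement the same get-or-create pattern: look up a `Saver` in the `SAVERS` graph collection, and lazily build one only if the graph actually contains variables. A standalone sketch against the public TF 1.x API (the examples use TF-internal modules; `all_variables()` was later renamed to `global_variables()`):

```python
import tensorflow.compat.v1 as tf

def get_or_create_saver():
    """Return the graph's cached Saver, creating one if needed."""
    savers = tf.get_collection(tf.GraphKeys.SAVERS)
    if savers:
        return savers[0]
    saver = None
    if tf.global_variables():  # only create a Saver if there is state to save
        saver = tf.train.Saver()
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
    return saver
```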
Example #11
  def run(self,
          num_batches=None,
          graph=None,
          session=None,
          start_queues=True,
          initialize_variables=True,
          **kwargs):
    """Builds and runs the columns of the `DataFrame` and yields batches.

    This is a generator that yields a dictionary mapping column names to
    evaluated columns.

    Args:
      num_batches: the maximum number of batches to produce. If none is
        specified, batches will be yielded indefinitely.
      graph: the `Graph` in which the `DataFrame` should be built.
      session: the `Session` in which to run the columns of the `DataFrame`.
      start_queues: if true, queues will be started before running and halted
        after producing `num_batches` batches.
      initialize_variables: if true, variables will be initialized.
      **kwargs: Additional keyword arguments e.g. `num_epochs`.

    Yields:
      A dictionary, mapping column names to the values resulting from running
      each column for a single batch.
    """
    if graph is None:
      graph = ops.get_default_graph()
    with graph.as_default():
      if session is None:
        session = sess.Session()
      self_built = self.build(**kwargs)
      keys = list(self_built.keys())
      cols = list(self_built.values())
      if initialize_variables:
        if variables.local_variables():
          session.run(variables.initialize_local_variables())
        if variables.all_variables():
          session.run(variables.initialize_all_variables())
      if start_queues:
        coord = coordinator.Coordinator()
        threads = qr.start_queue_runners(sess=session, coord=coord)
      i = 0
      while num_batches is None or i < num_batches:
        i += 1
        try:
          values = session.run(cols)
          yield collections.OrderedDict(zip(keys, values))
        except errors.OutOfRangeError:
          break
      if start_queues:
        coord.request_stop()
        coord.join(threads)
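The body of `run` is the standard TF 1.x input-pipeline loop: initialize variables, start queue runners under a `Coordinator`, pull batches until `OutOfRangeError`, then stop and join. A self-contained sketch of that loop, with an illustrative one-epoch producer standing in for the `DataFrame` columns:

```python
import tensorflow.compat.v1 as tf

graph = tf.Graph()
with graph.as_default():
    # num_epochs creates a local variable, hence local_variables_initializer.
    queue = tf.train.input_producer([1.0, 2.0, 3.0], num_epochs=1)
    value = queue.dequeue()
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(),
                  tf.local_variables_initializer()])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            while not coord.should_stop():
                print(sess.run(value))
        except tf.errors.OutOfRangeError:
            pass  # the producer ran out of epochs
        finally:
            coord.request_stop()
            coord.join(threads)
```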
Example #12
  def _init_saver(self, saver=USE_DEFAULT):
    """Initializes saver.

    Args:
      saver: A `Saver` object. If set to USE_DEFAULT, create one that
        saves all the variables.
    """
    if saver is Supervisor.USE_DEFAULT:
      saver = self._get_first_op_from_collection(ops.GraphKeys.SAVERS)
      if saver is None and variables.all_variables():
        saver = saver_mod.Saver()
        ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
    self._saver = saver
Example #13
    def _init_saver(self, saver=USE_DEFAULT):
        """Initializes saver.

    Args:
      saver: A `Saver` object. If set to USE_DEFAULT, create one that
        saves all the variables.
    """
        if saver is Supervisor.USE_DEFAULT:
            saver = self._get_first_op_from_collection(ops.GraphKeys.SAVERS)
            if saver is None and variables.all_variables():
                saver = saver_mod.Saver()
                ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
        self._saver = saver
Example #14
  def add_meta_graph(self,
                     tags,
                     signature_def_map=None,
                     assets_collection=None,
                     legacy_init_op=None,
                     clear_devices=False):
    """Adds the current meta graph to the SavedModel.

    Creates a Saver in the current scope and uses the Saver to export the meta
    graph def. Invoking this API requires the `add_meta_graph_and_variables()`
    API to have been invoked before.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta graph
          def.
      assets_collection: Assets collection to be saved with SavedModel. Note
          that this collection should be a subset of the assets saved as part of
          the first meta graph in the SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon a
          load.
      clear_devices: Set to true if the device info on the default graph should
          be cleared.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
          yet.
    """
    if not self._has_saved_variables:
      raise AssertionError(
          "Variables and assets have not been saved yet. "
          "Please invoke `add_meta_graph_and_variables()` first.")

    self._maybe_clear_devices(clear_devices)

    # Save asset files and write them to disk, if any.
    self._save_and_write_assets(assets_collection)

    # Add legacy init op to the SavedModel.
    self._maybe_add_legacy_init_op(legacy_init_op)

    saver = tf_saver.Saver(
        variables.all_variables(),
        sharded=True,
        write_version=saver_pb2.SaverDef.V2)

    meta_graph_def = saver.export_meta_graph()

    # Tag the meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
Example #15
    def add_meta_graph(self,
                       tags,
                       signature_def_map=None,
                       assets_collection=None,
                       legacy_init_op=None,
                       clear_devices=False):
        """Adds the current meta graph to the SavedModel.

    Creates a Saver in the current scope and uses the Saver to export the meta
    graph def. Invoking this API requires the `add_meta_graph_and_variables()`
    API to have been invoked before.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta graph
          def.
      assets_collection: Assets collection to be saved with SavedModel. Note
          that this collection should be a subset of the assets saved as part of
          the first meta graph in the SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon a
          load.
      clear_devices: Set to true if the device info on the default graph should
          be cleared.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
          yet.
    """
        if not self._has_saved_variables:
            raise AssertionError(
                "Variables and assets have not been saved yet. "
                "Please invoke `add_meta_graph_and_variables()` first.")

        self._maybe_clear_devices(clear_devices)

        # Save asset files and write them to disk, if any.
        self._save_and_write_assets(assets_collection)

        # Add legacy init op to the SavedModel.
        self._maybe_add_legacy_init_op(legacy_init_op)

        saver = tf_saver.Saver(variables.all_variables(),
                               sharded=True,
                               write_version=saver_pb2.SaverDef.V2)

        meta_graph_def = saver.export_meta_graph()

        # Tag the meta graph def and add it to the SavedModel.
        self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
Example #16
 def variables_to_restore(self, moving_avg_variables=None):
   """"""
   
   name_map = {}
   if moving_avg_variables is None:
     moving_avg_variables = variables.trainable_variables()
     moving_avg_variables += variables.moving_average_variables()
   # Remove duplicates
   moving_avg_variables = set(moving_avg_variables)
   # Collect all the variables with moving average,
   for v in moving_avg_variables:
     name_map[self.average_name(v)] = v
   # Make sure we restore variables without moving average as well.
   for v in list(set(variables.all_variables()) - moving_avg_variables):
     if v.op.name not in name_map:
       name_map[v.op.name] = v
   return name_map
Example #17
    def variables_to_restore(self, moving_avg_variables=None):
        """"""

        name_map = {}
        if moving_avg_variables is None:
            moving_avg_variables = variables.trainable_variables()
            moving_avg_variables += variables.moving_average_variables()
        # Remove duplicates
        moving_avg_variables = set(moving_avg_variables)
        # Collect all the variables with moving average,
        for v in moving_avg_variables:
            name_map[self.average_name(v)] = v
        # Make sure we restore variables without moving average as well.
        for v in list(set(variables.all_variables()) - moving_avg_variables):
            if v.op.name not in name_map:
                name_map[v.op.name] = v
        return name_map
Example #18
    def variables_to_restore(self, moving_avg_variables=None):
        """Returns a map of names to `Variables` to restore.

    If a variable has a moving average, use the moving average variable name as
    the restore name; otherwise, use the variable name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```
    Args:
      moving_avg_variables: a list of variables that require the moving
        average variable name to be used when restoring. If None, defaults to
        variables.moving_average_variables() + variables.trainable_variables().

    Returns:
      A map from restore_names to variables. The restore_name can be the
      moving_average version of the variable name if it exists, or the original
      variable name.
    """
        name_map = {}
        if moving_avg_variables is None:
            # Include trainable variables and variables which have been explicitly
            # added to the moving_average_variables collection.
            moving_avg_variables = variables.trainable_variables()
            moving_avg_variables += variables.moving_average_variables()
        # Remove duplicates
        moving_avg_variables = set(moving_avg_variables)
        # Collect all the variables with moving average,
        for v in moving_avg_variables:
            name_map[self.average_name(v)] = v
        # Make sure we restore variables without moving average as well.
        for v in list(set(variables.all_variables()) - moving_avg_variables):
            if v.op.name not in name_map:
                name_map[v.op.name] = v
        return name_map
Example #19
  def variables_to_restore(self, moving_avg_variables=None):
    """Returns a map of names to `Variables` to restore.

    If a variable has a moving average, use the moving average variable name as
    the restore name; otherwise, use the variable name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```
    Args:
      moving_avg_variables: a list of variables that require the moving
        average variable name to be used when restoring. If None, defaults to
        variables.moving_average_variables() + variables.trainable_variables().

    Returns:
      A map from restore_names to variables. The restore_name can be the
      moving_average version of the variable name if it exists, or the original
      variable name.
    """
    name_map = {}
    if moving_avg_variables is None:
      # Include trainable variables and variables which have been explicitly
      # added to the moving_average_variables collection.
      moving_avg_variables = variables.trainable_variables()
      moving_avg_variables += variables.moving_average_variables()
    # Remove duplicates
    moving_avg_variables = set(moving_avg_variables)
    # Collect all the variables with moving average,
    for v in moving_avg_variables:
      name_map[self.average_name(v)] = v
    # Make sure we restore variables without moving average as well.
    for v in list(set(variables.all_variables()) - moving_avg_variables):
      if v.op.name not in name_map:
        name_map[v.op.name] = v
    return name_map
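For context, a short sketch of how `variables_to_restore` fits into the usual moving-average workflow, using the public TF 1.x API:

```python
import tensorflow.compat.v1 as tf

graph = tf.Graph()
with graph.as_default():
    w = tf.get_variable("w", shape=[2], initializer=tf.zeros_initializer())
    ema = tf.train.ExponentialMovingAverage(decay=0.99)
    maintain_op = ema.apply([w])  # creates the shadow w/ExponentialMovingAverage

    # Training-time saver writes both w and its shadow variable.
    train_saver = tf.train.Saver()

    # Eval-time saver restores the shadow values *into* w.
    restore_map = ema.variables_to_restore()
    # e.g. {"w/ExponentialMovingAverage": w}
    eval_saver = tf.train.Saver(restore_map)
```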
Example #20
    def add_meta_graph_and_variables(self,
                                     sess,
                                     tags,
                                     signature_def_map=None,
                                     assets_collection=None):
        """Adds the current meta graph to the SavedModel and saves variables.

    Creates a Saver to save the variables from the provided session. Exports the
    corresponding meta graph def. This function assumes that the variables to be
    saved have been initialized. For a given `SavedModelBuilder`, this API must
    be called exactly once and for the first meta graph to save. For subsequent
    meta graph defs to be added, the `add_meta_graph()` API must be used.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with SavedModel.
    """
        if self._has_saved_variables:
            raise AssertionError(
                "Variables and assets have already been saved. "
                "Please invoke `add_meta_graph()` instead.")

        # Save asset files and write them to disk, if any.
        self._save_and_write_assets(assets_collection)

        export_path = os.path.join(
            compat.as_text(self._export_dir),
            compat.as_text(constants.VARIABLES_FILENAME))

        # Save the variables and export meta graph def.
        saver = tf_saver.Saver(variables.all_variables())
        saver.save(sess, export_path, write_meta_graph=False)
        meta_graph_def = saver.export_meta_graph()

        # Tag the meta graph def and add it to the SavedModel.
        self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)

        # Mark this instance of SavedModel as having saved variables, such that
        # subsequent attempts to save variables will fail.
        self._has_saved_variables = True
Example #21
  def add_meta_graph_and_variables(self,
                                   sess,
                                   tags,
                                   signature_def_map=None,
                                   assets_collection=None):
    """Adds the current meta graph to the SavedModel and saves variables.

    Creates a Saver to save the variables from the provided session. Exports the
    corresponding meta graph def. This function assumes that the variables to be
    saved have been initialized. For a given `SavedModelBuilder`, this API must
    be called exactly once and for the first meta graph to save. For subsequent
    meta graph defs to be added, the `add_meta_graph()` API must be used.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with SavedModel.
    """
    if self._has_saved_variables:
      raise AssertionError("Variables and assets have already been saved. "
                           "Please invoke `add_meta_graph()` instead.")

    # Save asset files and write them to disk, if any.
    self._save_and_write_assets(assets_collection)

    export_path = os.path.join(
        compat.as_text(self._export_dir),
        compat.as_text(constants.VARIABLES_FILENAME))

    # Save the variables and export meta graph def.
    saver = tf_saver.Saver(variables.all_variables())
    saver.save(sess, export_path, write_meta_graph=False)
    meta_graph_def = saver.export_meta_graph()

    # Tag the meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)

    # Mark this instance of SavedModel as having saved variables, such that
    # subsequent attempts to save variables will fail.
    self._has_saved_variables = True
Example #22
  def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
    """Create a saver swapping moving averages and variables.

    You should use this saver during training.  It will save the moving averages
    of the trained parameters under the original parameter names.  For
    evaluations or inference you should use a regular saver and it will
    automatically use the moving averages for the trained variables.

    You must call this function after all variables have been created and after
    you have called Optimizer.minimize().

    Args:
      var_list: List of variables to save, as per `Saver()`.
                If set to None, will save all the variables that have been
                created before this call.
      name: The name of the saver.
      **kwargs: Keyword arguments of `Saver()`.

    Returns:
      A `tf.Saver` object.

    Raises:
      RuntimeError: If apply_gradients or minimize has not been called before.
    """

    if self._variable_map is None:
      raise RuntimeError('Must call apply_gradients or minimize before '
                         'creating the swapping_saver')
    if var_list is None:
      var_list = variables.all_variables()
    if not isinstance(var_list, dict):
      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
    # Now swap variables and moving averages
    swapped_var_list = {}
    for k, v in six.iteritems(var_list):
      v_swap = self._variable_map.get(v.op.name, None)
      if v_swap:
        swapped_var_list[k] = v_swap
      else:
        swapped_var_list[k] = v
    # Build the swapping saver.
    return saver.Saver(swapped_var_list, name=name, **kwargs)
Example #23
    def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
        """Create a saver swapping moving averages and variables.

    You should use this saver during training.  It will save the moving averages
    of the trained parameters under the original parameter names.  For
    evaluations or inference you should use a regular saver and it will
    automatically use the moving averages for the trained variables.

    You must call this function after all variables have been created and after
    you have called Optimizer.minimize().

    Args:
      var_list: List of variables to save, as per `Saver()`.
                If set to None, will save all the variables that have been
                created before this call.
      name: The name of the saver.
      **kwargs: Keyword arguments of `Saver()`.

    Returns:
      A `tf.Saver` object.

    Raises:
      RuntimeError: If apply_gradients or minimize has not been called before.
    """

        if self._variable_map is None:
            raise RuntimeError('Must call apply_gradients or minimize before '
                               'creating the swapping_saver')
        if var_list is None:
            var_list = variables.all_variables()
        if not isinstance(var_list, dict):
            var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
        # Now swap variables and moving averages
        swapped_var_list = {}
        for k, v in six.iteritems(var_list):
            v_swap = self._variable_map.get(v.op.name, None)
            if v_swap:
                swapped_var_list[k] = v_swap
            else:
                swapped_var_list[k] = v
        # Build the swapping saver.
        return saver.Saver(swapped_var_list, name=name, **kwargs)
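`swapping_saver` belongs to the contrib-era `MovingAverageOptimizer` wrapper (TF 1.x only). A usage sketch following the docstring's constraints; the checkpoint path is illustrative:

```python
import tensorflow.compat.v1 as tf
from tensorflow.contrib.opt import MovingAverageOptimizer

graph = tf.Graph()
with graph.as_default():
    x = tf.get_variable("x", shape=[], initializer=tf.constant_initializer(5.0))
    loss = tf.square(x)
    opt = MovingAverageOptimizer(
        tf.train.GradientDescentOptimizer(0.1), average_decay=0.9)
    train_op = opt.minimize(loss)
    # Must be created after minimize(), as the docstring requires.
    saver = opt.swapping_saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(10):
            sess.run(train_op)
        # Writes the moving averages under the original variable names, so a
        # plain tf.train.Saver can restore them directly for evaluation.
        saver.save(sess, "/tmp/swapped.ckpt")  # illustrative path
```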
Example #24
    def variables_to_restore(self):
        """Returns a map of names to `Variables` to restore.

    If a variable has a moving average, use the moving average variable name as
    the restore name; otherwise, use the variable name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```

    Returns:
      A map from restore_names to variables. The restore_name can be the
      moving_average version of the variable name if it exists, or the original
      variable name.
    """
        name_map = {}
        # Collect all the variables with moving average, including all
        # the trainable variables and variables which have been explicitly
        # added to the collection.
        moving_avg_variables = list(
            set(variables.moving_average_variables() +
                variables.trainable_variables()))
        for v in moving_avg_variables:
            name_map[self.average_name(v)] = v
        # Make sure we restore variables without moving average as well.
        for v in list(
                set(variables.all_variables()) - set(moving_avg_variables)):
            if v.op.name not in name_map:
                name_map[v.op.name] = v
        return name_map
Example #25
  def variables_to_restore(self):
    """Returns a map of names to `Variables` to restore.

    If a variable has a moving average, use the moving average variable name as
    the restore name; otherwise, use the variable name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```

    Returns:
      A map from restore_names to variables. The restore_name can be the
      moving_average version of the variable name if it exists, or the original
      variable name.
    """
    name_map = {}
    # Collect all the variables with moving average, including all
    # the trainable variables and variables which have been explicitly
    # added to the collection.
    moving_avg_variables = list(set(variables.moving_average_variables() +
                                    variables.trainable_variables()))
    for v in moving_avg_variables:
      name_map[self.average_name(v)] = v
    # Make sure we restore variables without moving average as well.
    for v in list(set(variables.all_variables()) - set(moving_avg_variables)):
      if v.op.name not in name_map:
        name_map[v.op.name] = v
    return name_map
Example #26
  def __init__(self,
               var_list=None,
               reshape=False,
               sharded=False,
               max_to_keep=5,
               keep_checkpoint_every_n_hours=10000.0,
               name=None,
               restore_sequentially=False,
               saver_def=None,
               builder=None):
    """Creates a `Saver`.

    The constructor adds ops to save and restore variables.

    `var_list` specifies the variables that will be saved and restored. It can
    be passed as a `dict` or a list:

    * A `dict` of names to variables: The keys are the names that will be
      used to save or restore the variables in the checkpoint files.
    * A list of variables: The variables will be keyed with their op name in
      the checkpoint files.

    For example:

    ```python
    v1 = tf.Variable(..., name='v1')
    v2 = tf.Variable(..., name='v2')

    # Pass the variables as a dict:
    saver = tf.train.Saver({'v1': v1, 'v2': v2})

    # Or pass them as a list.
    saver = tf.train.Saver([v1, v2])
    # Passing a list is equivalent to passing a dict with the variable op names
    # as keys:
    saver = tf.train.Saver({v.op.name: v for v in [v1, v2]})
    ```

    The optional `reshape` argument, if `True`, allows restoring a variable from
    a save file where the variable had a different shape, but the same number
    of elements and type.  This is useful if you have reshaped a variable and
    want to reload it from an older checkpoint.

    The optional `sharded` argument, if `True`, instructs the saver to shard
    checkpoints per device.

    Args:
      var_list: A list of `Variable` objects or a dictionary mapping names to
        variables.  If `None`, defaults to the list of all variables.
      reshape: If `True`, allows restoring parameters from a checkpoint
        where the variables have a different shape.
      sharded: If `True`, shard the checkpoints, one per device.
      max_to_keep: Maximum number of recent checkpoints to keep.
        Defaults to 5.
      keep_checkpoint_every_n_hours: How often to keep checkpoints.
        Defaults to 10,000 hours.
      name: String.  Optional name to use as a prefix when adding operations.
      restore_sequentially: A `Bool`, which if true, causes restore of different
        variables to happen sequentially within each device.  This can lower
        memory usage when restoring very large models.
      saver_def: Optional `SaverDef` proto to use instead of running the
        builder. This is only useful for specialty code that wants to recreate
        a `Saver` object for a previously built `Graph` that had a `Saver`.
        The `saver_def` proto should be the one returned by the
        `as_saver_def()` call of the `Saver` that was created for that `Graph`.
      builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
        Defaults to `BaseSaverBuilder()`.

    Raises:
      TypeError: If `var_list` is invalid.
      ValueError: If any of the keys or values in `var_list` are not unique.
    """
    if saver_def is None:
      if builder is None:
        builder = BaseSaverBuilder()
      if var_list is None:
        var_list = variables.all_variables()
      if not var_list:
        raise ValueError("No variables to save")
      saver_def = builder.build(
          var_list,
          reshape=reshape,
          sharded=sharded,
          max_to_keep=max_to_keep,
          keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
          name=name,
          restore_sequentially=restore_sequentially)
    if not isinstance(saver_def, saver_pb2.SaverDef):
      raise ValueError("saver_def must be a saver_pb2.SaverDef: %s" % saver_def)
    if not saver_def.save_tensor_name:
      raise ValueError("saver_def must specify the save_tensor_name: %s"
                       % str(saver_def))
    if not saver_def.restore_op_name:
      raise ValueError("saver_def must specify the restore_op_name: %s"
                       % str(saver_def))
    self._filename_tensor_name = saver_def.filename_tensor_name
    self._save_tensor_name = saver_def.save_tensor_name
    self._restore_op_name = saver_def.restore_op_name
    self._max_to_keep = saver_def.max_to_keep
    # If keep_checkpoint_every_n_hours is not set, set it to 10000 hours.
    self._keep_checkpoint_every_n_hours = (
        saver_def.keep_checkpoint_every_n_hours if
        saver_def.keep_checkpoint_every_n_hours else 10000)
    self._next_checkpoint_time = (
        time.time() + self._keep_checkpoint_every_n_hours * 3600)
    self._sharded = saver_def.sharded
    self._last_checkpoints = []
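A round-trip sketch for the constructor documented above, using the dict form of `var_list` (TF 1.x API; the checkpoint path is illustrative):

```python
import tensorflow.compat.v1 as tf

graph = tf.Graph()
with graph.as_default():
    v1 = tf.get_variable("v1", shape=[3], initializer=tf.ones_initializer())
    saver = tf.train.Saver({"v1": v1})  # dict form: checkpoint key -> variable
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        path = saver.save(sess, "/tmp/demo.ckpt")
    with tf.Session() as sess:
        saver.restore(sess, path)  # no initializer needed after restore
        print(sess.run(v1))
```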
Example #27
    def apply_gradients(
            self,
            grads_and_vars,
            worker_id,
            global_step=None,
            name=None,
            collect_cdfs=False,
            #  batch_idx_list=None, worker_kill_list=None, num_workers=None, num_batches_per_epoch=None):
            matrix_to_solve=None,
            num_batches_per_epoch=None):
        """Apply gradients to variables.
    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.
    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      worker_id: Index of the replica calling this method; selects the
        replica's token queue and its weight in the least-squares solve.
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Defaults to the
        name passed to the Optimizer constructor.
      collect_cdfs: If true, take gradients from all `replicas_to_aggregate`
        replicas; otherwise a single applied gradient suffices.
      matrix_to_solve: Matrix `A` of the least-squares system whose solution
        weights each worker's gradient.
      num_batches_per_epoch: Number of batches per epoch; determines the
        length of the ones vector used in the least-squares solve.
    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.
    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []

        self._local_step = variables.Variable(
            initial_value=0,
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES],
            dtype=global_step.dtype.base_dtype,
            name="sync_rep_local_step")
        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step._ref())
        chief_init_ops = [self.local_step_init_op]
        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.all_variables())

        # The wait op waits for the current worker to dequeue a token from its respective token queue
        self._wait_op = self._sync_token_queues[worker_id].dequeue()

        # Replicas have to wait until they can get a token from the token queue
        # BEFORE beginning to compute gradients.
        with ops.device(global_step.device):
            queue_size = self._sync_token_queues[worker_id].size()
            update_local_step_op = state_ops.assign(self._local_step,
                                                    global_step._ref())

        # Gradient accum creation
        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                tf.logging.info("Grad " + str(grad) + " assigned to " +
                                str(var.device))
                with ops.device(var.device):
                    if grad is None:
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")

                    self._accumulator_list.append((grad_accum, var))
            """# Phase 1 gradient computation
      with ops.control_dependencies([update_local_step_op]):
        for index, (grad, var) in enumerate(grads_and_vars):
          with ops.device(var.device):
            if grad is None:
              continue

            elif isinstance(grad, ops.Tensor):
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_grad(grad,
                                                     local_step=self._local_step._ref()))

            else:
              if not isinstance(grad, ops.IndexedSlices):
                raise ValueError("Unknown grad type!")
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step._ref()))"""

            # Phase 1 gradient computation
            with ops.control_dependencies([update_local_step_op]):
                for index, (grad, var) in enumerate(grads_and_vars):
                    print_start_op = logging_ops.Print(
                        global_step, [global_step],
                        message="Starting to apply grads for variable %d" %
                        index)
                    train_ops.append(print_start_op)
                    with ops.device(var.device):
                        work_idx_print = logging_ops.Print(
                            worker_id, [worker_id],
                            message="worker id for comp grad")
                        ps_step_printer0 = logging_ops.Print(
                            global_step, [global_step],
                            message="global step printer0 on ps")
                        train_ops.append(work_idx_print)
                        train_ops.append(ps_step_printer0)
                        '''Implement LS computation and solution here'''
                        #b = np.ones(int(num_batches_per_epoch))
                        b = tf.ones([int(num_batches_per_epoch), 1],
                                    tf.float32)
                        A = matrix_to_solve
                        #            A_for_calc = np.transpose(A)
                        LS_solution = linalg_ops.matrix_solve_ls(A,
                                                                 b,
                                                                 fast=False)
                        LS_calc = tf.reshape(LS_solution, [-1])
                        weight = tf.slice(LS_calc, [worker_id], [1])
                        #            print_ls_op = logging_ops.Print(LS_calc, [LS_calc], message="Solution for LS!")
                        #            train_ops.append(print_ls_op)
                        weighted_grad = tf.scalar_mul(weight[0], grad)
                        '''Kill some workers'''
                        if grad is None:
                            continue

                        elif isinstance(grad, ops.Tensor):
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    #                  apply_grad_op = grad_accum.apply_grad(grad,
                                    apply_grad_op = grad_accum.apply_grad(
                                        weighted_grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        else:
                            if not isinstance(grad, ops.IndexedSlices):
                                raise ValueError("Unknown grad type!")
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_indexed_slices_grad(
                                        #                    grad, local_step=self._local_step._ref())
                                        weighted_grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

            # Phase 2 gradient applying
            for index, (grad, var) in enumerate(grads_and_vars):
                with ops.device(var.device):
                    grad_accum = self._accumulator_list[index][0]
                    work_idx_print1 = logging_ops.Print(
                        worker_id, [worker_id],
                        message="worker id for aggregate grad")
                    ps_step_printer1 = logging_ops.Print(
                        global_step, [global_step],
                        message="global step printer1 on ps")
                    num_replica_aggregate = logging_ops.Print(
                        self._replicas_to_aggregate,
                        [self._replicas_to_aggregate],
                        message="num replica aggregate")
                    train_ops.append(work_idx_print1)
                    train_ops.append(ps_step_printer1)
                    train_ops.append(num_replica_aggregate)
                    if grad is None:
                        aggregated_grad.append(None)
                    elif isinstance(grad, ops.Tensor):
                        if collect_cdfs:
                            #              aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
                            aggregated_grad.append(
                                grad_accum.take_grad(
                                    self._replicas_to_aggregate))
                        else:
                            aggregated_grad.append(grad_accum.take_grad(1))
                    else:
                        if collect_cdfs:
                            #              aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
                            aggregated_grad.append(
                                grad_accum.take_grad(
                                    self._replicas_to_aggregate))
                        else:
                            aggregated_grad.append(
                                grad_accum.take_indexed_slices_grad(1))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # Some debug operations
            self.print_sizes = logging_ops.Print(global_step, [
                self._sync_token_queues[i].size()
                for i in range(self._total_num_replicas)
            ],
                                                 message="queue sizes")
            self.print_accum_sizes = logging_ops.Print(
                self._local_step,
                [x[0].num_accumulated()
                 for x in self._accumulator_list] + [worker_id],
                message="Accum sizes")
            self.print_local_step = logging_ops.Print(
                self._local_step,
                [self._local_step._ref(),
                 global_step._ref()],
                message="local vs global step")

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies([self.print_accum_sizes]):
                    update_op = self._opt.apply_gradients(
                        aggregated_grads_and_vars, global_step)
                    self._update_op = update_op
                    with ops.control_dependencies([update_op]):
                        sync_op = []
                        for cur_worker_id in range(self._total_num_replicas):
                            sync_op.append(
                                self._sync_token_queues[cur_worker_id].enqueue(
                                    global_step))
                        sync_op = control_flow_ops.group(*(sync_op))

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    shared_name="dummy_queue"))

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])

            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies(train_ops):
                    # Worker finished applying gradients. Add token to phase1_finished_queue
                    train_op = logging_ops.Print(
                        self._local_step._ref(), [
                            x[0].num_accumulated()
                            for x in self._accumulator_list
                        ] + [worker_id],
                        message="Finished worker updates",
                        name="FinishedWorkerUpdatesPrint")

            for accum, var in self._accumulator_list:
                with ops.device(var.device):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True

            return train_op
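The synchronization above is built on `ConditionalAccumulator`: each replica applies a gradient tagged with its local step, and `take_grad(n)` blocks until `n` sufficiently fresh gradients have arrived, then returns their average. A standalone sketch of those semantics (TF 1.x API):

```python
import tensorflow.compat.v1 as tf

graph = tf.Graph()
with graph.as_default():
    accum = tf.ConditionalAccumulator(tf.float32, shape=[2],
                                      shared_name="demo_accum")
    grad = tf.placeholder(tf.float32, shape=[2])
    apply_op = accum.apply_grad(grad, local_step=0)
    avg_grad = accum.take_grad(2)  # waits for two applied gradients
    with tf.Session() as sess:
        sess.run(apply_op, feed_dict={grad: [1.0, 2.0]})
        sess.run(apply_op, feed_dict={grad: [3.0, 4.0]})
        print(sess.run(avg_grad))  # -> [2.0, 3.0], the element-wise average
```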
Example #28
  def testModelWithBuckets(self):
    """Larger tests that does full sequence-to-sequence model training."""
    # We learn to copy 10 symbols in 2 buckets: length 4 and length 8.
    classes = 10
    buckets = [(4, 4), (8, 8)]
    perplexities = [[], []]  # Results for each bucket.
    random_seed.set_random_seed(111)
    random.seed(111)
    np.random.seed(111)

    with self.test_session() as sess:
      # We use sampled softmax so we keep output projection separate.
      w = variable_scope.get_variable("proj_w", [24, classes])
      w_t = array_ops.transpose(w)
      b = variable_scope.get_variable("proj_b", [classes])

      # Here comes a sample Seq2Seq model using GRU cells.
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          cell = core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(24) for _ in range(2)],
              state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24,
              output_projection=(w, b))

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]

        def SampledLoss(labels, inputs):
          labels = array_ops.reshape(labels, [-1, 1])
          return nn_impl.sampled_softmax_loss(
              weights=w_t,
              biases=b,
              labels=labels,
              inputs=inputs,
              num_sampled=8,
              num_classes=classes)

        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            softmax_loss_function=SampledLoss)

      # Now we construct the copy model.
      batch_size = 8
      inp = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      out = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      weights = [
          array_ops.ones_like(
              inp[0], dtype=dtypes.float32) for _ in range(8)
      ]
      with variable_scope.variable_scope("root"):
        _, losses = SampleGRUSeq2Seq(inp, out, weights)
        updates = []
        params = variables.all_variables()
        optimizer = adam.AdamOptimizer(0.03, epsilon=1e-5)
        for i in range(len(buckets)):
          full_grads = gradients_impl.gradients(losses[i], params)
          grads, _ = clip_ops.clip_by_global_norm(full_grads, 30.0)
          update = optimizer.apply_gradients(zip(grads, params))
          updates.append(update)
        sess.run([variables.global_variables_initializer()])
      steps = 6
      for _ in range(steps):
        bucket = random.choice(np.arange(len(buckets)))
        length = buckets[bucket][0]
        i = [
            np.array(
                [np.random.randint(9) + 1 for _ in range(batch_size)],
                dtype=np.int32) for _ in range(length)
        ]
        # 0 is our "GO" symbol here.
        o = [np.array([0] * batch_size, dtype=np.int32)] + i
        feed = {}
        for i1, i2, o1, o2 in zip(inp[:length], i[:length], out[:length],
                                  o[:length]):
          feed[i1.name] = i2
          feed[o1.name] = o2
        if length < 8:  # For the 4-bucket, we need the 5th as target.
          feed[out[length].name] = o[length]
        res = sess.run([updates[bucket], losses[bucket]], feed)
        perplexities[bucket].append(math.exp(float(res[1])))
      for bucket in range(len(buckets)):
        if len(perplexities[bucket]) > 1:  # Assert that perplexity went down.
          self.assertLess(perplexities[bucket][-1],  # 10% margin of error.
                          1.1 * perplexities[bucket][0])
Example #29
  def testModelWithBuckets(self):
    """Larger tests that does full sequence-to-sequence model training."""
    # We learn to copy 10 symbols in 2 buckets: length 4 and length 8.
    classes = 10
    buckets = [(4, 4), (8, 8)]
    perplexities = [[], []]  # Results for each bucket.
    random_seed.set_random_seed(111)
    random.seed(111)
    np.random.seed(111)

    with self.test_session() as sess:
      # We use sampled softmax so we keep output projection separate.
      w = variable_scope.get_variable("proj_w", [24, classes])
      w_t = array_ops.transpose(w)
      b = variable_scope.get_variable("proj_b", [classes])

      # Here comes a sample Seq2Seq model using GRU cells.
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          cell = core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(24) for _ in range(2)],
              state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24,
              output_projection=(w, b))

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]

        def SampledLoss(labels, inputs):
          labels = array_ops.reshape(labels, [-1, 1])
          return nn_impl.sampled_softmax_loss(
              weights=w_t,
              biases=b,
              labels=labels,
              inputs=inputs,
              num_sampled=8,
              num_classes=classes)

        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            softmax_loss_function=SampledLoss)

      # Now we construct the copy model.
      batch_size = 8
      inp = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      out = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      weights = [
          array_ops.ones_like(
              inp[0], dtype=dtypes.float32) for _ in range(8)
      ]
      with variable_scope.variable_scope("root"):
        _, losses = SampleGRUSeq2Seq(inp, out, weights)
        updates = []
        params = variables.all_variables()
        optimizer = adam.AdamOptimizer(0.03, epsilon=1e-5)
        for i in range(len(buckets)):
          full_grads = gradients_impl.gradients(losses[i], params)
          grads, _ = clip_ops.clip_by_global_norm(full_grads, 30.0)
          update = optimizer.apply_gradients(zip(grads, params))
          updates.append(update)
        sess.run([variables.global_variables_initializer()])
      steps = 6
      for _ in range(steps):
        bucket = random.choice(np.arange(len(buckets)))
        length = buckets[bucket][0]
        i = [
            np.array(
                [np.random.randint(9) + 1 for _ in range(batch_size)],
                dtype=np.int32) for _ in range(length)
        ]
        # 0 is our "GO" symbol here.
        o = [np.array([0] * batch_size, dtype=np.int32)] + i
        feed = {}
        for i1, i2, o1, o2 in zip(inp[:length], i[:length], out[:length],
                                  o[:length]):
          feed[i1.name] = i2
          feed[o1.name] = o2
        if length < 8:  # For the 4-bucket, we need the 5th as target.
          feed[out[length].name] = o[length]
        res = sess.run([updates[bucket], losses[bucket]], feed)
        perplexities[bucket].append(math.exp(float(res[1])))
      for bucket in range(len(buckets)):
        if len(perplexities[bucket]) > 1:  # Assert that perplexity went down.
          self.assertLess(perplexities[bucket][-1],  # 10% margin of error.
                          1.1 * perplexities[bucket][0])
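The test above leans on a batching convention that `model_with_buckets` expects but never spells out: time-major lists of int32 vectors, a GO symbol of 0, and zero padding up to the chosen bucket's length. Below is a minimal NumPy sketch of that convention; the helper `make_bucket_batch`, its `pairs` argument, and the copy-task data are illustrative assumptions, not part of the TensorFlow API.

```python
import numpy as np

def make_bucket_batch(pairs, buckets, batch_size, rng=np.random):
    # Illustrative helper, not a TensorFlow API: pick a bucket at random,
    # then pad a batch of (input, output) symbol lists to its lengths.
    bucket_id = rng.randint(len(buckets))
    enc_len, dec_len = buckets[bucket_id]
    fitting = [(i, o) for i, o in pairs
               if len(i) <= enc_len and len(o) + 1 <= dec_len]
    batch = [fitting[rng.randint(len(fitting))] for _ in range(batch_size)]
    enc = np.zeros((enc_len, batch_size), dtype=np.int32)  # time-major, 0-padded
    dec = np.zeros((dec_len, batch_size), dtype=np.int32)  # dec[0] is GO == 0
    for col, (i, o) in enumerate(batch):
        enc[:len(i), col] = i
        dec[1:len(o) + 1, col] = o  # decoder input: output shifted right by one
    return bucket_id, enc, dec

# A copy task like the test's: the target sequence equals the input.
pairs = [([1, 2, 3], [1, 2, 3]), ([4, 5, 6, 7, 8], [4, 5, 6, 7, 8])]
bucket_id, enc, dec = make_bucket_batch(pairs, [(4, 4), (8, 8)], batch_size=8)
```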
Exemple #30
    def __init__(self,
                 var_list=None,
                 reshape=False,
                 sharded=False,
                 max_to_keep=5,
                 keep_checkpoint_every_n_hours=10000.0,
                 name=None,
                 restore_sequentially=False,
                 saver_def=None,
                 builder=None):
        """Creates a `Saver`.

    The constructor adds ops to save and restore variables.

    `var_list` specifies the variables that will be saved and restored. It can
    be passed as a `dict` or a list:

    * A `dict` of names to variables: The keys are the names that will be
      used to save or restore the variables in the checkpoint files.
    * A list of variables: The variables will be keyed with their op name in
      the checkpoint files.

    For example:

    ```python
    v1 = tf.Variable(..., name='v1')
    v2 = tf.Variable(..., name='v2')

    # Pass the variables as a dict:
    saver = tf.train.Saver({'v1': v1, 'v2': v2})

    # Or pass them as a list.
    saver = tf.train.Saver([v1, v2])
    # Passing a list is equivalent to passing a dict with the variable op names
    # as keys:
    saver = tf.train.Saver({v.op.name: v for v in [v1, v2]})
    ```

    The optional `reshape` argument, if `True`, allows restoring a variable from
    a save file where the variable had a different shape, but the same number
    of elements and type.  This is useful if you have reshaped a variable and
    want to reload it from an older checkpoint.

    The optional `sharded` argument, if `True`, instructs the saver to shard
    checkpoints per device.

    Args:
      var_list: A list of `Variable` objects or a dictionary mapping names to
        variables.  If `None`, defaults to the list of all variables.
      reshape: If `True`, allows restoring parameters from a checkpoint
        where the variables have a different shape.
      sharded: If `True`, shard the checkpoints, one per device.
      max_to_keep: Maximum number of recent checkpoint files to keep.
        Defaults to 5.
      keep_checkpoint_every_n_hours: How often to keep checkpoints.
        Defaults to 10,000 hours.
      name: string.  Optional name to use as a prefix when adding operations.
      restore_sequentially: A `Bool`, which if true, causes restore of different
        variables to happen sequentially within each device.  This can lower
        memory usage when restoring very large models.
      saver_def: Optional `SaverDef` proto to use instead of running the
        builder. This is only useful for specialty code that wants to recreate
        a `Saver` object for a previously built `Graph` that had a `Saver`.
        The `saver_def` proto should be the one returned by the
        `as_saver_def()` call of the `Saver` that was created for that `Graph`.
      builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
        Defaults to `BaseSaverBuilder()`.

    Raises:
      TypeError: If `var_list` is invalid.
      ValueError: If any of the keys or values in `var_list` are not unique.
    """
        if saver_def is None:
            if builder is None:
                builder = BaseSaverBuilder()
            if var_list is None:
                var_list = variables.all_variables()
            if not var_list:
                raise ValueError("No variables to save")
            saver_def = builder.build(
                var_list,
                reshape=reshape,
                sharded=sharded,
                max_to_keep=max_to_keep,
                keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                name=name,
                restore_sequentially=restore_sequentially)
        if not isinstance(saver_def, saver_pb2.SaverDef):
            raise ValueError("saver_def must if a saver_pb2.SaverDef: %s" %
                             saver_def)
        if not saver_def.save_tensor_name:
            raise ValueError(
                "saver_def must specify the save_tensor_name: %s" %
                str(saver_def))
        if not saver_def.restore_op_name:
            raise ValueError("saver_def must specify the restore_op_name: %s" %
                             str(saver_def))
        self._filename_tensor_name = saver_def.filename_tensor_name
        self._save_tensor_name = saver_def.save_tensor_name
        self._restore_op_name = saver_def.restore_op_name
        self._max_to_keep = saver_def.max_to_keep
        # If keep_checkpoint_every_n_hours is not set, set it to 10000 hours.
        self._keep_checkpoint_every_n_hours = (
            saver_def.keep_checkpoint_every_n_hours
            if saver_def.keep_checkpoint_every_n_hours else 10000)
        self._next_checkpoint_time = (
            time.time() + self._keep_checkpoint_every_n_hours * 3600)
        self._sharded = saver_def.sharded
        self._last_checkpoints = []
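As a quick illustration of the constructor arguments documented above, here is a minimal save/restore round trip; the variable name, checkpoint path, and argument values are illustrative only.

```python
import tensorflow as tf

v = tf.Variable(tf.zeros([2, 3]), name="v")
# Keyed-dict form; max_to_keep and keep_checkpoint_every_n_hours as documented.
saver = tf.train.Saver({"v": v}, max_to_keep=5,
                       keep_checkpoint_every_n_hours=2.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    path = saver.save(sess, "/tmp/demo.ckpt", global_step=0)

with tf.Session() as sess:
    saver.restore(sess, path)  # restored variables need no initializer
    print(sess.run(v))
```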
Exemple #31
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If `grads_and_vars` is empty.
      ValueError: If `global_step` is not provided, since staleness cannot
        be checked without it.
    """
    if not grads_and_vars:
      raise ValueError("Must supply at least one variable")

    if global_step is None:
      raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []

    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step, global_step)
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.all_variables())

    with ops.name_scope(None, self._name):
      for grad, var in grads_and_vars:
        var_list.append(var)
        with ops.device(var.device):
          # Dense gradients.
          if grad is None:
            aggregated_grad.append(None)  # pass-through.
            continue
          elif isinstance(grad, ops.Tensor):
            grad_accum = data_flow_ops.ConditionalAccumulator(
                grad.dtype,
                shape=var.get_shape(),
                shared_name=var.name + "/grad_accum")
            train_ops.append(grad_accum.apply_grad(
                grad, local_step=self._local_step))
            aggregated_grad.append(grad_accum.take_grad(
                self._replicas_to_aggregate))
          else:
            if not isinstance(grad, ops.IndexedSlices):
              raise ValueError("Unknown grad type!")
            grad_accum = data_flow_ops.SparseConditionalAccumulator(
                grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
            train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step))
            aggregated_grad.append(grad_accum.take_indexed_slices_grad(
                self._replicas_to_aggregate))

          self._accumulator_list.append((grad_accum, var.device))

      aggregated_grads_and_vars = zip(aggregated_grad, var_list)

      # sync_op will be assigned to the same device as the global step.
      with ops.device(global_step.device), ops.name_scope(""):
        update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                              global_step)

      # Create token queue.
      with ops.device(global_step.device), ops.name_scope(""):
        sync_token_queue = (
            data_flow_ops.FIFOQueue(-1,
                                    global_step.dtype.base_dtype,
                                    shapes=(),
                                    shared_name="sync_token_q"))
        self._sync_token_queue = sync_token_queue

        # dummy_queue is passed to the queue runner. Don't use the real queues
        # because the queue runner doesn't automatically reopen it once it
        # closed queues in PS devices.
        dummy_queue = (
            data_flow_ops.FIFOQueue(1,
                                    types_pb2.DT_INT32,
                                    shapes=(),
                                    shared_name="dummy_queue"))

      with ops.device(global_step.device), ops.name_scope(""):
        # Replicas have to wait until they can get a token from the token queue.
        with ops.control_dependencies(train_ops):
          token = sync_token_queue.dequeue()
        train_op = state_ops.assign(self._local_step, token)

        with ops.control_dependencies([update_op]):
          # Sync_op needs to insert tokens to the token queue at the end of the
          # step so the replicas can fetch them to start the next step.
          tokens = array_ops.fill([self._tokens_per_step], global_step.ref())
          sync_op = sync_token_queue.enqueue_many((tokens,))

        if self._variable_averages is not None:
          with ops.control_dependencies([sync_op]), ops.name_scope(""):
            sync_op = self._variable_averages.apply(
                self._variables_to_average)

        self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                            [sync_op])
      for accum, dev in self._accumulator_list:
        with ops.device(dev):
          chief_init_ops.append(
              accum.set_global_step(
                  global_step, name="SetGlobalStep"))
      self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
      self._gradients_applied = True
      return train_op
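The accumulate-then-average step above is easier to see in isolation. The following is a minimal sketch of `tf.ConditionalAccumulator` with a scalar "gradient" and two simulated replicas; the `shared_name` and the gradient values are illustrative.

```python
import tensorflow as tf

accum = tf.ConditionalAccumulator(tf.float32, shape=[],
                                  shared_name="demo_grad_accum")
# Two simulated replicas apply their local gradients for step 0.
apply_ops = [accum.apply_grad(tf.constant(g), local_step=0)
             for g in (1.0, 3.0)]
# take_grad blocks until num_required grads arrive, then returns their mean.
mean_grad = accum.take_grad(num_required=2)

with tf.Session() as sess:
    sess.run(apply_ops)
    print(sess.run(mean_grad))  # 2.0, the average of the applied gradients
```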
Exemple #32
    def apply_gradients(self,
                        grads_and_vars,
                        worker_id,
                        global_step=None,
                        name=None,
                        collect_cdfs=False):
        """Apply gradients to variables.
    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.
    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.
    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.
    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []
        printer_ops = []

        def f_pos():
            # The accumulator threshold was reached: enqueue the current step
            # on the shared stop queue, then log that it happened.
            enq_total_ops = self._stop_queue.enqueue(global_step)
            with ops.control_dependencies([enq_total_ops]):
                return tf.Print(global_step, [global_step],
                                message="Enqueued to stop queue")

        def f_neg():
            # Threshold not reached: log and pass the step through unchanged.
            return tf.Print(global_step, [global_step],
                            message="Nothing to stop queue")

        self._local_step = variables.Variable(
            initial_value=0,
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES],
            dtype=global_step.dtype.base_dtype,
            name="sync_rep_local_step")
        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step._ref())
        chief_init_ops = [self.local_step_init_op]

        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.all_variables())

        # The wait op waits for the current worker to dequeue a token from its respective token queue
        self._wait_op = self._sync_token_queues[worker_id].dequeue()

        # Replicas have to wait until they can get a token from the token queue
        # BEFORE beginning to compute gradients.
        with ops.device(global_step.device):
            queue_size = self._sync_token_queues[worker_id].size()
            update_local_step_op = state_ops.assign(self._local_step,
                                                    global_step._ref())

        # Gradient accum creation
        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                tf.logging.info("Grad " + str(grad) + " assigned to " +
                                str(var.device))
                with ops.device(var.device):
                    if grad is None:
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")
                    self._accumulator_list.append((grad_accum, var))
            """# Phase 1 gradient computation
      with ops.control_dependencies([update_local_step_op]):
        for index, (grad, var) in enumerate(grads_and_vars):
          with ops.device(var.device):
            if grad is None:
              continue

            elif isinstance(grad, ops.Tensor):
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_grad(grad,
                                                     local_step=self._local_step._ref()))

            else:
              if not isinstance(grad, ops.IndexedSlices):
                raise ValueError("Unknown grad type!")
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step._ref()))"""

            # Phase 1 gradient computation
            with ops.control_dependencies([update_local_step_op]):
                for index, (grad, var) in enumerate(grads_and_vars):
                    print_start_op = logging_ops.Print(
                        global_step, [global_step],
                        message="Starting to apply grads for variable %d" %
                        index)
                    with ops.device(var.device):
                        if grad is None:
                            continue

                        elif isinstance(grad, ops.Tensor):
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_grad(
                                        grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        else:
                            if not isinstance(grad, ops.IndexedSlices):
                                raise ValueError("Unknown grad type!")
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_indexed_slices_grad(
                                        grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        with ops.control_dependencies([apply_grad_op]):
                            accum_sizes_printer = logging_ops.Print(
                                global_step,
                                [
                                    x[0].num_accumulated()
                                    for x in self._accumulator_list
                                ] + [worker_id] + [global_step],
                                message="Accum aggregated status on ps")
                            train_ops.append(accum_sizes_printer)
                            x = self._accumulator_list[0]
                            ret = tf.cond(
                                tf.greater_equal(
                                    x[0].num_accumulated(),
                                    self._constant_for_comparison), f_pos,
                                f_neg)

                            should_stop_list_printer = logging_ops.Print(
                                global_step, [ret],
                                message="Should stop ret val status on ps")
                            train_ops.append(should_stop_list_printer)
                            with ops.control_dependencies([ret]):
                                queue_total_printer = logging_ops.Print(
                                    global_step, [self._stop_queue.size()],
                                    message="shared should stop queue size")
                                train_ops.append(queue_total_printer)

            # Phase 2 gradient applying
            for index, (grad, var) in enumerate(grads_and_vars):
                with ops.device(var.device):
                    grad_accum = self._accumulator_list[index][0]
                    if grad is None:
                        aggregated_grad.append(None)
                    elif isinstance(grad, ops.Tensor):
                        if collect_cdfs:
                            aggregated_grad.append(
                                grad_accum.take_grad(self._total_num_replicas))
                        else:
                            aggregated_grad.append(grad_accum.take_grad(1))
                    else:
                        if collect_cdfs:
                            aggregated_grad.append(
                                grad_accum.take_grad(self._total_num_replicas))
                        else:
                            aggregated_grad.append(
                                grad_accum.take_indexed_slices_grad(1))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # Some debug operations
            self.print_sizes = logging_ops.Print(
                global_step,
                [self._sync_token_queues[i].size()
                 for i in range(self._total_num_replicas)],
                message="queue sizes")
            self.print_accum_sizes = logging_ops.Print(
                self._local_step,
                [x[0].num_accumulated()
                 for x in self._accumulator_list] + [worker_id],
                message="Accum sizes")
            self.print_local_step = logging_ops.Print(
                self._local_step,
                [self._local_step._ref(),
                 global_step._ref()],
                message="local vs global step")

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies([self.print_accum_sizes]):
                    update_op = self._opt.apply_gradients(
                        aggregated_grads_and_vars, global_step)
                    self._update_op = update_op
                    num_to_dequeue = self._stop_queue.size()
                    deq_ops = self._stop_queue.dequeue_many(num_to_dequeue)
                    with ops.control_dependencies([deq_ops]):
                        size_printer_2 = logging_ops.Print(
                            global_step, [self.print_accum_sizes],
                            message="Completed the dequeue operation!")
                        printer_ops.append(size_printer_2)
                    with ops.control_dependencies(printer_ops):
                        with ops.control_dependencies([update_op]):
                            sync_op = []
                            for cur_worker_id in range(
                                    self._total_num_replicas):
                                sync_op.append(
                                    self._sync_token_queues[cur_worker_id].
                                    enqueue(global_step))
                            sync_op = control_flow_ops.group(*(sync_op))

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    shared_name="dummy_queue"))

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])

            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies(train_ops):
                    # Worker finished applying gradients for this step; log completion.
                    train_op = logging_ops.Print(
                        self._local_step._ref(), [
                            x[0].num_accumulated()
                            for x in self._accumulator_list
                        ] + [worker_id] + [global_step],
                        message="Finished worker updates",
                        name="FinishedWorkerUpdatesPrint")

            for accum, var in self._accumulator_list:
                with ops.device(var.device):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True

            return train_op
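This variant replaces the single shared token queue with one FIFOQueue per worker, which is the part that differs most from the stock optimizer above: the chief enqueues one token per worker after applying gradients, and each worker blocks on its own queue. A minimal sketch of that per-worker hand-off, with an illustrative two-worker setup and queue names:

```python
import tensorflow as tf

num_workers = 2
global_step = tf.train.get_or_create_global_step()
# One token queue per worker, like the _sync_token_queues list above.
queues = [tf.FIFOQueue(-1, global_step.dtype.base_dtype, shapes=(),
                       shared_name="sync_token_q_%d" % i)
          for i in range(num_workers)]
sync_op = tf.group(*[q.enqueue(global_step) for q in queues])  # chief side
wait_ops = [q.dequeue() for q in queues]  # each worker blocks on its queue

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(sync_op)
    print(sess.run(wait_ops))  # every worker receives the current step
```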
Exemple #33
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If `grads_and_vars` is empty.
      ValueError: If `global_step` is not provided, since staleness cannot
        be checked without it.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []

        self._local_step = variables.Variable(
            initial_value=0,
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES],
            name="sync_rep_local_step")
        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step)
        chief_init_ops = [self.local_step_init_op]
        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.all_variables())

        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                with ops.device(var.device):
                    # Dense gradients.
                    if grad is None:
                        aggregated_grad.append(None)  # pass-through.
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                        train_ops.append(
                            grad_accum.apply_grad(grad,
                                                  local_step=self._local_step))
                        aggregated_grad.append(
                            grad_accum.take_grad(self._replicas_to_aggregate))
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")
                        train_ops.append(
                            grad_accum.apply_indexed_slices_grad(
                                grad, local_step=self._local_step))
                        aggregated_grad.append(
                            grad_accum.take_indexed_slices_grad(
                                self._replicas_to_aggregate))

                    self._accumulator_list.append((grad_accum, var.device))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)

            # Create token queue.
            with ops.device(global_step.device), ops.name_scope(""):
                sync_token_queue = (data_flow_ops.FIFOQueue(
                    -1,
                    global_step.dtype.base_dtype,
                    shapes=(),
                    shared_name="sync_token_q"))
                self._sync_token_queue = sync_token_queue

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    shared_name="dummy_queue"))

            with ops.device(global_step.device), ops.name_scope(""):
                # Replicas have to wait until they can get a token from the token queue.
                with ops.control_dependencies(train_ops):
                    token = sync_token_queue.dequeue()
                train_op = state_ops.assign(self._local_step, token)

                with ops.control_dependencies([update_op]):
                    # Sync_op needs to insert tokens to the token queue at the end of the
                    # step so the replicas can fetch them to start the next step.
                    tokens = array_ops.fill([self._tokens_per_step],
                                            global_step.ref())
                    sync_op = sync_token_queue.enqueue_many((tokens, ))

                if self._variable_averages is not None:
                    with ops.control_dependencies([sync_op]), \
                            ops.name_scope(""):
                        sync_op = self._variable_averages.apply(
                            self._variables_to_average)

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])
            for accum, dev in self._accumulator_list:
                with ops.device(dev):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True
            return train_op
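Each apply_gradients variant above builds `ready_for_local_init_op` from `variables.report_uninitialized_variables`: a replica may only run its local init once the shared variables report as initialized. A minimal sketch of that readiness check, using the public alias `tf.report_uninitialized_variables` and an illustrative variable name:

```python
import tensorflow as tf

v = tf.Variable(0, name="shared_v")
# An empty result means all global variables are initialized, i.e. a
# replica may now run its local init ops.
ready_op = tf.report_uninitialized_variables(tf.global_variables())

with tf.Session() as sess:
    print(sess.run(ready_op))  # [b'shared_v']: not ready yet
    sess.run(tf.global_variables_initializer())
    print(sess.run(ready_op))  # []: ready for local init
```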