Example #1
def npairs_loss(labels, embeddings_anchor, embeddings_positive,
                reg_lambda=0.002, print_losses=False):
  """Computes the npairs loss.

  Npairs loss expects paired data where a pair is composed of samples from the
  same class, and each pair in the minibatch has a different label. The loss
  has two components. The first component is the L2 regularizer on the
  embedding vectors. The second component is the cross entropy loss, averaged
  over the minibatch, which takes each row of the pair-wise similarity matrix
  as logits and the remapped one-hot labels as labels.

  See: http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
    embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Option to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.
  """
  # pylint: enable=line-too-long
  # Add the regularizer on the embedding.
  reg_anchor = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
  reg_positive = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
  l2loss = math_ops.multiply(
      0.25 * reg_lambda, reg_anchor + reg_positive, name='l2loss')

  # Get per pair similarities.
  similarity_matrix = math_ops.matmul(
      embeddings_anchor, embeddings_positive, transpose_a=False,
      transpose_b=True)

  # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  labels_remapped = math_ops.to_float(
      math_ops.equal(labels, array_ops.transpose(labels)))
  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)

  # Add the softmax loss.
  xent_loss = nn.softmax_cross_entropy_with_logits(
      logits=similarity_matrix, labels=labels_remapped)
  xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

  if print_losses:
    xent_loss = logging_ops.Print(
        xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

  return l2loss + xent_loss
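A quick way to see what the two components amount to is a NumPy sketch of the same math (toy inputs and a hypothetical helper name, not part of the library):

import numpy as np

def npairs_loss_np(labels, anchors, positives, reg_lambda=0.002):
    """NumPy sketch of the loss above: an L2 regularizer plus a row-wise
    softmax cross-entropy over the anchor/positive similarity matrix."""
    # L2 regularizer on both embedding sets (matches the 0.25 * reg_lambda term).
    l2loss = 0.25 * reg_lambda * (np.mean(np.sum(anchors ** 2, axis=1)) +
                                  np.mean(np.sum(positives ** 2, axis=1)))

    # Pairwise similarities are the logits; equal labels define the targets.
    logits = anchors @ positives.T
    targets = (labels[:, None] == labels[None, :]).astype(np.float64)
    targets /= targets.sum(axis=1, keepdims=True)

    # Numerically stable row-wise log-softmax, then mean cross-entropy.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    xent = -np.mean(np.sum(targets * log_probs, axis=1))
    return l2loss + xent

rng = np.random.default_rng(0)
print(npairs_loss_np(np.arange(4), rng.normal(size=(4, 8)),
                     rng.normal(size=(4, 8))))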
Example #2
    def test_report_unsupported_operations_graph_mode(self):
        """Tests that unsupported operations are detected."""
        context = self.create_test_xla_compile_context()
        context.Enter()
        dummy_tensor = constant_op.constant(1.1)
        audio_summary = summary.audio('audio_summary', dummy_tensor, 0.5)
        histogram_summary = summary.histogram('histogram_summary',
                                              dummy_tensor)
        image_summary = summary.image('image_summary', dummy_tensor)
        scalar_summary = summary.scalar('scalar_summary', dummy_tensor)
        tensor_summary = summary.tensor_summary('tensor_summary', dummy_tensor)
        summary.merge([
            audio_summary, histogram_summary, image_summary, scalar_summary,
            tensor_summary
        ],
                      name='merge_summary')
        logging_ops.Print(dummy_tensor, [dummy_tensor], name='print_op')
        context.Exit()

        unsupported_ops_names = [op.name for op in context._unsupported_ops]
        self.assertEqual(unsupported_ops_names, [
            u'audio_summary', u'histogram_summary', u'image_summary',
            u'scalar_summary', u'tensor_summary',
            u'merge_summary/merge_summary', u'print_op'
        ])
Example #3
def mmid_Npair_loss_graph(config, reg_lambda, embeddings_positive,
                          embeddings_anchor):
    """Uses npairs_loss in both directions.
  Args:
    pregrasp_embedding: Batch of embeddings of the pregrasp image
    goal_embedding: Batch of embeddings of the goal image
    postgrasp_embedding: Batch of embeddings of the postgrasp image
    params: Parameters for loss. Currently unused.
  Returns:
    A scalar loss
  """
    pair_a = embeddings_positive
    pair_b = embeddings_anchor
    labels = tf.range(config.BATCH_SIZE, dtype=tf.int32)

    pair_a = logging_ops.Print(pair_a, [
        'mean_embedding:',
        math_ops.reduce_mean(math_ops.reduce_sum(pair_a, 1))
    ])

    loss_1 = tf.contrib.losses.metric_learning.npairs_loss(
        labels, pair_a, pair_b, reg_lambda=reg_lambda, print_losses=True)
    loss_2 = tf.contrib.losses.metric_learning.npairs_loss(
        labels, pair_b, pair_a, reg_lambda=reg_lambda, print_losses=True)
    tf.summary.scalar('npairs_loss1', loss_1)
    tf.summary.scalar('npairs_loss2', loss_2)
    return loss_1 + loss_2
Example #4
    def testZerosLikeVariant(self):
        # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
        # copying between CPU and GPU is supported AND we register a
        # ZerosLike callback for GPU for Variant storing primitive types
        # in variant_op_registry.cc.
        with self.test_session(use_gpu=False):
            variant_tensor = tensor_pb2.TensorProto(
                dtype=dtypes_lib.variant.as_datatype_enum,
                tensor_shape=tensor_shape.TensorShape([]).as_proto(),
                variant_val=[
                    tensor_pb2.VariantTensorDataProto(
                        # Match registration in variant_op_registry.cc
                        type_name=b"int",
                        metadata=np.array(1, dtype=np.int32).tobytes())
                ])
            const_variant = constant_op.constant(variant_tensor)
            zeros_like = array_ops.zeros_like(const_variant)
            zeros_like_op = logging_ops.Print(
                zeros_like, [const_variant, zeros_like],
                message=
                "Variant storing an int, input and output of zeros_like:").op

            # Smoke test -- ensure this executes without trouble.
            # Right now, non-numpy-compatible objects cannot be returned from a
            # session.run call; similarly, objects that can't be converted to
            # native numpy types cannot be passed to ops.convert_to_tensor.
            # TODO(ebrevdo): Add registration mechanism for
            # ops.convert_to_tensor and for session.run output.
            zeros_like_op.run()
Example #5
def add_image_summary(tensor, name=None, prefix=None, print_summary=False):
    """Adds an image summary for the given tensor.

  Args:
    tensor: a variable or op tensor with shape [batch,height,width,channels]
    name: the optional name for the summary.
    prefix: An optional prefix for the summary names.
    print_summary: If `True`, the summary is printed to stdout when the summary
      is computed.

  Returns:
    An image `Tensor` of type `string` whose contents are the serialized
    `Summary` protocol buffer.
  """
    summary_name = _get_summary_name(tensor, name, prefix)
    # If print_summary, then we need to make sure that this call doesn't add the
    # non-printing op to the collection. We'll add it to the collection later.
    collections = [] if print_summary else None
    op = summary.image(name=summary_name,
                       tensor=tensor,
                       collections=collections)
    if print_summary:
        op = logging_ops.Print(op, [tensor], summary_name)
        ops.add_to_collection(ops.GraphKeys.SUMMARIES, op)
    return op
Example #6
def dynamic_print(*values):
    """Implementation of print using dynamic dispatch.

  The function attempts to use tf.Print if all the values are compatible.
  Otherwise, it will fall back to py_func.

  Args:
    *values: values to print
  Returns:
    A dummy value indicating the print completed.
  """

    if all(map(is_tf_print_compatible, values)):
        return logging_ops.Print(1, values)

    def print_wrapper(*vals):
        if six.PY3:
            # TensorFlow doesn't seem to generate Unicode when passing strings to
            # py_func. This causes the print to add a "b'" wrapper to the output,
            # which is probably never what you want.
            vals = tuple(v.decode() if isinstance(v, bytes) else v
                         for v in vals)
        print(*vals)
        # The flush helps avoid garbled output in IPython.
        sys.stdout.flush()

    return py_func.wrap_py_func(print_wrapper,
                                None,
                                values,
                                use_dummy_return=True)
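The compatible-or-fallback dispatch can also be sketched with public TF 2.x APIs. This is only an illustrative sketch (it assumes TF 2.x and invents an `is_tf_print_compatible` check), not the autograph implementation shown above:

import sys
import tensorflow as tf

def is_tf_print_compatible(value):
    # Sketch of the compatibility check: tensors and plain scalars/strings
    # can be handed straight to tf.print.
    return isinstance(value, (tf.Tensor, int, float, str, bytes))

def dynamic_print_v2(*values):
    """Print via tf.print when every value is compatible, otherwise fall back
    to a Python-side callback wrapped in tf.py_function."""
    if all(map(is_tf_print_compatible, values)):
        tf.print(*values)
        return tf.constant(1)

    def print_wrapper():
        print(*values)
        sys.stdout.flush()  # the flush helps avoid garbled output in IPython
        return 1

    return tf.py_function(print_wrapper, inp=[], Tout=tf.int32)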
Example #7
    def testVariant(self):
        def create_constant_variant(value):
            return constant_op.constant(
                tensor_pb2.TensorProto(
                    dtype=dtypes.variant.as_datatype_enum,
                    tensor_shape=tensor_shape.TensorShape([]).as_proto(),
                    variant_val=[
                        tensor_pb2.VariantTensorDataProto(
                            # Match registration in variant_op_registry.cc
                            type_name=b"int",
                            metadata=np.array(value, dtype=np.int32).tobytes())
                    ]))

        # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
        # copying between CPU and GPU is supported.
        with self.test_session(use_gpu=False):
            variant_const_3 = create_constant_variant(3)
            variant_const_4 = create_constant_variant(4)
            variant_const_5 = create_constant_variant(5)
            # 3 + 3 + 5 + 4 = 15.
            result = math_ops.add_n((variant_const_3, variant_const_3,
                                     variant_const_5, variant_const_4))

            # Smoke test -- ensure this executes without trouble.
            # Right now, non-numpy-compatible objects cannot be returned from a
            # session.run call; similarly, objects that can't be converted to
            # native numpy types cannot be passed to ops.convert_to_tensor.
            # For now, run the test and examine the output to see that the result is
            # equal to 15.
            result_op = logging_ops.Print(
                result,
                [variant_const_3, variant_const_4, variant_const_5, result],
                message=("Variants stored an int: c(3), c(4), c(5), "
                         "add_n(c(3), c(3), c(5), c(4)): ")).op
            result_op.run()
Example #8
    def testVariant(self):
        # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
        # copying between CPU and GPU is supported.
        with self.test_session(use_gpu=False):
            variant_tensor = tensor_pb2.TensorProto(
                dtype=dtypes_lib.variant.as_datatype_enum,
                tensor_shape=tensor_shape.TensorShape([]).as_proto(),
                variant_val=[
                    tensor_pb2.VariantTensorDataProto(
                        # Match registration in variant_op_registry.cc
                        type_name=b"int",
                        metadata=np.array(1, dtype=np.int32).tobytes())
                ])
            const = constant_op.constant(variant_tensor)
            const_value = const.op.get_attr("value")

            # Ensure we stored the tensor proto properly.
            self.assertProtoEquals(variant_tensor, const_value)

            # Smoke test -- ensure this executes without trouble.
            # Right now, non-numpy-compatible objects cannot be returned from a
            # session.run call; similarly, objects that can't be converted to
            # native numpy types cannot be passed to ops.convert_to_tensor.
            # TODO(ebrevdo): Add registration mechanism for
            # ops.convert_to_tensor and for session.run output.
            logging_const_op = logging_ops.Print(
                const, [const],
                message="Variant storing an int, decoded const value:").op
            logging_const_op.run()
Example #9
 def _encode(self, grads_and_vars, shapes):
     if not self.compress:
         return grads_and_vars
     with ops.control_dependencies([
             logging_ops.Print(0, [0],
                               message="Start Encode Gradients on Workers")
     ]):
         coding = encode(grads_and_vars, r=self.svd_rank, shapes=shapes)
         return coding
Example #10
 def _decode(self, coding):
     if self.compress:
         with ops.control_dependencies([
                 logging_ops.Print(0, [self._global_step],
                                   message="Start Decode Gradients on PS")
         ]):
             grads_and_vars, decode_data = decode(coding)
             return grads_and_vars, decode_data
     return coding, {}
Example #11
    def _model_fn(features, labels, mode):
        """Function that returns predictions, training loss, and training op."""
        weights = None
        if weights_name and weights_name in features:
            weights = features.pop(weights_name)

        graph_builder = graph_builder_class(params,
                                            device_assigner=device_assigner)
        inference = {}
        if (mode == model_fn_lib.ModeKeys.EVAL
                or mode == model_fn_lib.ModeKeys.INFER):
            inference[eval_metrics.INFERENCE_PROB_NAME] = (
                graph_builder.inference_graph(features))

            if not params.regression:
                inference[eval_metrics.INFERENCE_PRED_NAME] = math_ops.argmax(
                    inference[eval_metrics.INFERENCE_PROB_NAME], 1)

        # labels might be None if we're doing prediction (which brings up the
        # question of why we force everything to adhere to a single model_fn).
        loss_deps = []
        training_graph = None
        if labels is not None and mode == model_fn_lib.ModeKeys.TRAIN:
            training_graph = control_flow_ops.group(
                graph_builder.training_graph(features,
                                             labels,
                                             input_weights=weights,
                                             num_trainers=num_trainers,
                                             trainer_id=trainer_id),
                state_ops.assign_add(contrib_framework.get_global_step(), 1))
            loss_deps.append(training_graph)

        training_loss = None
        if (mode == model_fn_lib.ModeKeys.EVAL
                or mode == model_fn_lib.ModeKeys.TRAIN):
            with ops.control_dependencies(loss_deps):
                training_loss = graph_builder.training_loss(features,
                                                            labels,
                                                            name=LOSS_NAME)
            if report_feature_importances and mode == model_fn_lib.ModeKeys.EVAL:
                training_loss = logging_ops.Print(
                    training_loss, [graph_builder.feature_importances()],
                    summarize=1000)
        # Put weights back in
        if weights is not None:
            features[weights_name] = weights

        training_hooks = []
        if early_stopping_rounds:
            training_hooks.append(TensorForestLossHook(early_stopping_rounds))

        return model_fn_lib.ModelFnOps(mode=mode,
                                       predictions=inference,
                                       loss=training_loss,
                                       train_op=training_graph,
                                       training_hooks=training_hooks)
Example #12
 def maybe_warn_on_large_rejection(accept_dist, initial_dist):
   proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist)
   return control_flow_ops.cond(
       math_ops.less(proportion_rejected, .5),
       lambda: accept_dist,
       lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
           accept_dist, [proportion_rejected, initial_dist, accept_dist],
           message="Proportion of examples rejected by sampler is high: ",
           summarize=100,
           first_n=10))
Example #13
 def testPrintGradient(self):
     inp = constant_op.constant(2.0, shape=[100, 32], name="in")
     w = constant_op.constant(4.0, shape=[10, 100], name="w")
     wx = math_ops.matmul(w, inp, name="wx")
     wx_print = logging_ops.Print(wx, [w, w, w])
     wx_grad = gradients_impl.gradients(wx, w)[0]
     wx_print_grad = gradients_impl.gradients(wx_print, w)[0]
     wxg = self.evaluate(wx_grad)
     wxpg = self.evaluate(wx_print_grad)
     self.assertAllEqual(wxg, wxpg)
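The property this test checks is that logging_ops.Print returns its input unchanged, so gradients flow through it as if it were an identity op. A rough TF 2.x sketch of the same idea (assuming TF 2.x is available; tf.print plus tf.identity stand in for the old fused op):

import tensorflow as tf

w = tf.constant(4.0, shape=[10, 100])
inp = tf.constant(2.0, shape=[100, 32])

with tf.GradientTape(persistent=True) as tape:
    tape.watch(w)
    wx = tf.matmul(w, inp)
    tf.print("mean(w):", tf.reduce_mean(w))  # side-effecting print
    wx_passthrough = tf.identity(wx)         # gradient-transparent, like Print

# The printed branch has exactly the same gradient as the plain one.
grad_plain = tape.gradient(wx, w)
grad_printed = tape.gradient(wx_passthrough, w)
assert bool(tf.reduce_all(grad_plain == grad_printed))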
Example #14
 def compute_gradients(self, *args, **kwargs):
   """Compute gradients of "loss" for the variables in "var_list".
   This simply wraps the compute_gradients() from the real optimizer. The
    gradients will be aggregated in the apply_gradients() so that the user can
   modify the gradients like clipping with per replica global norm if needed.
   The global norm with aggregated gradients can be bad as one replica's huge
   gradients can hurt the gradients from other replicas.
   Args:
     *args: Arguments for compute_gradients().
     **kwargs: Keyword arguments for compute_gradients().
   Returns:
     A list of (gradient, variable) pairs.
   """
   with ops.control_dependencies([logging_ops.Print(0, [0], message="Starting to compute gradients")]):
     grads_and_vars = self._opt.compute_gradients(*args, **kwargs)
     for index, (grad, var) in enumerate(grads_and_vars):
       with ops.control_dependencies([grad]):
         grads_and_vars[index] = (logging_ops.Print(grad, [0], message="Done computing gradient %d, grad shape: %s" % (index, var.get_shape())), var)
     return grads_and_vars
Example #15
def top_1_accuracy(config, embeddings_positive, embeddings_anchor):
    pred_matrix = math_ops.matmul(embeddings_anchor,
                                  embeddings_positive,
                                  transpose_a=False,
                                  transpose_b=True)
    pred = tf.math.argmax(input=pred_matrix, axis=1)

    labels = tf.range(config.BATCH_SIZE, dtype=tf.int32)
    if labels.dtype != pred.dtype:
        pred = math_ops.cast(pred, labels.dtype)
    is_correct = math_ops.cast(math_ops.equal(pred, labels), tf.float32)
    is_correct = logging_ops.Print(is_correct,
                                   ['acc:', tf.reduce_mean(is_correct)])
    return tf.reduce_mean(is_correct)
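The metric above is retrieval-style top-1 accuracy: each anchor should be most similar to the positive at the same batch index. A NumPy sketch of the same computation (toy inputs, hypothetical helper name):

import numpy as np

def top_1_accuracy_np(anchors, positives):
    """Fraction of rows whose argmax over the similarity matrix is the
    matching index on the diagonal."""
    sims = anchors @ positives.T          # [batch, batch] similarities
    pred = sims.argmax(axis=1)            # best-matching positive per anchor
    labels = np.arange(anchors.shape[0])  # the correct match is the same row
    return float((pred == labels).mean())

rng = np.random.default_rng(0)
emb = rng.normal(size=(8, 16))
# Positives are small perturbations of the anchors, so accuracy should be ~1.0.
print(top_1_accuracy_np(emb, emb + 0.01 * rng.normal(size=emb.shape)))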
Example #16
def dynamic_print(*values):
    """Implementartion of print using dynamic dispatch.

  The function attempts to use tf.Print if all the values are compatible.
  Otherwise, it will fall back to py_func.

  Args:
    *values: values to print
  Returns:
    A dummy value indicating the print completed.
  """

    if all(map(is_tf_print_compatible, values)):
        return logging_ops.Print(1, values)
    return py_func.wrap_py_func(print, None, values, use_dummy_return=True)
Example #17
  def testCaptureControls(self):
    g = ops.Graph()
    with g.as_default():
      x = constant_op.constant([10.0])
      x = logging_ops.Print(x, [x], "outer")

      @function.Defun(dtypes.float32)
      def Foo(y):
        with ops.control_dependencies([x]):
          y = logging_ops.Print(y, [y], "inner")
        return y

      with self.assertRaisesRegexp(ValueError, "not an element of this graph."):
        # NOTE: We still do not support capturing control deps.
        _ = Foo(x)
Example #18
def call_print(*values):
    """Compiled counterpart of the print builtin.

  The function attempts to use tf.Print if all the values are compatible.
  Otherwise, it will fall back to py_func.

  Args:
    *values: values to print
  Returns:
    A dummy value indicating the print completed.
  """

    if all(map(is_tf_print_compatible, values)):
        return logging_ops.Print(1, values)
    return py_func.wrap_py_func(print, None, values, use_dummy_return=True)
Example #19
def encode(grads_and_vars, r=2, shapes=None):
    for i, ((grad, var), shape) in enumerate(zip(grads_and_vars, shapes)):
        with ops.control_dependencies([
                logging_ops.Print(0, [i],
                                  message="Encoding Gradients on Workers")
        ]):
            with tf.device(grad.device):
                ndims = len(shape)
                code = _svd_encode(grad, r=r, ndims=ndims, shape=shape)
                grads_and_vars[i] = (code, var)

    n_bytes = _list_bytes(grads_and_vars)
    for i, (g, v) in enumerate(grads_and_vars):
        if isinstance(g, dict):
            grads_and_vars[i][0]['n_bytes'] = n_bytes
    return grads_and_vars
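The _svd_encode and _list_bytes helpers are not shown here, but the underlying idea of rank-r gradient compression can be sketched in NumPy (a hypothetical sketch, not the actual helpers; how well it works depends on how quickly the gradient's singular values decay):

import numpy as np

def svd_encode_np(grad, r=2):
    """Keep only the top-r singular triplets of a 2-D gradient matrix."""
    u, s, vt = np.linalg.svd(grad, full_matrices=False)
    return {'u': u[:, :r], 's': s[:r], 'vt': vt[:r, :]}

def svd_decode_np(code):
    """Reconstruct the (approximate) gradient from its low-rank code."""
    return (code['u'] * code['s']) @ code['vt']

rng = np.random.default_rng(0)
grad = rng.normal(size=(256, 128))
code = svd_encode_np(grad, r=2)
approx = svd_decode_np(code)
print('relative error:', np.linalg.norm(grad - approx) / np.linalg.norm(grad))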
Example #20
def zero_grad_function():
    zero_grads = []
    with ops.name_scope(op.name + "_grad"):
        # pylint: disable=protected-access
        with ops.get_default_graph()._original_op(op):

            for index, input in enumerate(op.inputs):
                zero_grad = tf.zeros(tf.shape(input), dtype=input.dtype)
                if index == 0:
                    zero_grad = logging_ops.Print(
                        zero_grad, [zero_grad],
                        message="I'm a straggler; Piping up zeros.")
                zero_grads.append(zero_grad)

    return zero_grads
Example #21
    def get_init_tokens_op(self, num_tokens=-1):
        """Returns the op to fill the sync_token_queue with the tokens.
    This is supposed to be executed in the beginning of the chief/sync thread
    so that even if the total_num_replicas is less than replicas_to_aggregate,
    the model can still proceed as the replicas can compute multiple steps per
    variable update. Make sure:
    `num_tokens >= replicas_to_aggregate - total_num_replicas`.
    Args:
      num_tokens: Number of tokens to add to the queue.
    Returns:
      An op for the chief/sync replica to fill the token queue.
    Raises:
      ValueError: If this is called before apply_gradients().
      ValueError: If num_tokens is smaller than replicas_to_aggregate -
        total_num_replicas.
    """
        if self._gradients_applied is False:
            raise ValueError(
                "get_init_tokens_op() should be called after apply_gradients()."
            )

        tokens_needed = self._total_num_replicas
        if num_tokens == -1:
            num_tokens = self._total_num_replicas
        elif num_tokens < tokens_needed:
            raise ValueError(
                "Too few tokens to finish the first step: %d (given) vs %d (needed)"
                % (num_tokens, tokens_needed))

        init_tokens = []
        with ops.device(self._global_step.device), ops.name_scope(""):
            tokens = array_ops.fill([num_tokens], self._global_step)
            for i in range(self._total_num_replicas):
                with ops.control_dependencies([
                        logging_ops.Print(self._global_step,
                                          [self._global_step],
                                          message="Init token queue")
                ]):
                    init_tokens_op = self._sync_token_queues[i].enqueue(
                        self._global_step)
                init_tokens.append(init_tokens_op)

        return init_tokens
Example #22
def npairs_loss_fan(labels,
                    embeddings_anchor,
                    embeddings_positive,
                    reg_lambda=3e-3,
                    print_losses=False,
                    hard_ori=False,
                    HardOrNot=None):
    # pylint: enable=line-too-long
    # Add the regularizer on the embedding.
    reg_anchor = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
    reg_positive = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
    l2loss = math_ops.multiply(0.25 * reg_lambda,
                               reg_anchor + reg_positive,
                               name='l2loss')

    # Get per pair similarities.
    similarity_matrix = math_ops.matmul(embeddings_anchor,
                                        embeddings_positive,
                                        transpose_a=False,
                                        transpose_b=True)

    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    labels_remapped = math_ops.to_float(
        math_ops.not_equal(labels, array_ops.transpose(labels)))
    # labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)

    # Add the softmax loss.
    xent_loss = nn.softmax_cross_entropy_with_logits(logits=similarity_matrix,
                                                     labels=labels_remapped)
    xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')
    if hard_ori:
        xent_loss = tf.multiply(xent_loss, HardOrNot)
    if print_losses:
        xent_loss = logging_ops.Print(
            xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
Example #23
 def f(include_print):
   shape = constant_op.constant([5])
   if include_print:
     shape = logging_ops.Print(shape, [shape])
   return random.get_global_generator().normal(shape)
Example #24
    def apply_gradients(self,
                        grads_and_vars,
                        worker_id,
                        global_step=None,
                        name=None,
                        collect_cdfs=False):
        """Apply gradients to variables.
    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.
    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.
    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.
    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []
        printer_ops = []

        def f_pos():
            enq_total_ops = self._stop_queue.enqueue(global_step)
            '''
      for worker_id in range(self._total_num_replicas):
        enq_ops = self._should_stop_queues[worker_id].enqueue(global_step)
        with ops.control_dependencies([enq_ops]):
          L = []
      '''
            #      ret_pos = [tf.constant(i) for i in range(self._construtor)]
            with ops.control_dependencies([enq_total_ops]):
                return tf.Print(global_step, [global_step],
                                message="Enquequed to stop queue")
#        ret_pos = tf.Variable(33)
#        return ret_pos

        def f_neg():
            #      ret_neg = [tf.constant(i+5) for i in range(self._construtor)]
            ret_neg = tf.Variable(22)
            return tf.Print(global_step, [global_step],
                            message="Nothing to stop queue")


#      worker_id_list_printer = logging_ops.Print(global_step,
#                  [a for a in self._worker_idx_list] + [worker_id] + [global_step],
#                  message="Worker ID list status")
#      train_ops.append(worker_id_list_printer)

        self._local_step = variables.Variable(
            initial_value=0,
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES],
            dtype=global_step.dtype.base_dtype,
            name="sync_rep_local_step")
        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step._ref())
        chief_init_ops = [self.local_step_init_op]

        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.all_variables())

        # The wait op waits for the current worker to dequeue a token from its respective token queue
        self._wait_op = self._sync_token_queues[worker_id].dequeue()

        # Replicas have to wait until they can get a token from the token queue
        # BEFORE beginning to compute gradients.
        with ops.device(global_step.device):
            queue_size = self._sync_token_queues[worker_id].size()
            update_local_step_op = state_ops.assign(self._local_step,
                                                    global_step._ref())

        # Gradient accum creation
        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                tf.logging.info("Grad " + str(grad) + " assigned to " +
                                str(var.device))
                with ops.device(var.device):
                    if grad is None:
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")
                    self._accumulator_list.append((grad_accum, var))
            """# Phase 1 gradient computation
      with ops.control_dependencies([update_local_step_op]):
        for index, (grad, var) in enumerate(grads_and_vars):
          with ops.device(var.device):
            if grad is None:
              continue

            elif isinstance(grad, ops.Tensor):
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_grad(grad,
                                                     local_step=self._local_step._ref()))

            else:
              if not isinstance(grad, ops.IndexedSlices):
                raise ValueError("Unknown grad type!")
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step._ref()))"""

            # Phase 1 gradient computation
            with ops.control_dependencies([update_local_step_op]):
                for index, (grad, var) in enumerate(grads_and_vars):
                    print_start_op = logging_ops.Print(
                        global_step, [global_step],
                        message="Starting to apply grads for variable %d" %
                        index)
                    with ops.device(var.device):
                        if grad is None:
                            continue

                        elif isinstance(grad, ops.Tensor):
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_grad(
                                        grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        else:
                            if not isinstance(grad, ops.IndexedSlices):
                                raise ValueError("Unknown grad type!")
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_indexed_slices_grad(
                                        grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        with ops.control_dependencies([apply_grad_op]):
                            accum_sizes_printer = logging_ops.Print(
                                global_step,
                                [
                                    x[0].num_accumulated()
                                    for x in self._accumulator_list
                                ] + [worker_id] + [global_step],
                                message="Accum aggregated status on ps")
                            train_ops.append(accum_sizes_printer)
                            x = self._accumulator_list[0]
                            ret = tf.cond(
                                tf.greater_equal(
                                    x[0].num_accumulated(),
                                    self._constant_for_comparison), f_pos,
                                f_neg)

                            should_stop_list_printer = logging_ops.Print(
                                global_step, [ret],
                                message="Should stop ret val status on ps")
                            train_ops.append(should_stop_list_printer)
                            with ops.control_dependencies([ret]):
                                queue_total_printer = logging_ops.Print(
                                    global_step, [self._stop_queue.size()],
                                    message="shared should stop queue size")
                                train_ops.append(queue_total_printer)

            # Phase 2 gradient applying
            for index, (grad, var) in enumerate(grads_and_vars):
                with ops.device(var.device):
                    grad_accum = self._accumulator_list[index][0]
                    if grad is None:
                        aggregated_grad.append(None)
                    elif isinstance(grad, ops.Tensor):
                        if collect_cdfs:
                            aggregated_grad.append(
                                grad_accum.take_grad(self._total_num_replicas))
                        else:
                            aggregated_grad.append(grad_accum.take_grad(1))
                    else:
                        if collect_cdfs:
                            aggregated_grad.append(
                                grad_accum.take_grad(self._total_num_replicas))
                        else:
                            aggregated_grad.append(
                                grad_accum.take_indexed_slices_grad(1))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # Some debug operations
            self.print_sizes = logging_ops.Print(global_step, [
                self._sync_token_queues[i].size()
                for i in range(self._total_num_replicas)
            ],
                                                 message="queue sizes")
            self.print_accum_sizes = logging_ops.Print(
                self._local_step,
                [x[0].num_accumulated()
                 for x in self._accumulator_list] + [worker_id],
                message="Accum sizes")
            self.print_local_step = logging_ops.Print(
                self._local_step,
                [self._local_step._ref(),
                 global_step._ref()],
                message="local vs global step")

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies([self.print_accum_sizes]):
                    update_op = self._opt.apply_gradients(
                        aggregated_grads_and_vars, global_step)
                    self._update_op = update_op
                    num_to_dequeue = self._stop_queue.size()
                    deq_ops = self._stop_queue.dequeue_many(num_to_dequeue)
                    with ops.control_dependencies([deq_ops]):
                        size_printer_2 = logging_ops.Print(
                            global_step, [self.print_accum_sizes],
                            message="Complelted the dequeue operation!")
                        printer_ops.append(size_printer_2)
                    with ops.control_dependencies(printer_ops):
                        with ops.control_dependencies([update_op]):
                            sync_op = []
                            for cur_worker_id in range(
                                    self._total_num_replicas):
                                sync_op.append(
                                    self._sync_token_queues[cur_worker_id].
                                    enqueue(global_step))
                            sync_op = control_flow_ops.group(*(sync_op))

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    shared_name="dummy_queue"))

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])

            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies(train_ops):
                    # Worker finished applying gradients. Add token to phase1_finished_queue
                    train_op = logging_ops.Print(
                        self._local_step._ref(), [
                            x[0].num_accumulated()
                            for x in self._accumulator_list
                        ] + [worker_id] + [global_step],
                        message="Finished worker updates",
                        name="FinishedWorkerUpdatesPrint")

            for accum, var in self._accumulator_list:
                with ops.device(var.device):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True

            return train_op
Example #25
 def Foo(y):
     with ops.control_dependencies([x]):
         y = logging_ops.Print(y, [y], "inner")
     return y
Example #26
def train(target, all_data, all_labels, cluster_spec):
    '''
    This is the main function for training
    '''
    image_placeholder = tf.placeholder(
        dtype=tf.float32,
        shape=[FLAGS.batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_DEPTH])
    label_placeholder = tf.placeholder(dtype=tf.int32,
                                       shape=[FLAGS.batch_size])

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    assert num_workers > 0 and num_parameter_servers > 0, (
        ' num_workers and '
        'num_parameter_servers'
        ' must be > 0.')
    is_chief = (FLAGS.task_id == 0)
    num_examples = all_data.shape[0]

    with tf.device(
            tf.train.replica_device_setter(
                #cpu only
                #    worker_device='/job:worker/task:%d' % FLAGS.task_id,
                #with gpu enabled
                worker_device='/job:worker/task:%d/gpu:0' % FLAGS.task_id,
                cluster=cluster_spec)):

        global_step = tf.Variable(0, name="global_step", trainable=False)

        num_batches_per_epoch = (num_examples / FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)
        # Logits of training data and validation data come from the same graph. The inference of
        # validation data shares all the weights with train data. This is implemented by passing
        # reuse=True to the variable scopes of the train graph
        logits = inference(image_placeholder,
                           FLAGS.num_residual_blocks,
                           reuse=False)

        # The following code calculates the train loss, which consists of the
        # softmax cross entropy and the regularization loss
        #            regu_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = calc_loss(logits, label_placeholder)

        opt = tf.train.AdamOptimizer(lr)

        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            opt = TimeoutReplicasOptimizer(opt,
                                           global_step,
                                           total_num_replicas=num_workers)
        elif FLAGS.backup_worker_method:
            opt = BackupOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)
        else:
            use_svd_compress = FLAGS.svd_rank > 0
            kwargs = {
                'replicas_to_aggregate': num_replicas_to_aggregate,
                'total_num_replicas': num_workers,
                'compress': use_svd_compress,
                'svd_rank': FLAGS.svd_rank
            }
            print('#' * 40)
            print(kwargs)
            print('#' * 40)
            opt = LowCommSync(opt, global_step=global_step, **kwargs)

        # Compute gradients with respect to the loss.
        grads = opt.compute_gradients(total_loss)

        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            apply_gradients_op = opt.apply_gradients(
                grads,
                FLAGS.task_id,
                global_step=global_step,
                collect_cdfs=FLAGS.worker_times_cdf_method)
#            apply_gradients_op = opt.apply_gradients(grads, FLAGS.task_id, global_step=global_step)
        elif FLAGS.backup_worker_method:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     FLAGS.task_id,
                                                     global_step=global_step)
        else:
            # SVD encode happens right here:
            shapes = [g.get_shape() for g, _ in grads]
            if use_svd_compress:
                encoded_grads = encode(grads, r=1, shapes=shapes)
                apply_gradients_op = opt.apply_gradients(
                    encoded_grads, global_step=global_step)
            else:
                apply_gradients_op = opt.apply_gradients(
                    grads, global_step=global_step)

        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Initialize a saver to save checkpoints. Merge all summaries, so we can run all
        # summarizing operations by running summary_op. Initialize a new session
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()
        test_print_op = logging_ops.Print(0, [0], message="Test print success")
        if is_chief:
            local_init_op = opt.chief_init_op
        else:
            local_init_op = opt.local_step_init_op

        local_init_opt = [local_init_op]
        ready_for_local_init_op = opt.ready_for_local_init_op

        sv = tf.train.Supervisor(
            is_chief=is_chief,
            local_init_op=local_init_op,
            ready_for_local_init_op=ready_for_local_init_op,
            logdir=FLAGS.train_dir,
            init_op=init_op,
            summary_op=None,
            global_step=global_step,
            saver=saver,
            save_model_secs=FLAGS.save_interval_secs)
        tf.logging.info('%s Supervisor' % datetime.now())
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
            if not FLAGS.interval_method or FLAGS.worker_times_cdf_method:
                sv.start_queue_runners(sess, chief_queue_runners)
            sess.run(init_tokens_op)

        timeout_client, timeout_server = launch_manager(sess, FLAGS)
        next_summary_time = time.time() + FLAGS.save_summaries_secs
        begin_time = time.time()
        cur_iteration = -1
        local_data_batch_idx = 0
        epoch_counter = 0
        iterations_finished = set()

        if FLAGS.task_id == 0 and FLAGS.interval_method:
            opt.start_interval_updates(sess, timeout_client)
        '''
        np.random.seed(SEED)
        b = np.ones(int(num_batches_per_epoch))
        interval = np.arange(0, int(num_batches_per_epoch))
        idx_list = np.random.choice(interval, int(num_workers), replace=False)     
        '''
        while not sv.should_stop():
            #    try:
            sys.stdout.flush()
            tf.logging.info("A new iteration...")
            cur_iteration += 1

            if FLAGS.worker_times_cdf_method:
                sess.run([opt._wait_op])
                timeout_client.broadcast_worker_dequeued_token(cur_iteration)
            start_time = time.time()
            epoch_counter, local_data_batch_idx, feed_dict = fill_feed_dict(
                all_data, all_labels, image_placeholder, label_placeholder,
                FLAGS.batch_size, local_data_batch_idx, epoch_counter)

            run_options = tf.RunOptions()
            run_metadata = tf.RunMetadata()

            if FLAGS.timeline_logging:
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_options.output_partition_graphs = True

            #feed_dict[weight_vec_placeholder] = ls_solution
            tf.logging.info("Data batch index: %s, Current epoch idex: %s" %
                            (str(epoch_counter), str(local_data_batch_idx)))
            loss_value, step = sess.run(
                #[train_op, global_step], feed_dict={feed_dict, x}, run_metadata=run_metadata, options=run_options)
                [train_op, global_step],
                feed_dict=feed_dict,
                run_metadata=run_metadata,
                options=run_options)

            if FLAGS.worker_times_cdf_method:
                timeout_client.broadcast_worker_finished_computing_gradients(
                    cur_iteration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            finish_time = time.time()
            if FLAGS.timeline_logging:
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                with open(
                        '%s/worker=%d_timeline_iter=%d.json' %
                        (FLAGS.train_dir, FLAGS.task_id, step), 'w') as f:
                    f.write(ctf)
            if step > FLAGS.max_steps:
                break

            duration = time.time() - start_time
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %f '
                          '(%.1f examples/sec; %.3f  sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))
            if is_chief and next_summary_time < time.time(
            ) and FLAGS.should_summarize:
                tf.logging.info('Running Summary operation on the chief.')
                summary_str = sess.run(summary_op)
                sv.summary_computed(sess, summary_str)
                tf.logging.info('Finished running Summary operation.')
                next_summary_time += FLAGS.save_summaries_secs
        #    except tf.errors.DeadlineExceededError:
        #        tf.logging.info("Killed at time %f" % time.time())
        #sess.reset_kill()
        #    except:
        #        tf.logging.info("Unexpected error: %s" % str(sys.exc_info()[0]))
        #sess.reset_kill()
        if is_chief:
            tf.logging.info('Elapsed Time: %f' % (time.time() - begin_time))
        sv.stop()

        if is_chief:
            saver.save(sess,
                       os.path.join(FLAGS.train_dir, 'model.ckpt'),
                       global_step=global_step)
Example #27
def stratified_sample(tensors,
                      labels,
                      target_probs,
                      batch_size,
                      init_probs=None,
                      enqueue_many=False,
                      queue_capacity=16,
                      threads_per_queue=1,
                      name=None):
    """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on `enqueue_many`. It is not a one-hot vector.
    target_probs: Target class proportions in batch. An object whose type has a
        registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function, or `None` for estimating the
        initial distribution.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: If `tensors` isn't iterable.
    ValueError: `enqueue_many` is True and labels doesn't have a batch
        dimension, or if `enqueue_many` is False and labels isn't a scalar.
    ValueError: `enqueue_many` is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the same
        length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.training.stratified_sample(
        [data], label, target_probs)

    # Run batch through network.
    ...
  """
    with ops.name_scope(name, 'stratified_sample', list(tensors) + [labels]):
        tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
        labels = ops.convert_to_tensor(labels)
        target_probs = ops.convert_to_tensor(target_probs,
                                             dtype=dtypes.float32)
        # Reduce the case of a single example to that of a batch of size 1.
        if not enqueue_many:
            tensor_list = [
                array_ops.expand_dims(tensor, 0) for tensor in tensor_list
            ]
            labels = array_ops.expand_dims(labels, 0)

        # If `init_probs` is `None`, set up online estimation of data distribution.
        if init_probs is None:
            # We use `target_probs` to get the number of classes, so its shape must be
            # fully defined at graph construction time.
            target_probs.get_shape().assert_is_fully_defined()
            init_probs = _estimate_data_distribution(
                labels,
                target_probs.get_shape().num_elements())
        else:
            init_probs = ops.convert_to_tensor(init_probs,
                                               dtype=dtypes.float32)

        # Validate that input is consistent.
        tensor_list, labels, [init_probs, target_probs
                              ] = _verify_input(tensor_list, labels,
                                                [init_probs, target_probs])

        # Check that all zero initial probabilities also have zero target
        # probabilities.
        assert_op = control_flow_ops.Assert(
            math_ops.reduce_all(
                math_ops.logical_or(math_ops.not_equal(init_probs, 0),
                                    math_ops.equal(target_probs, 0))),
            [
                'All classes with zero initial probability must also have zero target '
                'probability: ', init_probs, target_probs
            ])
        init_probs = control_flow_ops.with_dependencies([assert_op],
                                                        init_probs)

        # Calculate acceptance sampling probabilities.
        accept_probs = _calculate_acceptance_probabilities(
            init_probs, target_probs)
        proportion_rejected = math_ops.reduce_sum(
            (1 - accept_probs) * init_probs)
        accept_probs = control_flow_ops.cond(
            math_ops.less(proportion_rejected, .5),
            lambda: accept_probs,
            lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
                accept_probs, [accept_probs],
                message='Proportion of examples rejected by sampler is high.',
                first_n=10))

        # Make a single queue to hold input examples. Reshape output so examples
        # don't have singleton batch dimension.
        batched = input_ops.batch(tensor_list + [labels],
                                  batch_size=1,
                                  num_threads=threads_per_queue,
                                  capacity=queue_capacity,
                                  enqueue_many=True)
        val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
        label = array_ops.squeeze(batched[-1], [0])

        # Set up second queue containing batches that have the desired class
        # proportions.
        cur_prob = array_ops.gather(accept_probs, label)
        batched = input_ops.maybe_batch(
            val_list + [label],
            keep_input=random_ops.random_uniform([]) < cur_prob,
            batch_size=batch_size,
            num_threads=threads_per_queue)
        return batched[:-1], batched[-1]
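
# A minimal NumPy sketch of the per-class acceptance probabilities that the
# rejection-sampling step above relies on. It assumes the standard formula
# (accept_prob = target/init ratio, scaled so the largest ratio becomes 1);
# this is an illustrative approximation of what
# `_calculate_acceptance_probabilities` is expected to compute, not the
# library implementation itself.
import numpy as np


def acceptance_probabilities_sketch(init_probs, target_probs):
    """Per-class probability of keeping an example during rejection sampling."""
    init_probs = np.asarray(init_probs, dtype=np.float64)
    target_probs = np.asarray(target_probs, dtype=np.float64)
    # Ratio of desired to observed class frequency; classes absent from the
    # input stream (init_prob == 0) are never accepted.
    ratio = np.where(init_probs > 0,
                     target_probs / np.maximum(init_probs, 1e-12), 0.0)
    # Scale so the hardest-to-enrich class is always accepted.
    return ratio / ratio.max()


# Example: data arrives 90% class 0 / 10% class 1, but the target is 50/50:
# acceptance_probabilities_sketch([0.9, 0.1], [0.5, 0.5]) -> [0.111..., 1.0]
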
def train(target, all_data, all_labels, cluster_spec):
    """This is the main function for training."""
    image_placeholder = tf.placeholder(
        dtype=tf.float32,
        shape=[FLAGS.batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_DEPTH])
    label_placeholder = tf.placeholder(dtype=tf.int32,
                                       shape=[FLAGS.batch_size])

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    assert num_workers > 0 and num_parameter_servers > 0, (
        'num_workers and num_parameter_servers must be > 0.')
    is_chief = (FLAGS.task_id == 0)
    num_examples = all_data.shape[0]

    with tf.device(
            tf.train.replica_device_setter(
                # CPU only:
                # worker_device='/job:worker/task:%d' % FLAGS.task_id,
                # With GPU enabled:
                worker_device='/job:worker/task:%d/gpu:0' % FLAGS.task_id,
                cluster=cluster_spec)):

        global_step = tf.Variable(0, name="global_step", trainable=False)

        num_batches_per_epoch = (num_examples / FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)
        # Logits of training data and validation data come from the same graph. The
        # inference of validation data shares all the weights with the train data. This
        # is implemented by passing reuse=True to the variable scopes of the train graph.
        logits = inference(image_placeholder,
                           FLAGS.num_residual_blocks,
                           reuse=False)

        #            vali_logits = inference(self.vali_image_placeholder, FLAGS.num_residual_blocks, reuse=True)

        # The following code calculates the training loss, which consists of the
        # softmax cross entropy and the regularization loss.
        #            regu_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = calc_loss(logits, label_placeholder)

        #        predictions = tf.nn.softmax(logits)
        #        train_top1_error = top_k_error(predictions, label_placeholder, 1)

        opt = tf.train.AdamOptimizer(lr)
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            opt = TimeoutReplicasOptimizer(opt,
                                           global_step,
                                           total_num_replicas=num_workers)
        elif FLAGS.backup_worker_method:
            opt = BackupOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)
        else:
            #            opt = tf.train.SyncReplicasOptimizerV2(
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)

        # Compute gradients with respect to the loss.
        grads = opt.compute_gradients(total_loss)
        # Compute weighted gradients here.
        #===============================================================================================
        '''
        #define a placeholder for weighted vector, i.e. LS solution
        weight_vec_placeholder = tf.placeholder(dtype=tf.float32,
                                                shape=(num_workers,))
        grad_list = [x[0] for x in grads]
        new_grad_list = []
        #times gradient from each worker with the corresponding weight
        #which is just scalar multiplication
        for g_idx in range(len(grad_list)):
            grad_on_worker = grad_list[g_idx]
            weight = tf.slice(weight_vec_placeholder, [FLAGS.task_id], [1])
            tf.logging.info("Logging Happens Here!")
            tf.logging.info(weight[0])
            new_grad_list.append(tf.scalar_mul(weight[0], grad_on_worker))
        grad_new = []
        #regenerate the weighted gradients, merging all weighted vector
        for x_idx in range(len(grads)):
            grad_elem = grads[x_idx]
            grad_new.append((new_grad_list[x_idx], grad_elem[1]))
        '''
        #===============================================================================================
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            apply_gradients_op = opt.apply_gradients(
                grads,
                FLAGS.task_id,
                global_step=global_step,
                collect_cdfs=FLAGS.worker_times_cdf_method)
#            apply_gradients_op = opt.apply_gradients(grads, FLAGS.task_id, global_step=global_step)
        elif FLAGS.backup_worker_method:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     FLAGS.task_id,
                                                     global_step=global_step)
        else:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)
#           apply_gradients_op = opt.apply_gradients(grad_new, global_step=global_step)
        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Initialize a saver to save checkpoints. Merge all summaries, so we can run
        # all summarizing operations by running summary_op. Initialize a new session.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()
        test_print_op = logging_ops.Print(0, [0], message="Test print success")
        if is_chief:
            local_init_op = opt.chief_init_op
        else:
            local_init_op = opt.local_step_init_op

        local_init_opt = [local_init_op]
        ready_for_local_init_op = opt.ready_for_local_init_op

        sv = tf.train.Supervisor(
            is_chief=is_chief,
            local_init_op=local_init_op,
            ready_for_local_init_op=ready_for_local_init_op,
            logdir=FLAGS.train_dir,
            init_op=init_op,
            summary_op=None,
            global_step=global_step,
            saver=saver,
            save_model_secs=FLAGS.save_interval_secs)
        tf.logging.info('%s Supervisor' % datetime.now())
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
            if not FLAGS.interval_method or FLAGS.worker_times_cdf_method:
                sv.start_queue_runners(sess, chief_queue_runners)
            sess.run(init_tokens_op)

        timeout_client, timeout_server = launch_manager(sess, FLAGS)
        next_summary_time = time.time() + FLAGS.save_summaries_secs
        begin_time = time.time()
        cur_iteration = -1
        local_data_batch_idx = 0
        epoch_counter = 0
        iterations_finished = set()

        if FLAGS.task_id == 0 and FLAGS.interval_method:
            opt.start_interval_updates(sess, timeout_client)
        '''
        np.random.seed(SEED)
        b = np.ones(int(num_batches_per_epoch))
        interval = np.arange(0, int(num_batches_per_epoch))
        idx_list = np.random.choice(interval, int(num_workers), replace=False)     
        '''
        while not sv.should_stop():
            #    try:
            sys.stdout.flush()
            tf.logging.info("A new iteration...")
            cur_iteration += 1

            if FLAGS.worker_times_cdf_method:
                sess.run([opt._wait_op])
                timeout_client.broadcast_worker_dequeued_token(cur_iteration)
            start_time = time.time()
            epoch_counter, local_data_batch_idx, feed_dict = fill_feed_dict(
                all_data, all_labels, image_placeholder, label_placeholder,
                FLAGS.batch_size, local_data_batch_idx, epoch_counter)

            run_options = tf.RunOptions()
            run_metadata = tf.RunMetadata()
            #===============================================================================================
            '''
            LS_start_time = time.time()
            interval_2 = np.arange(0, int(num_workers))
            workers_to_kill = np.random.choice(interval_2, FLAGS.num_worker_kill, replace=False)
            #interval_2 = np.arange(0, WORKER_NUM)
            #workers_to_kill = np.random.choice(interval_2, NUM_WORKER_KILL, replace=False)
            A = np.zeros((int(num_workers), int(num_batches_per_epoch)))
            for i in range(A.shape[0]):
              if i == A.shape[0]-1:
                A[i][idx_list[i]] = 1
                A[i][idx_list[0]] = 1
              else:
                A[i][idx_list[i]] = 1
                A[i][idx_list[i+1]] = 1

            for i in range(len(idx_list)):
              element = idx_list[i]
              if element == A.shape[1]-1:
                idx_list[i] = 0
              else:
                idx_list[i] += 1

            for k in workers_to_kill:
              A[k] = 0

            A_for_calc = np.transpose(A)
            ls_solution = np.dot(np.linalg.pinv(A_for_calc), b)
            tf.logging.info("workers killed this iteration:")
            tf.logging.info(str(workers_to_kill))
            tf.logging.info("The matrix to solve:")
            for item in A_for_calc:
              tf.logging.info(str(item))
            tf.logging.info("Solution of LS:")
            tf.logging.info(str(ls_solution)) 
            LS_duration = time.time() - LS_start_time
            tf.logging.info("LS run time: %s" % str(LS_duration))
            '''
            #===============================================================================================

            if FLAGS.timeline_logging:
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_options.output_partition_graphs = True

            #feed_dict[weight_vec_placeholder] = ls_solution
            tf.logging.info("RUNNING SESSION... %f" % time.time())
            tf.logging.info("Data batch index: %s, Current epoch idex: %s" %
                            (str(epoch_counter), str(local_data_batch_idx)))
            loss_value, step = sess.run(
                #[train_op, global_step], feed_dict={feed_dict, x}, run_metadata=run_metadata, options=run_options)
                [train_op, global_step],
                feed_dict=feed_dict,
                run_metadata=run_metadata,
                options=run_options)
            tf.logging.info("DONE RUNNING SESSION...")

            if FLAGS.worker_times_cdf_method:
                timeout_client.broadcast_worker_finished_computing_gradients(
                    cur_iteration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            finish_time = time.time()
            if FLAGS.timeline_logging:
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                with open(
                        '%s/worker=%d_timeline_iter=%d.json' %
                        (FLAGS.train_dir, FLAGS.task_id, step), 'w') as f:
                    f.write(ctf)
            if step > FLAGS.max_steps:
                break

            duration = time.time() - start_time
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))
            if (is_chief and next_summary_time < time.time()
                    and FLAGS.should_summarize):
                tf.logging.info('Running Summary operation on the chief.')
                summary_str = sess.run(summary_op)
                sv.summary_computed(sess, summary_str)
                tf.logging.info('Finished running Summary operation.')
                next_summary_time += FLAGS.save_summaries_secs
        #    except tf.errors.DeadlineExceededError:
        #        tf.logging.info("Killed at time %f" % time.time())
        #sess.reset_kill()
        #    except:
        #        tf.logging.info("Unexpected error: %s" % str(sys.exc_info()[0]))
        #sess.reset_kill()
        if is_chief:
            tf.logging.info('Elapsed Time: %f' % (time.time() - begin_time))
        sv.stop()

        if is_chief:
            saver.save(sess,
                       os.path.join(FLAGS.train_dir, 'model.ckpt'),
                       global_step=global_step)
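
# `fill_feed_dict` is defined elsewhere in this example; the sketch below is
# inferred only from its call site in the training loop above (an assumption,
# not the author's code). It slices the next batch out of the in-memory arrays,
# wraps around at the end of the data, and increments the epoch counter.
def fill_feed_dict_sketch(all_data, all_labels, image_placeholder,
                          label_placeholder, batch_size, batch_idx,
                          epoch_counter):
    num_examples = all_data.shape[0]
    start = batch_idx * batch_size
    if start + batch_size > num_examples:
        # End of epoch: wrap around and start the next pass over the data.
        start, batch_idx, epoch_counter = 0, 0, epoch_counter + 1
    feed_dict = {
        image_placeholder: all_data[start:start + batch_size],
        label_placeholder: all_labels[start:start + batch_size],
    }
    return epoch_counter, batch_idx + 1, feed_dict
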
Example #29
0
  def testPrintString(self):
    inp = constant_op.constant(2.0, shape=[100, 32])
    inp_printed = logging_ops.Print(inp, ["hello"])
    self.assertEqual(inp.get_shape(), inp_printed.get_shape())
Example #30
0
def print_op(input_,
             data=None,
             message=None,
             first_n=None,
             summarize=20,
             print_tensor_name=True,
             print_tensor_type=True,
             print_shape=True,
             summarize_indicator_vector=True,
             name=None):
  """Creates a print op that will print when a tensor is accessed.

  Wraps the tensor passed in so that whenever that tensor is accessed,
  the message `message` is printed, along with the current value of
  `input_` and an optional list of other tensors.

  Args:
    input_: A Tensor/SparseTensor/TensorArray to print when it is evaluated.
    data: A list of other tensors to print.
    message: A string message to print as a prefix.
    first_n: Only log `first_n` number of times. Negative numbers log always;
             this is the default.
    summarize: Print this number of elements in the tensor.
    print_tensor_name: Print the tensor name.
    print_tensor_type: Print the tensor type.
    print_shape: Print the tensor's shape.
    summarize_indicator_vector: Whether to print the index of the first true
      value in an indicator vector (a Boolean tensor).
    name: The name to give this op.

  Returns:
    A Print op. The Print op returns `input_`.

  Raises:
    ValueError: If the tensor `input_` is not a Tensor, SparseTensor or
      TensorArray.

  """

  message = message or ""
  if input_ is None:
    raise ValueError("input_ must be of type "
                     "Tensor, SparseTensor or TensorArray")

  tensor_list = _get_tensor_repr(input_, print_tensor_name, print_tensor_type,
                                 print_shape, summarize_indicator_vector)

  if data is not None:
    for t in data:
      tensor_list.extend(_get_tensor_repr(t, print_tensor_name,
                                          print_tensor_type, print_shape,
                                          summarize_indicator_vector))

  if isinstance(input_, ops.Tensor) or isinstance(input_, variables.Variable):
    input_ = logging_ops.Print(input_, tensor_list, message, first_n, summarize,
                               name)
  elif isinstance(input_, sparse_tensor.SparseTensor):
    p = logging_ops.Print(
        constant_op.constant([]), tensor_list, message, first_n, summarize,
        name)

    with ops.control_dependencies([p]):
      input_ = sparse_tensor.SparseTensor(
          array_ops.identity(input_.indices),
          array_ops.identity(input_.values),
          array_ops.identity(input_.dense_shape))
  elif isinstance(input_, tensor_array_ops.TensorArray):
    p = logging_ops.Print(
        constant_op.constant([]), tensor_list, message, first_n, summarize,
        name)

    with ops.control_dependencies([p]):
      input_ = tensor_array_ops.TensorArray(dtype=input_.dtype,
                                            handle=input_.handle,
                                            flow=input_.flow)
  else:
    raise ValueError("input_ must be of type "
                     "Tensor, SparseTensor or TensorArray")

  return input_
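
# A minimal usage sketch for `print_op` (assumed from the docstring above, not
# taken from the library's tests): wrap a tensor so that its name, type, shape,
# and value, plus an extra tensor passed via `data`, are logged whenever the
# wrapped tensor is evaluated in a session.
from tensorflow.python.framework import constant_op

x = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
scale = constant_op.constant(0.5)
x = print_op(x, data=[scale], message='x before scaling: ', first_n=5)
y = x * scale  # Evaluating `y` (e.g. sess.run(y)) triggers the print side effect.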