def npairs_loss(labels, embeddings_anchor, embeddings_positive,
                reg_lambda=0.002, print_losses=False):
    """Computes the npairs loss.

    Npairs loss expects paired data where a pair is composed of samples from
    the same labels and each pair in the minibatch has a different label.

    The loss is the sum of two terms:
      * an L2 regularizer on the (un-normalized) embedding vectors, and
      * a softmax cross entropy that uses each row of the anchor/positive
        similarity matrix as logits and the row-normalized label-equality
        matrix as the target distribution.

    See:
    http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

    Args:
      labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
      embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] for
        the embedding vectors for the anchor images. Embeddings should not be
        l2 normalized.
      embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim]
        for the embedding vectors for the positive images. Embeddings should
        not be l2 normalized.
      reg_lambda: Float. L2 regularization term on the embedding vectors.
      print_losses: Boolean. Option to print the xent and l2loss.

    Returns:
      npairs_loss: tf.float32 scalar.
    """
    # pylint: enable=line-too-long

    def _mean_squared_norm(embeddings):
        # Batch mean of each row's squared L2 norm.
        return math_ops.reduce_mean(
            math_ops.reduce_sum(math_ops.square(embeddings), 1))

    # Regularizer on the embeddings.
    anchor_norm = _mean_squared_norm(embeddings_anchor)
    positive_norm = _mean_squared_norm(embeddings_positive)
    l2loss = math_ops.multiply(
        0.25 * reg_lambda, anchor_norm + positive_norm, name='l2loss')

    # Anchor-vs-positive dot products, used as logits.
    similarity_matrix = math_ops.matmul(
        embeddings_anchor,
        embeddings_positive,
        transpose_a=False,
        transpose_b=True)

    # Turn the [batch_size] label vector into a [batch_size, 1] column.
    label_shape = array_ops.shape(labels)
    assert label_shape.shape == 1
    label_column = array_ops.reshape(labels, [label_shape[0], 1])

    # Row i of the target puts equal mass on every column sharing label i.
    same_label = math_ops.equal(label_column,
                                array_ops.transpose(label_column))
    targets = math_ops.to_float(same_label)
    targets = targets / math_ops.reduce_sum(targets, 1, keep_dims=True)

    # Softmax cross entropy, averaged over rows.
    per_row_xent = nn.softmax_cross_entropy_with_logits(
        logits=similarity_matrix, labels=targets)
    xent_loss = math_ops.reduce_mean(per_row_xent, name='xentropy')

    if print_losses:
        xent_loss = logging_ops.Print(
            xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
def test_report_unsupported_operations_graph_mode(self):
    """Tests that unsupported operations are detected."""
    xla_context = self.create_test_xla_compile_context()
    xla_context.Enter()

    dummy_tensor = constant_op.constant(1.1)
    # One summary op of each kind, created in the order they are expected to
    # be reported.
    summaries = [
        summary.audio('audio_summary', dummy_tensor, 0.5),
        summary.histogram('histogram_summary', dummy_tensor),
        summary.image('image_summary', dummy_tensor),
        summary.scalar('scalar_summary', dummy_tensor),
        summary.tensor_summary('tensor_summary', dummy_tensor),
    ]
    summary.merge(summaries, name='merge_summary')
    logging_ops.Print(dummy_tensor, [dummy_tensor], name='print_op')

    xla_context.Exit()

    expected_names = [
        u'audio_summary',
        u'histogram_summary',
        u'image_summary',
        u'scalar_summary',
        u'tensor_summary',
        u'merge_summary/merge_summary',
        u'print_op',
    ]
    reported_names = [op.name for op in xla_context._unsupported_ops]
    self.assertEqual(reported_names, expected_names)
def mmid_Npair_loss_graph(config, reg_lambda, embeddings_positive,
                          embeddings_anchor):
    """Uses npairs_loss in both directions.

    The npairs loss is evaluated twice, swapping which batch of embeddings is
    passed as the anchor, and the two results are summed. Row i of both
    batches is given label i. Each direction is also written to a scalar
    summary.

    Args:
      config: Object exposing `BATCH_SIZE`, the number of labels to generate.
      reg_lambda: L2 regularization weight forwarded to npairs_loss.
      embeddings_positive: Batch of embeddings; passed as the anchor of the
        first loss and the positive of the second.
      embeddings_anchor: Batch of embeddings; passed as the positive of the
        first loss and the anchor of the second.

    Returns:
      A scalar loss
    """
    labels = tf.range(config.BATCH_SIZE, dtype=tf.int32)

    # Route the first batch through a Print op that logs the batch mean of
    # the per-row embedding sums.
    mean_row_sum = math_ops.reduce_mean(
        math_ops.reduce_sum(embeddings_positive, 1))
    first_batch = logging_ops.Print(embeddings_positive,
                                    ['mean_embedding:', mean_row_sum])
    second_batch = embeddings_anchor

    forward_loss = tf.contrib.losses.metric_learning.npairs_loss(
        labels,
        first_batch,
        second_batch,
        reg_lambda=reg_lambda,
        print_losses=True)
    backward_loss = tf.contrib.losses.metric_learning.npairs_loss(
        labels,
        second_batch,
        first_batch,
        reg_lambda=reg_lambda,
        print_losses=True)

    tf.summary.scalar('npairs_loss1', forward_loss)
    tf.summary.scalar('npairs_loss2', backward_loss)
    return forward_loss + backward_loss
def testZerosLikeVariant(self):
    """Smoke-tests zeros_like on a scalar variant constant wrapping an int."""
    # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
    # copying between CPU and GPU is supported AND we register a
    # ZerosLike callback for GPU for Variant storing primitive types
    # in variant_op_registry.cc.
    with self.test_session(use_gpu=False):
        # Match registration in variant_op_registry.cc
        int_payload = tensor_pb2.VariantTensorDataProto(
            type_name=b"int",
            metadata=np.array(1, dtype=np.int32).tobytes())
        variant_tensor = tensor_pb2.TensorProto(
            dtype=dtypes_lib.variant.as_datatype_enum,
            tensor_shape=tensor_shape.TensorShape([]).as_proto(),
            variant_val=[int_payload])

        const_variant = constant_op.constant(variant_tensor)
        zeros_like = array_ops.zeros_like(const_variant)
        printed = logging_ops.Print(
            zeros_like, [const_variant, zeros_like],
            message="Variant storing an int, input and output of zeros_like:")

        # Smoke test -- ensure this executes without trouble.
        # Right now, non-numpy-compatible objects cannot be returned from a
        # session.run call; similarly, objects that can't be converted to
        # native numpy types cannot be passed to ops.convert_to_tensor.
        # TODO(ebrevdo): Add registration mechanism for
        # ops.convert_to_tensor and for session.run output.
        printed.op.run()
def add_image_summary(tensor, name=None, prefix=None, print_summary=False):
    """Adds an image summary for the given tensor.

    Args:
      tensor: a variable or op tensor with shape [batch,height,width,channels]
      name: the optional name for the summary.
      prefix: An optional prefix for the summary names.
      print_summary: If `True`, the summary is printed to stdout when the
        summary is computed.

    Returns:
      An image `Tensor` of type `string` whose contents are the serialized
      `Summary` protocol buffer.
    """
    summary_name = _get_summary_name(tensor, name, prefix)

    if not print_summary:
        # Plain case: let summary.image pick its default collections.
        return summary.image(
            name=summary_name, tensor=tensor, collections=None)

    # Printing case: keep the non-printing op out of every collection, wrap
    # it in a Print op, and register only the wrapper as the summary.
    silent_op = summary.image(
        name=summary_name, tensor=tensor, collections=[])
    printing_op = logging_ops.Print(silent_op, [tensor], summary_name)
    ops.add_to_collection(ops.GraphKeys.SUMMARIES, printing_op)
    return printing_op
def dynamic_print(*values):
    """Implementation of print using dynamic dispatch.

    Uses tf.Print when every value is compatible with it; otherwise the
    builtin print is wrapped in a py_func.

    Args:
      *values: values to print

    Returns:
      A dummy value indicating the print completed.
    """
    tf_printable = all(is_tf_print_compatible(v) for v in values)
    if tf_printable:
        return logging_ops.Print(1, values)

    def print_wrapper(*vals):
        if six.PY3:
            # TensorFlow doesn't seem to generate Unicode when passing strings
            # to py_func. This causes the print to add a "b'" wrapper to the
            # output, which is probably never what you want.
            decoded = []
            for item in vals:
                decoded.append(
                    item.decode() if isinstance(item, bytes) else item)
            vals = tuple(decoded)
        print(*vals)
        # The flush helps avoid garbled output in IPython.
        sys.stdout.flush()

    return py_func.wrap_py_func(
        print_wrapper, None, values, use_dummy_return=True)
def testVariant(self):
    """Smoke-tests add_n over scalar variant constants wrapping ints."""

    def create_constant_variant(value):
        # Match registration in variant_op_registry.cc
        int_payload = tensor_pb2.VariantTensorDataProto(
            type_name=b"int",
            metadata=np.array(value, dtype=np.int32).tobytes())
        variant_proto = tensor_pb2.TensorProto(
            dtype=dtypes.variant.as_datatype_enum,
            tensor_shape=tensor_shape.TensorShape([]).as_proto(),
            variant_val=[int_payload])
        return constant_op.constant(variant_proto)

    # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
    # copying between CPU and GPU is supported.
    with self.test_session(use_gpu=False):
        variant_const_3 = create_constant_variant(3)
        variant_const_4 = create_constant_variant(4)
        variant_const_5 = create_constant_variant(5)

        # 3 + 3 + 5 + 4 = 15.
        addends = (variant_const_3, variant_const_3, variant_const_5,
                   variant_const_4)
        total = math_ops.add_n(addends)

        # Smoke test -- ensure this executes without trouble.
        # Right now, non-numpy-compatible objects cannot be returned from a
        # session.run call; similarly, objects that can't be converted to
        # native numpy types cannot be passed to ops.convert_to_tensor.
        # For now, run the test and examine the output to see that the result
        # is equal to 15.
        printed_total = logging_ops.Print(
            total,
            [variant_const_3, variant_const_4, variant_const_5, total],
            message=("Variants stored an int: c(3), c(4), c(5), "
                     "add_n(c(3), c(3), c(5), c(4)): "))
        printed_total.op.run()
def testVariant(self):
    """Checks a variant constant stores its proto and can be run via Print."""
    # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
    # copying between CPU and GPU is supported.
    with self.test_session(use_gpu=False):
        # Match registration in variant_op_registry.cc
        int_payload = tensor_pb2.VariantTensorDataProto(
            type_name=b"int",
            metadata=np.array(1, dtype=np.int32).tobytes())
        variant_tensor = tensor_pb2.TensorProto(
            dtype=dtypes_lib.variant.as_datatype_enum,
            tensor_shape=tensor_shape.TensorShape([]).as_proto(),
            variant_val=[int_payload])

        const = constant_op.constant(variant_tensor)

        # Ensure we stored the tensor proto properly.
        stored_proto = const.op.get_attr("value")
        self.assertProtoEquals(variant_tensor, stored_proto)

        # Smoke test -- ensure this executes without trouble.
        # Right now, non-numpy-compatible objects cannot be returned from a
        # session.run call; similarly, objects that can't be converted to
        # native numpy types cannot be passed to ops.convert_to_tensor.
        # TODO(ebrevdo): Add registration mechanism for
        # ops.convert_to_tensor and for session.run output.
        printed_const = logging_ops.Print(
            const, [const],
            message="Variant storing an int, decoded const value:")
        printed_const.op.run()
def _encode(self, grads_and_vars, shapes): if not self.compress: return grads_and_vars with ops.control_dependencies([ logging_ops.Print(0, [0], message="Start Encode Gradients on Workers") ]): coding = encode(grads_and_vars, r=self.svd_rank, shapes=shapes) return coding
def _decode(self, coding): if self.compress: with ops.control_dependencies([ logging_ops.Print(0, [self._global_step], message="Start Decode Gradients on PS") ]): grads_and_vars, decode_data = decode(coding) return grads_and_vars, decode_data return coding, {}
def _model_fn(features, labels, mode):
    """Function that returns predictions, training loss, and training op."""
    mode_keys = model_fn_lib.ModeKeys
    is_train = mode == mode_keys.TRAIN
    is_eval = mode == mode_keys.EVAL
    is_infer = mode == mode_keys.INFER

    # Temporarily take the weights column out of the features; it is put
    # back before returning.
    weights = None
    if weights_name and weights_name in features:
        weights = features.pop(weights_name)

    graph_builder = graph_builder_class(
        params, device_assigner=device_assigner)

    inference = {}
    if is_eval or is_infer:
        probabilities = graph_builder.inference_graph(features)
        inference[eval_metrics.INFERENCE_PROB_NAME] = probabilities
        if not params.regression:
            inference[eval_metrics.INFERENCE_PRED_NAME] = math_ops.argmax(
                probabilities, 1)

    # labels might be None if we're doing prediction (which brings up the
    # question of why we force everything to adhere to a single model_fn).
    loss_deps = []
    training_graph = None
    if is_train and labels is not None:
        grow_op = graph_builder.training_graph(
            features,
            labels,
            input_weights=weights,
            num_trainers=num_trainers,
            trainer_id=trainer_id)
        step_op = state_ops.assign_add(
            contrib_framework.get_global_step(), 1)
        training_graph = control_flow_ops.group(grow_op, step_op)
        loss_deps.append(training_graph)

    training_loss = None
    if is_eval or is_train:
        with ops.control_dependencies(loss_deps):
            training_loss = graph_builder.training_loss(
                features, labels, name=LOSS_NAME)
        if report_feature_importances and is_eval:
            training_loss = logging_ops.Print(
                training_loss, [graph_builder.feature_importances()],
                summarize=1000)

    # Put weights back in
    if weights is not None:
        features[weights_name] = weights

    if early_stopping_rounds:
        training_hooks = [TensorForestLossHook(early_stopping_rounds)]
    else:
        training_hooks = []

    return model_fn_lib.ModelFnOps(
        mode=mode,
        predictions=inference,
        loss=training_loss,
        train_op=training_graph,
        training_hooks=training_hooks)
def maybe_warn_on_large_rejection(accept_dist, initial_dist):
    """Returns `accept_dist`, logging a warning when rejection is high.

    The rejected proportion is sum((1 - accept_dist) * initial_dist). When it
    is not below .5, the returned tensor is routed through a Print op.

    Args:
      accept_dist: Tensor of acceptance values.
      initial_dist: Tensor that weights the rejection values.

    Returns:
      The result of a cond whose branches both yield `accept_dist`.
    """
    proportion_rejected = math_ops.reduce_sum(
        (1 - accept_dist) * initial_dist)

    def _pass_through():
        return accept_dist

    def _pass_through_with_warning():
        return logging_ops.Print(
            accept_dist, [proportion_rejected, initial_dist, accept_dist],
            message="Proportion of examples rejected by sampler is high: ",
            summarize=100,
            first_n=10)

    rejection_is_low = math_ops.less(proportion_rejected, .5)
    return control_flow_ops.cond(rejection_is_low, _pass_through,
                                 _pass_through_with_warning)
def testPrintGradient(self):
    """Checks that wrapping a tensor in Print leaves its gradient unchanged."""
    inp = constant_op.constant(2.0, shape=[100, 32], name="in")
    w = constant_op.constant(4.0, shape=[10, 100], name="w")
    wx = math_ops.matmul(w, inp, name="wx")
    wx_print = logging_ops.Print(wx, [w, w, w])

    # Gradient w.r.t. w of the plain product and of the Print-wrapped one.
    plain_grad = gradients_impl.gradients(wx, w)[0]
    printed_grad = gradients_impl.gradients(wx_print, w)[0]

    plain_value = self.evaluate(plain_grad)
    printed_value = self.evaluate(printed_grad)
    self.assertAllEqual(plain_value, printed_value)
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of "loss" for the variables in "var_list".

    This simply wraps the compute_gradients() from the real optimizer and adds
    debug Print ops: one marker before the gradients are computed, and one per
    gradient that fires once that gradient is available.

    The gradients will be aggregated in the apply_gradients() so that user can
    modify the gradients like clipping with per replica global norm if needed.
    The global norm with aggregated gradients can be bad as one replica's huge
    gradients can hurt the gradients from other replicas.

    Args:
      *args: Arguments for compute_gradients().
      **kwargs: Keyword arguments for compute_gradients().

    Returns:
      A list of (gradient, variable) pairs. Non-None gradients are replaced by
      a Print op that passes the gradient through; pairs whose gradient is
      None are returned untouched.
    """
    start_marker = logging_ops.Print(
        0, [0], message="Starting to compute gradients")
    with ops.control_dependencies([start_marker]):
        grads_and_vars = self._opt.compute_gradients(*args, **kwargs)

    for index, (grad, var) in enumerate(grads_and_vars):
        if grad is None:
            # The wrapped optimizer reports variables without a gradient as
            # (None, var). None cannot be a control input or a Print input,
            # so leave the pair as-is; apply_gradients() handles None.
            continue
        with ops.control_dependencies([grad]):
            done_marker = logging_ops.Print(
                grad, [0],
                message="Done computing gradient %d, grad shape: %s" %
                (index, var.get_shape()))
        grads_and_vars[index] = (done_marker, var)
    return grads_and_vars
def top_1_accuracy(config, embeddings_positive, embeddings_anchor):
    """Computes top-1 retrieval accuracy between two embedding batches.

    Row i of the anchor batch counts as correct when its largest dot product
    against the positive batch is at column i.

    Args:
      config: Object exposing `BATCH_SIZE`, the number of labels to generate.
      embeddings_positive: 2-D batch of embeddings (transposed in the matmul).
      embeddings_anchor: 2-D batch of embeddings (left operand of the matmul).

    Returns:
      A scalar float32 tensor: the mean of the per-row correctness indicator.
    """
    scores = math_ops.matmul(
        embeddings_anchor,
        embeddings_positive,
        transpose_a=False,
        transpose_b=True)
    best_match = tf.math.argmax(input=scores, axis=1)
    expected_match = tf.range(config.BATCH_SIZE, dtype=tf.int32)

    # Bring the argmax result to the label dtype so the two can be compared.
    if expected_match.dtype != best_match.dtype:
        best_match = math_ops.cast(best_match, expected_match.dtype)

    hits = math_ops.cast(
        math_ops.equal(best_match, expected_match), tf.float32)
    hits = logging_ops.Print(hits, ['acc:', tf.reduce_mean(hits)])
    return tf.reduce_mean(hits)
def dynamic_print(*values):
    """Implementation of print using dynamic dispatch.

    Uses tf.Print when every value is compatible with it; otherwise the
    builtin print is wrapped in a py_func.

    Args:
      *values: values to print

    Returns:
      A dummy value indicating the print completed.
    """
    tf_printable = all(is_tf_print_compatible(v) for v in values)
    if not tf_printable:
        return py_func.wrap_py_func(
            print, None, values, use_dummy_return=True)
    return logging_ops.Print(1, values)
def testCaptureControls(self):
    """Checks that a Defun cannot capture an outer tensor as a control dep."""
    graph = ops.Graph()
    with graph.as_default():
        outer = constant_op.constant([10.0])
        outer = logging_ops.Print(outer, [outer], "outer")

        @function.Defun(dtypes.float32)
        def Foo(y):
            # The control input belongs to the enclosing graph, not to the
            # function body being traced.
            with ops.control_dependencies([outer]):
                return logging_ops.Print(y, [y], "inner")

        with self.assertRaisesRegexp(ValueError,
                                     "not an element of this graph."):
            # NOTE: We still do not support capturing control deps.
            _ = Foo(outer)
def call_print(*values):
    """Compiled counterpart of the print builtin.

    Uses tf.Print when every value is compatible with it; otherwise the
    builtin print is wrapped in a py_func.

    Args:
      *values: values to print

    Returns:
      A dummy value indicating the print completed.
    """
    tf_printable = all(is_tf_print_compatible(v) for v in values)
    if not tf_printable:
        return py_func.wrap_py_func(
            print, None, values, use_dummy_return=True)
    return logging_ops.Print(1, values)
def encode(grads_and_vars, r=2, shapes=None):
    """Replaces each gradient in `grads_and_vars` with its SVD encoding.

    The list is modified in place and also returned.

    Args:
      grads_and_vars: List of (gradient, variable) pairs. Every gradient must
        expose `.device`.
      r: Rank forwarded to `_svd_encode`.
      shapes: Optional per-gradient shapes, aligned with `grads_and_vars`.
        When omitted, each gradient's own `get_shape()` is used.

    Returns:
      `grads_and_vars`, with each gradient replaced by the value returned by
      `_svd_encode`. Codes that are dicts additionally get an 'n_bytes' entry
      holding `_list_bytes(grads_and_vars)`.
    """
    if shapes is None:
        # The declared default used to crash in zip(); fall back to the
        # gradients' static shapes instead.
        shapes = [grad.get_shape() for grad, _ in grads_and_vars]

    for i, ((grad, var), shape) in enumerate(zip(grads_and_vars, shapes)):
        progress_marker = logging_ops.Print(
            0, [i], message="Encoding Gradients on Workers")
        with ops.control_dependencies([progress_marker]):
            # Encode on the device where the gradient lives.
            with tf.device(grad.device):
                code = _svd_encode(grad, r=r, ndims=len(shape), shape=shape)
        grads_and_vars[i] = (code, var)

    # Measured after encoding, so it reflects the encoded list.
    n_bytes = _list_bytes(grads_and_vars)
    for code, _ in grads_and_vars:
        if isinstance(code, dict):
            code['n_bytes'] = n_bytes
    return grads_and_vars
def zero_grad_function():
    """Builds an all-zero gradient for every input of the enclosing `op`.

    The zero gradient for the first input is routed through a Print op.

    Returns:
      A list with one zero tensor per entry of `op.inputs`, each matching the
      corresponding input's dynamic shape and dtype.
    """
    zero_grads = []
    with ops.name_scope(op.name + "_grad"):
        # pylint: disable=protected-access
        with ops.get_default_graph()._original_op(op):
            for position, op_input in enumerate(op.inputs):
                zeros = tf.zeros(tf.shape(op_input), dtype=op_input.dtype)
                if position == 0:
                    zeros = logging_ops.Print(
                        zeros, [zeros],
                        message="I'm a straggler; Piping up zeros.")
                zero_grads.append(zeros)
    return zero_grads
def get_init_tokens_op(self, num_tokens=-1):
    """Returns the ops that put an initial token into every sync token queue.

    This is supposed to be executed in the beginning of the chief/sync thread.
    One enqueue op (of the current global step) is built per replica queue,
    each preceded by a debug Print.

    Args:
      num_tokens: Number of tokens requested. -1 means "one per replica".
        Any other value must be at least `total_num_replicas`.

    Returns:
      A list of enqueue ops, one per entry of the sync token queues.

    Raises:
      ValueError: If this is called before apply_gradients().
      ValueError: If num_tokens is smaller than total_num_replicas.
    """
    if self._gradients_applied is False:
        raise ValueError(
            "get_init_tokens_op() should be called after apply_gradients().")

    replica_count = self._total_num_replicas
    if num_tokens == -1:
        num_tokens = replica_count
    if num_tokens < replica_count:
        raise ValueError(
            "Too few tokens to finish the first step: %d (given) vs %d (needed)"
            % (num_tokens, replica_count))

    enqueue_ops = []
    with ops.device(self._global_step.device), ops.name_scope(""):
        # This fill op is created but its result is not used below.
        array_ops.fill([num_tokens], self._global_step)
        for queue_index in range(replica_count):
            announce = logging_ops.Print(
                self._global_step, [self._global_step],
                message="Init token queue")
            with ops.control_dependencies([announce]):
                token_queue = self._sync_token_queues[queue_index]
                enqueue_ops.append(token_queue.enqueue(self._global_step))
    return enqueue_ops
def npairs_loss_fan(labels,
                    embeddings_anchor,
                    embeddings_positive,
                    reg_lambda=3e-3,
                    print_losses=False,
                    hard_ori=False,
                    HardOrNot=None):
    """Computes a variant of the npairs loss.

    The softmax targets here are the label *inequality* matrix (1.0 where two
    rows carry different labels) and are not row-normalized.

    Args:
      labels: 1-D label `Tensor`.
      embeddings_anchor: 2-D Tensor of anchor embeddings.
      embeddings_positive: 2-D Tensor of positive embeddings.
      reg_lambda: Float. L2 regularization term on the embedding vectors.
      print_losses: Boolean. Option to print the xent and l2loss.
      hard_ori: Boolean. When true, the mean cross entropy is multiplied by
        `HardOrNot`.
      HardOrNot: Multiplier applied to the cross entropy when `hard_ori` is
        true.

    Returns:
      The sum of the L2 term and the (optionally scaled) cross entropy term.
    """
    # pylint: enable=line-too-long

    def _mean_squared_norm(embeddings):
        # Batch mean of each row's squared L2 norm.
        return math_ops.reduce_mean(
            math_ops.reduce_sum(math_ops.square(embeddings), 1))

    # Regularizer on the embeddings.
    anchor_norm = _mean_squared_norm(embeddings_anchor)
    positive_norm = _mean_squared_norm(embeddings_positive)
    l2loss = math_ops.multiply(
        0.25 * reg_lambda, anchor_norm + positive_norm, name='l2loss')

    # Anchor-vs-positive dot products, used as logits.
    similarity_matrix = math_ops.matmul(
        embeddings_anchor,
        embeddings_positive,
        transpose_a=False,
        transpose_b=True)

    # Turn the [batch_size] label vector into a [batch_size, 1] column.
    label_shape = array_ops.shape(labels)
    assert label_shape.shape == 1
    label_column = array_ops.reshape(labels, [label_shape[0], 1])

    # Row normalization of the targets is intentionally left disabled:
    # labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
    different_label = math_ops.not_equal(label_column,
                                         array_ops.transpose(label_column))
    targets = math_ops.to_float(different_label)

    # Softmax cross entropy, averaged over rows.
    per_row_xent = nn.softmax_cross_entropy_with_logits(
        logits=similarity_matrix, labels=targets)
    xent_loss = math_ops.reduce_mean(per_row_xent, name='xentropy')

    if hard_ori:
        xent_loss = tf.multiply(xent_loss, HardOrNot)

    if print_losses:
        xent_loss = logging_ops.Print(
            xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
def f(include_print):
    """Draws a length-5 normal sample from the global generator.

    Args:
      include_print: When truthy, the shape tensor is routed through a Print
        op before being used.

    Returns:
      The tensor returned by the global generator's `normal`.
    """
    dims = constant_op.constant([5])
    sample_shape = logging_ops.Print(dims, [dims]) if include_print else dims
    return random.get_global_generator().normal(sample_shape)
def apply_gradients(self, grads_and_vars, worker_id, global_step=None, name=None, collect_cdfs=False):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps
    the apply_gradients() from the real optimizer. It builds, in order:

      1. one conditional accumulator per non-None gradient,
      2. "phase 1" ops that push this worker's gradients into the
         accumulators (plus debug Print ops and a stop-queue check),
      3. "phase 2" ops that take the aggregated gradients and hand them to the
         wrapped optimizer, followed by ops that enqueue a token into every
         sync token queue; these are attached to the chief queue runner.

    Besides returning `train_op`, this sets several attributes on `self`,
    among them `_global_step`, `_local_step`, `local_step_init_op`,
    `ready_for_local_init_op`, `_wait_op`, `print_sizes`,
    `print_accum_sizes`, `print_local_step`, `_update_op`,
    `_chief_queue_runner`, `chief_init_op` and `_gradients_applied`, and
    appends to `self._accumulator_list`.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      worker_id: Index of this worker. Selects the entry of
        `self._sync_token_queues` to wait on and is formatted into the
        "job:worker/task:%d" device string for the accumulator apply ops.
      global_step: Variable passed to the wrapped optimizer and used to stamp
        gradients. Required in practice despite the `None` default.
      name: Not used by this implementation.
      collect_cdfs: If true, phase 2 takes `self._total_num_replicas`
        gradients from each accumulator; otherwise it takes 1.

    Returns:
      train_op: A Print op over the local step that carries control
      dependencies on every op collected in `train_ops`. This is executed by
      each replica.

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
      ValueError: If a gradient is neither a Tensor nor IndexedSlices.
    """
    if not grads_and_vars:
        raise ValueError("Must supply at least one variable")
    if global_step is None:
        raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []  # Ops the returned train_op must wait for.
    aggregated_grad = []  # Phase 2 outputs, aligned with var_list.
    var_list = []
    printer_ops = []  # Debug ops the sync (token enqueue) step waits for.

    # True branch of the stop check: enqueue the global step into the stop
    # queue, then print.
    def f_pos():
        enq_total_ops = self._stop_queue.enqueue(global_step)
        '''
        for worker_id in range(self._total_num_replicas):
          enq_ops = self._should_stop_queues[worker_id].enqueue(global_step)
          with ops.control_dependencies([enq_ops]):
            L = []
        '''
        # ret_pos = [tf.constant(i) for i in range(self._construtor)]
        with ops.control_dependencies([enq_total_ops]):
            return tf.Print(global_step, [global_step], message="Enquequed to stop queue")
        # ret_pos = tf.Variable(33)
        # return ret_pos

    # False branch of the stop check: only print.
    def f_neg():
        # ret_neg = [tf.constant(i+5) for i in range(self._construtor)]
        # NOTE(review): ret_neg is created but never used.
        ret_neg = tf.Variable(22)
        return tf.Print(global_step, [global_step], message="Nothing to stop queue")

    # worker_id_list_printer = logging_ops.Print(global_step,
    # [a for a in self._worker_idx_list] + [worker_id] + [global_step],
    # message="Worker ID list status")
    # train_ops.append(worker_id_list_printer)

    # Per-worker copy of the global step, kept as a local variable.
    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step, global_step._ref())
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.all_variables())

    # The wait op waits for the current worker to dequeue a token from its respective token queue
    self._wait_op = self._sync_token_queues[worker_id].dequeue()

    # Replicas have to wait until they can get a token from the token queue
    # BEFORE begining to compute gradients.
    with ops.device(global_step.device):
        # NOTE(review): queue_size is not referenced again in this method.
        queue_size = self._sync_token_queues[worker_id].size()
        update_local_step_op = state_ops.assign(self._local_step, global_step._ref())

    # Gradient accum creation
    with ops.name_scope(None, self._name):
        for grad, var in grads_and_vars:
            var_list.append(var)
            tf.logging.info("Grad " + str(grad) + " assigned to " + str(var.device))
            with ops.device(var.device):
                if grad is None:
                    # NOTE(review): no accumulator is appended for a None
                    # gradient, yet later loops index _accumulator_list by
                    # position in grads_and_vars -- confirm None gradients
                    # cannot occur here, otherwise the indices drift.
                    continue
                elif isinstance(grad, ops.Tensor):
                    grad_accum = data_flow_ops.ConditionalAccumulator(
                        grad.dtype,
                        shape=var.get_shape(),
                        shared_name=var.name + "/grad_accum")
                else:
                    if not isinstance(grad, ops.IndexedSlices):
                        raise ValueError("Unknown grad type!")
                    grad_accum = data_flow_ops.SparseConditionalAccumulator(
                        grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
                self._accumulator_list.append((grad_accum, var))

        """# Phase 1 gradient computation
        with ops.control_dependencies([update_local_step_op]):
          for index, (grad, var) in enumerate(grads_and_vars):
            with ops.device(var.device):
              if grad is None:
                continue
              elif isinstance(grad, ops.Tensor):
                grad_accum = self._accumulator_list[index][0]
                train_ops.append(grad_accum.apply_grad(grad, local_step=self._local_step._ref()))
              else:
                if not isinstance(grad, ops.IndexedSlices):
                  raise ValueError("Unknown grad type!")
                grad_accum = self._accumulator_list[index][0]
                train_ops.append(grad_accum.apply_indexed_slices_grad(
                  grad, local_step=self._local_step._ref()))"""

        # Phase 1 gradient computation
        # Every apply op is built after the local step has been refreshed
        # from the global step.
        with ops.control_dependencies([update_local_step_op]):
            for index, (grad, var) in enumerate(grads_and_vars):
                print_start_op = logging_ops.Print(
                    global_step, [global_step],
                    message="Starting to apply grads for variable %d" % index)
                with ops.device(var.device):
                    if grad is None:
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = self._accumulator_list[index][0]
                        with ops.control_dependencies([print_start_op]):
                            with tf.device("job:worker/task:%d" % worker_id):
                                apply_grad_op = grad_accum.apply_grad(
                                    grad, local_step=self._local_step._ref())
                                with ops.control_dependencies([apply_grad_op]):
                                    finished_print_op = logging_ops.Print(
                                        global_step, [global_step],
                                        message="Done applying grads for variable %d" % index)
                                    train_ops.append(finished_print_op)
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = self._accumulator_list[index][0]
                        with ops.control_dependencies([print_start_op]):
                            with tf.device("job:worker/task:%d" % worker_id):
                                apply_grad_op = grad_accum.apply_indexed_slices_grad(
                                    grad, local_step=self._local_step._ref())
                                with ops.control_dependencies([apply_grad_op]):
                                    finished_print_op = logging_ops.Print(
                                        global_step, [global_step],
                                        message="Done applying grads for variable %d" % index)
                                    train_ops.append(finished_print_op)

            # NOTE(review): apply_grad_op here is whichever apply op was
            # built last in the loop above; it is unbound if every gradient
            # was None.
            with ops.control_dependencies([apply_grad_op]):
                accum_sizes_printer = logging_ops.Print(
                    global_step,
                    [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id] + [global_step],
                    message="Accum aggregated status on ps")
                train_ops.append(accum_sizes_printer)

                # Stop check: compare the first accumulator's count against
                # the configured threshold and pick f_pos / f_neg.
                x = self._accumulator_list[0]
                ret = tf.cond(
                    tf.greater_equal(x[0].num_accumulated(), self._constant_for_comparison),
                    f_pos, f_neg)
                should_stop_list_printer = logging_ops.Print(
                    global_step, [ret],
                    message="Should stop ret val status on ps")
                train_ops.append(should_stop_list_printer)

                with ops.control_dependencies([ret]):
                    queue_total_printer = logging_ops.Print(
                        global_step, [self._stop_queue.size()],
                        message="shared should stop queue size")
                    train_ops.append(queue_total_printer)

        # Phase 2 gradient applying
        for index, (grad, var) in enumerate(grads_and_vars):
            with ops.device(var.device):
                grad_accum = self._accumulator_list[index][0]
                if grad is None:
                    aggregated_grad.append(None)
                elif isinstance(grad, ops.Tensor):
                    if collect_cdfs:
                        aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
                    else:
                        aggregated_grad.append(grad_accum.take_grad(1))
                else:
                    if collect_cdfs:
                        # NOTE(review): this sparse branch calls take_grad
                        # while the branch below calls
                        # take_indexed_slices_grad -- confirm this is
                        # intended.
                        aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
                    else:
                        aggregated_grad.append(grad_accum.take_indexed_slices_grad(1))

        aggregated_grads_and_vars = zip(aggregated_grad, var_list)

        # Some debug operations
        self.print_sizes = logging_ops.Print(
            global_step,
            [self._sync_token_queues[i].size() for i in range(self._total_num_replicas)],
            message="queue sizes")
        self.print_accum_sizes = logging_ops.Print(
            self._local_step,
            [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id],
            message="Accum sizes")
        self.print_local_step = logging_ops.Print(
            self._local_step, [self._local_step._ref(), global_step._ref()],
            message="local vs global step")

        # sync_op will be assigned to the same device as the global step.
        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies([self.print_accum_sizes]):
                update_op = self._opt.apply_gradients(aggregated_grads_and_vars, global_step)
                self._update_op = update_op

                # Drain whatever is currently in the stop queue.
                num_to_dequeue = self._stop_queue.size()
                deq_ops = self._stop_queue.dequeue_many(num_to_dequeue)
                with ops.control_dependencies([deq_ops]):
                    size_printer_2 = logging_ops.Print(
                        global_step, [self.print_accum_sizes],
                        message="Complelted the dequeue operation!")
                    printer_ops.append(size_printer_2)

                # After the update and the debug prints, hand one token (the
                # global step) to every worker's token queue.
                with ops.control_dependencies(printer_ops):
                    with ops.control_dependencies([update_op]):
                        sync_op = []
                        for cur_worker_id in range(self._total_num_replicas):
                            sync_op.append(self._sync_token_queues[cur_worker_id].enqueue(global_step))
                        sync_op = control_flow_ops.group(*(sync_op))

            # dummy_queue is passed to the queue runner. Don't use the real queues
            # because the queue runner doesn't automatically reopen it once it
            # closed queues in PS devices.
            dummy_queue = (data_flow_ops.FIFOQueue(
                1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue"))
            self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue, [sync_op])

        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies(train_ops):
                # Worker finished applying gradients. Add token to phase1_finished_queue
                train_op = logging_ops.Print(
                    self._local_step._ref(),
                    [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id] + [global_step],
                    message="Finished worker updates",
                    name="FinishedWorkerUpdatesPrint")

        # The chief init op also stamps every accumulator with the global
        # step.
        for accum, var in self._accumulator_list:
            with ops.device(var.device):
                chief_init_ops.append(
                    accum.set_global_step(global_step, name="SetGlobalStep"))
        self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
        self._gradients_applied = True
        return train_op
def Foo(y):
    """Passes `y` through a Print op that is control-dependent on outer `x`."""
    with ops.control_dependencies([x]):
        return logging_ops.Print(y, [y], "inner")
def train(target, all_data, all_labels, cluster_spec):
    """Build the distributed training graph and run the training loop.

    Args:
        target: Session target handed to `sv.prepare_or_wait_for_session`.
        all_data: Training examples; `all_data.shape[0]` is used as the number
            of examples and the object is forwarded to `fill_feed_dict`.
        all_labels: Labels forwarded to `fill_feed_dict`.
        cluster_spec: Cluster description; `cluster_spec.as_dict()` must
            contain non-empty 'worker' and 'ps' entries.

    Raises:
        AssertionError: If there are no workers or no parameter servers, or if
            the loss becomes NaN during training.
    """
    image_placeholder = tf.placeholder(
        dtype=tf.float32,
        shape=[FLAGS.batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_DEPTH])
    label_placeholder = tf.placeholder(dtype=tf.int32,
                                       shape=[FLAGS.batch_size])

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    # -1 means "aggregate the gradients of every worker".
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    assert num_workers > 0 and num_parameter_servers > 0, (
        ' num_workers and '
        'num_parameter_servers'
        ' must be > 0.')

    # Task 0 acts as the chief.
    is_chief = (FLAGS.task_id == 0)
    num_examples = all_data.shape[0]

    with tf.device(
            tf.train.replica_device_setter(
                # cpu only
                # worker_device='/job:worker/task:%d' % FLAGS.task_id,
                # with gpu enabled
                worker_device='/job:worker/task:%d/gpu:0' % FLAGS.task_id,
                cluster=cluster_spec)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        num_batches_per_epoch = (num_examples / FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Logits of training data and validation data come from the same
        # graph. The inference of validation data shares all the weights with
        # train data. This is implemented by passing reuse=True to the
        # variable scopes of the train graph.
        logits = inference(image_placeholder,
                           FLAGS.num_residual_blocks,
                           reuse=False)

        # The following code calculates the train loss, which consists of the
        # softmax cross entropy and the regularization loss.
        # regu_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = calc_loss(logits, label_placeholder)

        # Wrap the base optimizer in the synchronization strategy selected by
        # the flags.
        opt = tf.train.AdamOptimizer(lr)
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            opt = TimeoutReplicasOptimizer(opt,
                                           global_step,
                                           total_num_replicas=num_workers)
        elif FLAGS.backup_worker_method:
            opt = BackupOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)
        else:
            use_svd_compress = FLAGS.svd_rank > 0
            kwargs = {
                'replicas_to_aggregate': num_replicas_to_aggregate,
                'total_num_replicas': num_workers,
                'compress': use_svd_compress,
                'svd_rank': FLAGS.svd_rank
            }
            print('#' * 40)
            print(kwargs)
            print('#' * 40)
            opt = LowCommSync(opt, global_step=global_step, **kwargs)

        # Compute gradients with respect to the loss.
        grads = opt.compute_gradients(total_loss)
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            apply_gradients_op = opt.apply_gradients(
                grads,
                FLAGS.task_id,
                global_step=global_step,
                collect_cdfs=FLAGS.worker_times_cdf_method)
            # apply_gradients_op = opt.apply_gradients(grads, FLAGS.task_id, global_step=global_step)
        elif FLAGS.backup_worker_method:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     FLAGS.task_id,
                                                     global_step=global_step)
        else:
            # SVD encode happens right here:
            shapes = [g.get_shape() for g, _ in grads]
            if use_svd_compress:
                # NOTE(review): r is hard-coded to 1 here although the
                # optimizer is configured with FLAGS.svd_rank -- confirm.
                encoded_grads = encode(grads, r=1, shapes=shapes)
                apply_gradients_op = opt.apply_gradients(
                    encoded_grads, global_step=global_step)
            else:
                apply_gradients_op = opt.apply_gradients(
                    grads, global_step=global_step)

        # Fetching train_op returns the loss and forces the gradient update.
        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Initialize a saver to save checkpoints. Merge all summaries, so we
        # can run all summarizing operations by running summary_op.
        # Initialize a new session.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()

        # Never fetched below; kept because it adds an op to the graph.
        test_print_op = logging_ops.Print(0, [0],
                                          message="Test print success")

        if is_chief:
            local_init_op = opt.chief_init_op
        else:
            local_init_op = opt.local_step_init_op
        ready_for_local_init_op = opt.ready_for_local_init_op

        sv = tf.train.Supervisor(
            is_chief=is_chief,
            local_init_op=local_init_op,
            ready_for_local_init_op=ready_for_local_init_op,
            logdir=FLAGS.train_dir,
            init_op=init_op,
            summary_op=None,
            global_step=global_step,
            saver=saver,
            save_model_secs=FLAGS.save_interval_secs)

        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)

        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
            # NOTE(review): this parses as
            # `(not interval_method) or worker_times_cdf_method` -- confirm
            # that this is the intended grouping.
            if not FLAGS.interval_method or FLAGS.worker_times_cdf_method:
                sv.start_queue_runners(sess, chief_queue_runners)
            sess.run(init_tokens_op)

        timeout_client, timeout_server = launch_manager(sess, FLAGS)
        next_summary_time = time.time() + FLAGS.save_summaries_secs
        begin_time = time.time()
        cur_iteration = -1
        local_data_batch_idx = 0
        epoch_counter = 0

        if FLAGS.task_id == 0 and FLAGS.interval_method:
            opt.start_interval_updates(sess, timeout_client)

        # Disabled experiment, kept for reference:
        # np.random.seed(SEED)
        # b = np.ones(int(num_batches_per_epoch))
        # interval = np.arange(0, int(num_batches_per_epoch))
        # idx_list = np.random.choice(interval, int(num_workers), replace=False)

        while not sv.should_stop():
            # try:
            sys.stdout.flush()
            tf.logging.info("A new iteration...")
            cur_iteration += 1

            if FLAGS.worker_times_cdf_method:
                sess.run([opt._wait_op])
                timeout_client.broadcast_worker_dequeued_token(cur_iteration)

            start_time = time.time()
            epoch_counter, local_data_batch_idx, feed_dict = fill_feed_dict(
                all_data, all_labels, image_placeholder, label_placeholder,
                FLAGS.batch_size, local_data_batch_idx, epoch_counter)

            run_options = tf.RunOptions()
            run_metadata = tf.RunMetadata()
            if FLAGS.timeline_logging:
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_options.output_partition_graphs = True

            # feed_dict[weight_vec_placeholder] = ls_solution
            # The batch index goes with the first label and the epoch counter
            # with the second (the two arguments used to be swapped).
            tf.logging.info("Data batch index: %s, Current epoch idex: %s" %
                            (str(local_data_batch_idx), str(epoch_counter)))

            loss_value, step = sess.run(
                # [train_op, global_step], feed_dict={feed_dict, x}, run_metadata=run_metadata, options=run_options)
                [train_op, global_step],
                feed_dict=feed_dict,
                run_metadata=run_metadata,
                options=run_options)

            if FLAGS.worker_times_cdf_method:
                timeout_client.broadcast_worker_finished_computing_gradients(
                    cur_iteration)

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if FLAGS.timeline_logging:
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                # Bind the handle with `as f`; without it the write below
                # raised NameError.
                with open(
                        '%s/worker=%d_timeline_iter=%d.json' %
                        (FLAGS.train_dir, FLAGS.task_id, step), 'w') as f:
                    f.write(ctf)

            if step > FLAGS.max_steps:
                break

            duration = time.time() - start_time
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %f'
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

            if (is_chief and next_summary_time < time.time()
                    and FLAGS.should_summarize):
                tf.logging.info('Running Summary operation on the chief.')
                summary_str = sess.run(summary_op)
                sv.summary_computed(sess, summary_str)
                tf.logging.info('Finished running Summary operation.')
                next_summary_time += FLAGS.save_summaries_secs
            # except tf.errors.DeadlineExceededError:
            #     tf.logging.info("Killed at time %f" % time.time())
            #     sess.reset_kill()
            # except:
            #     tf.logging.info("Unexpected error: %s" % str(sys.exc_info()[0]))
            #     sess.reset_kill()

        if is_chief:
            tf.logging.info('Elapsed Time: %f' % (time.time() - begin_time))

        sv.stop()

        # The chief writes one final checkpoint after the loop ends.
        if is_chief:
            saver.save(sess,
                       os.path.join(FLAGS.train_dir, 'model.ckpt'),
                       global_step=global_step)
def stratified_sample(tensors,
                      labels,
                      target_probs,
                      batch_size,
                      init_probs=None,
                      enqueue_many=False,
                      queue_capacity=16,
                      threads_per_queue=1,
                      name=None):
    """Builds batches whose class mix follows `target_probs` by rejection.

    Examples are stochastically discarded. Two queues are created internally:
    one that amortizes the cost of disk reads, and one that holds the batch
    with the requested class proportions.

    Args:
      tensors: List of data tensors. Each is a single item or a batch,
          depending on `enqueue_many`.
      labels: Label tensor for the data: a single integer or a batch of them,
          depending on `enqueue_many`. Not a one-hot vector.
      target_probs: Desired class proportions in the output batch. Any object
          with a registered Tensor conversion function.
      batch_size: Size of the returned batch.
      init_probs: Class proportions of the incoming data (any object with a
          registered Tensor conversion function), or `None` to estimate the
          distribution online.
      enqueue_many: Bool. If true, the inputs carry a leading batch dimension.
      queue_capacity: Capacity of the large queue holding input examples.
      threads_per_queue: Number of threads used for the large input queue and
          for the final, properly proportioned queue.
      name: Optional prefix for the ops created here.

    Raises:
      ValueError: If `tensors` isn't iterable.
      ValueError: `enqueue_many` is True and labels has no batch dimension, or
          `enqueue_many` is False and labels isn't a scalar.
      ValueError: `enqueue_many` is True and the batch dimensions of data and
          labels don't match.
      ValueError: If probs don't sum to one.
      ValueError: If a class with zero initial probability has a nonzero
          target probability.
      TFAssertion: If labels aren't integers in [0, num classes).

    Returns:
      (data_batch, label_batch), where data_batch is a list of tensors with
      the same length as `tensors`.

    Example:
      # Get tensor for a single data and label example.
      data, label = data_provider.Get(['data', 'label'])

      # Get stratified batch according to per-class probabilities.
      target_probs = [...distribution you want...]
      [data_batch], labels = tf.contrib.training.stratified_sample(
          [data], label, target_probs)

      # Run batch through network.
      ...
    """
    with ops.name_scope(name, 'stratified_sample', list(tensors) + [labels]):
        data_tensors = ops.convert_n_to_tensor_or_indexed_slices(tensors)
        labels = ops.convert_to_tensor(labels)
        target_probs = ops.convert_to_tensor(target_probs,
                                             dtype=dtypes.float32)

        # Treat a single example as a batch of size one.
        if not enqueue_many:
            data_tensors = [
                array_ops.expand_dims(item, 0) for item in data_tensors
            ]
            labels = array_ops.expand_dims(labels, 0)

        if init_probs is not None:
            init_probs = ops.convert_to_tensor(init_probs,
                                               dtype=dtypes.float32)
        else:
            # With no `init_probs`, the data distribution is estimated online.
            # The class count comes from `target_probs`, so its shape has to
            # be fully known while the graph is being built.
            target_probs.get_shape().assert_is_fully_defined()
            num_classes = target_probs.get_shape().num_elements()
            init_probs = _estimate_data_distribution(labels, num_classes)

        # Check that the inputs agree with each other.
        data_tensors, labels, checked_probs = _verify_input(
            data_tensors, labels, [init_probs, target_probs])
        init_probs, target_probs = checked_probs

        # Every class that never occurs initially must also have a zero
        # target probability.
        init_is_nonzero = math_ops.not_equal(init_probs, 0)
        target_is_zero = math_ops.equal(target_probs, 0)
        probs_are_consistent = math_ops.reduce_all(
            math_ops.logical_or(init_is_nonzero, target_is_zero))
        assert_op = control_flow_ops.Assert(probs_are_consistent, [
            'All classes with zero initial probability must also have zero target '
            'probability: ', init_probs, target_probs
        ])
        init_probs = control_flow_ops.with_dependencies([assert_op],
                                                        init_probs)

        # Per-class probabilities of accepting an example.
        raw_accept_probs = _calculate_acceptance_probabilities(
            init_probs, target_probs)
        proportion_rejected = math_ops.reduce_sum(
            (1 - raw_accept_probs) * init_probs)

        def _accept_probs_with_warning():
            # Same values, but logs a notice the first ten times it runs.
            return logging_ops.Print(
                raw_accept_probs, [raw_accept_probs],
                message='Proportion of examples rejected by sampler is high.',
                first_n=10)

        accept_probs = control_flow_ops.cond(
            math_ops.less(proportion_rejected, .5),
            lambda: raw_accept_probs,
            _accept_probs_with_warning)

        # First queue: holds the input examples. The singleton batch
        # dimension is squeezed away on the way out.
        queued = input_ops.batch(data_tensors + [labels],
                                 batch_size=1,
                                 num_threads=threads_per_queue,
                                 capacity=queue_capacity,
                                 enqueue_many=True)
        example_values = [
            array_ops.squeeze(part, [0]) for part in queued[:-1]
        ]
        example_label = array_ops.squeeze(queued[-1], [0])

        # Second queue: an example is kept only when a uniform draw falls
        # below the acceptance probability of its class.
        label_accept_prob = array_ops.gather(accept_probs, example_label)
        keep_example = random_ops.random_uniform([]) < label_accept_prob
        sampled = input_ops.maybe_batch(example_values + [example_label],
                                        keep_input=keep_example,
                                        batch_size=batch_size,
                                        num_threads=threads_per_queue)

        return sampled[:-1], sampled[-1]
def train(target, all_data, all_labels, cluster_spec):
    """Build the distributed training graph and run the training loop.

    Args:
        target: Session target handed to `sv.prepare_or_wait_for_session`.
        all_data: Training examples; `all_data.shape[0]` is used as the number
            of examples and the object is forwarded to `fill_feed_dict`.
        all_labels: Labels forwarded to `fill_feed_dict`.
        cluster_spec: Cluster description; `cluster_spec.as_dict()` must
            contain non-empty 'worker' and 'ps' entries.

    Raises:
        AssertionError: If there are no workers or no parameter servers, or if
            the loss becomes NaN during training.
    """
    image_placeholder = tf.placeholder(
        dtype=tf.float32,
        shape=[FLAGS.batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_DEPTH])
    label_placeholder = tf.placeholder(dtype=tf.int32,
                                       shape=[FLAGS.batch_size])

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    # -1 means "aggregate the gradients of every worker".
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    assert num_workers > 0 and num_parameter_servers > 0, (
        ' num_workers and '
        'num_parameter_servers'
        ' must be > 0.')

    # Task 0 acts as the chief.
    is_chief = (FLAGS.task_id == 0)
    num_examples = all_data.shape[0]

    with tf.device(
            tf.train.replica_device_setter(
                # cpu only
                # worker_device='/job:worker/task:%d' % FLAGS.task_id,
                # with gpu enabled
                worker_device='/job:worker/task:%d/gpu:0' % FLAGS.task_id,
                cluster=cluster_spec)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        num_batches_per_epoch = (num_examples / FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Logits of training data and validation data come from the same
        # graph. The inference of validation data shares all the weights with
        # train data. This is implemented by passing reuse=True to the
        # variable scopes of the train graph.
        logits = inference(image_placeholder,
                           FLAGS.num_residual_blocks,
                           reuse=False)
        # vali_logits = inference(self.vali_image_placeholder, FLAGS.num_residual_blocks, reuse=True)

        # The following code calculates the train loss, which consists of the
        # softmax cross entropy and the regularization loss.
        # regu_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = calc_loss(logits, label_placeholder)
        # predictions = tf.nn.softmax(logits)
        # train_top1_error = top_k_error(predictions, label_placeholder, 1)

        # Wrap the base optimizer in the synchronization strategy selected by
        # the flags.
        opt = tf.train.AdamOptimizer(lr)
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            opt = TimeoutReplicasOptimizer(opt,
                                           global_step,
                                           total_num_replicas=num_workers)
        elif FLAGS.backup_worker_method:
            opt = BackupOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)
        else:
            # opt = tf.train.SyncReplicasOptimizerV2(
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)

        # Compute gradients with respect to the loss.
        grads = opt.compute_gradients(total_loss)

        # compute weighted gradients here.
        # =====================================================================
        # Disabled experiment, kept for reference:
        # # define a placeholder for weighted vector, i.e. LS solution
        # weight_vec_placeholder = tf.placeholder(dtype=tf.float32, shape=(num_workers,))
        # grad_list = [x[0] for x in grads]
        # new_grad_list = []
        # # times gradient from each worker with the corresponding weight
        # # which is just scalar multiplication
        # for g_idx in range(len(grad_list)):
        #     grad_on_worker = grad_list[g_idx]
        #     weight = tf.slice(weight_vec_placeholder, [FLAGS.task_id], [1])
        #     tf.logging.info("Logging Happens Here!")
        #     tf.logging.info(weight[0])
        #     new_grad_list.append(tf.scalar_mul(weight[0], grad_on_worker))
        # grad_new = []
        # # regenerate the weighted gradients, merging all weighted vector
        # for x_idx in range(len(grads)):
        #     grad_elem = grads[x_idx]
        #     grad_new.append((new_grad_list[x_idx], grad_elem[1]))
        # =====================================================================
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            apply_gradients_op = opt.apply_gradients(
                grads,
                FLAGS.task_id,
                global_step=global_step,
                collect_cdfs=FLAGS.worker_times_cdf_method)
            # apply_gradients_op = opt.apply_gradients(grads, FLAGS.task_id, global_step=global_step)
        elif FLAGS.backup_worker_method:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     FLAGS.task_id,
                                                     global_step=global_step)
        else:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)
            # apply_gradients_op = opt.apply_gradients(grad_new, global_step=global_step)

        # Fetching train_op returns the loss and forces the gradient update.
        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Initialize a saver to save checkpoints. Merge all summaries, so we
        # can run all summarizing operations by running summary_op.
        # Initialize a new session.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()

        # Never fetched below; kept because it adds an op to the graph.
        test_print_op = logging_ops.Print(0, [0],
                                          message="Test print success")

        if is_chief:
            local_init_op = opt.chief_init_op
        else:
            local_init_op = opt.local_step_init_op
        ready_for_local_init_op = opt.ready_for_local_init_op

        sv = tf.train.Supervisor(
            is_chief=is_chief,
            local_init_op=local_init_op,
            ready_for_local_init_op=ready_for_local_init_op,
            logdir=FLAGS.train_dir,
            init_op=init_op,
            summary_op=None,
            global_step=global_step,
            saver=saver,
            save_model_secs=FLAGS.save_interval_secs)

        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)

        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
            # NOTE(review): this parses as
            # `(not interval_method) or worker_times_cdf_method` -- confirm
            # that this is the intended grouping.
            if not FLAGS.interval_method or FLAGS.worker_times_cdf_method:
                sv.start_queue_runners(sess, chief_queue_runners)
            sess.run(init_tokens_op)

        timeout_client, timeout_server = launch_manager(sess, FLAGS)
        next_summary_time = time.time() + FLAGS.save_summaries_secs
        begin_time = time.time()
        cur_iteration = -1
        local_data_batch_idx = 0
        epoch_counter = 0

        if FLAGS.task_id == 0 and FLAGS.interval_method:
            opt.start_interval_updates(sess, timeout_client)

        # Disabled experiment, kept for reference:
        # np.random.seed(SEED)
        # b = np.ones(int(num_batches_per_epoch))
        # interval = np.arange(0, int(num_batches_per_epoch))
        # idx_list = np.random.choice(interval, int(num_workers), replace=False)

        while not sv.should_stop():
            # try:
            sys.stdout.flush()
            tf.logging.info("A new iteration...")
            cur_iteration += 1

            if FLAGS.worker_times_cdf_method:
                sess.run([opt._wait_op])
                timeout_client.broadcast_worker_dequeued_token(cur_iteration)

            start_time = time.time()
            epoch_counter, local_data_batch_idx, feed_dict = fill_feed_dict(
                all_data, all_labels, image_placeholder, label_placeholder,
                FLAGS.batch_size, local_data_batch_idx, epoch_counter)

            run_options = tf.RunOptions()
            run_metadata = tf.RunMetadata()

            # =================================================================
            # Disabled experiment, kept for reference:
            # LS_start_time = time.time()
            # interval_2 = np.arange(0, int(num_workers))
            # workers_to_kill = np.random.choice(interval_2, FLAGS.num_worker_kill, replace=False)
            # # interval_2 = np.arange(0, WORKER_NUM)
            # # workers_to_kill = np.random.choice(interval_2, NUM_WORKER_KILL, replace=False)
            # A = np.zeros((int(num_workers), int(num_batches_per_epoch)))
            # for i in range(A.shape[0]):
            #     if i == A.shape[0]-1:
            #         A[i][idx_list[i]] = 1
            #         A[i][idx_list[0]] = 1
            #     else:
            #         A[i][idx_list[i]] = 1
            #         A[i][idx_list[i+1]] = 1
            # for i in range(len(idx_list)):
            #     element = idx_list[i]
            #     if element == A.shape[1]-1:
            #         idx_list[i] = 0
            #     else:
            #         idx_list[i] += 1
            # for k in workers_to_kill:
            #     A[k] = 0
            # A_for_calc = np.transpose(A)
            # ls_solution = np.dot(np.linalg.pinv(A_for_calc), b)
            # tf.logging.info("workers killed this iteration:")
            # tf.logging.info(str(workers_to_kill))
            # tf.logging.info("The matrix to solve:")
            # for item in A_for_calc:
            #     tf.logging.info(str(item))
            # tf.logging.info("Solution of LS:")
            # tf.logging.info(str(ls_solution))
            # LS_duration = time.time() - LS_start_time
            # tf.logging.info("LS run time: %s" % str(LS_duration))
            # =================================================================
            if FLAGS.timeline_logging:
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_options.output_partition_graphs = True

            # feed_dict[weight_vec_placeholder] = ls_solution
            tf.logging.info("RUNNING SESSION... %f" % time.time())
            # The batch index goes with the first label and the epoch counter
            # with the second (the two arguments used to be swapped).
            tf.logging.info("Data batch index: %s, Current epoch idex: %s" %
                            (str(local_data_batch_idx), str(epoch_counter)))

            loss_value, step = sess.run(
                # [train_op, global_step], feed_dict={feed_dict, x}, run_metadata=run_metadata, options=run_options)
                [train_op, global_step],
                feed_dict=feed_dict,
                run_metadata=run_metadata,
                options=run_options)
            tf.logging.info("DONE RUNNING SESSION...")

            if FLAGS.worker_times_cdf_method:
                timeout_client.broadcast_worker_finished_computing_gradients(
                    cur_iteration)

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if FLAGS.timeline_logging:
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                # Bind the handle with `as f`; without it the write below
                # raised NameError.
                with open(
                        '%s/worker=%d_timeline_iter=%d.json' %
                        (FLAGS.train_dir, FLAGS.task_id, step), 'w') as f:
                    f.write(ctf)

            if step > FLAGS.max_steps:
                break

            duration = time.time() - start_time
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %f'
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

            if (is_chief and next_summary_time < time.time()
                    and FLAGS.should_summarize):
                tf.logging.info('Running Summary operation on the chief.')
                summary_str = sess.run(summary_op)
                sv.summary_computed(sess, summary_str)
                tf.logging.info('Finished running Summary operation.')
                next_summary_time += FLAGS.save_summaries_secs
            # except tf.errors.DeadlineExceededError:
            #     tf.logging.info("Killed at time %f" % time.time())
            #     sess.reset_kill()
            # except:
            #     tf.logging.info("Unexpected error: %s" % str(sys.exc_info()[0]))
            #     sess.reset_kill()

        if is_chief:
            tf.logging.info('Elapsed Time: %f' % (time.time() - begin_time))

        sv.stop()

        # The chief writes one final checkpoint after the loop ends.
        if is_chief:
            saver.save(sess,
                       os.path.join(FLAGS.train_dir, 'model.ckpt'),
                       global_step=global_step)
def testPrintString(self):
    """A Print op given a string to log keeps its input's static shape."""
    original = constant_op.constant(2.0, shape=[100, 32])
    printed = logging_ops.Print(original, ["hello"])
    self.assertEqual(original.get_shape(), printed.get_shape())
def print_op(input_,
             data=None,
             message=None,
             first_n=None,
             summarize=20,
             print_tensor_name=True,
             print_tensor_type=True,
             print_shape=True,
             summarize_indicator_vector=True,
             name=None):
    """Wraps `input_` so that evaluating it also prints it.

    Each time the returned object is accessed, `message` is printed together
    with the current value of `input_` and of every tensor in `data`.

    Args:
      input_: The Tensor/SparseTensor/TensorArray to print on evaluation.
      data: Optional list of additional tensors to print.
      message: String printed as a prefix. A falsy value becomes "".
      first_n: Only log this many times; negative numbers always log.
          Forwarded unchanged to the Print op.
      summarize: Number of tensor elements to print.
      print_tensor_name: Whether to print the tensor name.
      print_tensor_type: Whether to print the tensor type.
      print_shape: Whether to print the tensor's shape.
      summarize_indicator_vector: Whether to print the index of the first
          true value of an indicator vector (a Boolean tensor).
      name: Name given to the Print op.

    Returns:
      The Print op for a Tensor or Variable input. For a SparseTensor or
      TensorArray, a new object of the same kind constructed under a control
      dependency on the Print op.

    Raises:
      ValueError: If `input_` is not a Tensor, SparseTensor or TensorArray.
    """
    if not message:
        message = ""
    if input_ is None:
        raise ValueError("input_ must be of type "
                         "Tensor, SparseTensor or TensorArray")

    # The same formatting switches apply to `input_` and to every `data` item.
    repr_options = (print_tensor_name, print_tensor_type, print_shape,
                    summarize_indicator_vector)
    tensor_list = _get_tensor_repr(input_, *repr_options)
    if data is not None:
        for extra in data:
            tensor_list.extend(_get_tensor_repr(extra, *repr_options))

    # Dense case: the Print op itself stands in for the input.
    if isinstance(input_, (ops.Tensor, variables.Variable)):
        return logging_ops.Print(input_, tensor_list, message, first_n,
                                 summarize, name)

    is_sparse = isinstance(input_, sparse_tensor.SparseTensor)
    if not is_sparse and not isinstance(input_,
                                        tensor_array_ops.TensorArray):
        raise ValueError("input_ must be of type "
                         "Tensor, SparseTensor or TensorArray")

    # Composite case: print through an empty constant, then rebuild the
    # object under a control dependency on that Print op.
    printer = logging_ops.Print(constant_op.constant([]), tensor_list,
                                message, first_n, summarize, name)
    with ops.control_dependencies([printer]):
        if is_sparse:
            return sparse_tensor.SparseTensor(
                array_ops.identity(input_.indices),
                array_ops.identity(input_.values),
                array_ops.identity(input_.dense_shape))
        return tensor_array_ops.TensorArray(dtype=input_.dtype,
                                            handle=input_.handle,
                                            flow=input_.flow)