def _Loop(self):
  """Runs the executor loop inside the cluster/container/session contexts.

  Initializes the TPU system (passing the TPU embedding config when one is
  set), restores and compiles every program, runs the table, local-variable
  and load ops, and then repeatedly runs a program schedule until either
  `_ShouldStop` returns true or the schedule itself reports completion.
  """
  with self._cluster, tf.container(self._container_id), self._GetSession(
      cluster_def=self._cluster_def,
      disable_meta_optimizer=FLAGS.disable_meta_optimizer_in_executor
  ) as sess:
    if self._tpu_embedding is not None:
      embedding_config = self._tpu_embedding.config_proto
    else:
      embedding_config = None
    sess.run(tf.tpu.initialize_system(embedding_config=embedding_config))

    # Restore (if needed) and compile each program before running any step.
    for prog in self._programs:
      prog.RestoreIfNeeded(sess)
      prog.Compile(sess)
    sess.run(self._initialize_tables)
    sess.run(self._initialize_local_vars)
    sess.run(self._load_ops)

    while True:
      step_before_run = sess.run(py_utils.GetGlobalStep())
      if self._ShouldStop(sess, step_before_run):
        tf.logging.info('Training finished.')
        if not self._ml_perf_log:
          self.save_only_checkpointer.Save(sess, step_before_run)
        return

      # An explicitly selected task pins the schedule; otherwise sample one.
      if self._single_task_mode or self._model_task_name:
        tf.logging.info('Single task mode: %s', self._model_task_name)
        task_name = self._model_task_name
      else:
        task_name = self.task_scheduler.Sample(step_before_run)
        tf.logging.info('Sampled %s', task_name)
      schedule = self._program_schedule_dict[task_name]

      if schedule.Run(sess):
        tf.logging.info('Program schedule told us to stop.')
        return

      if self._ml_perf_log:
        continue

      # `step_before_run` is a plain value fetched before the schedule ran,
      # so the global step variable is already ahead of it by
      # (train_executions_per_eval * train_steps_per_loop) steps. Hand the
      # checkpointer the global step tensor instead of the stale value.
      tf.logging.info('Retrieve params.')
      sess.run(self._retrieve_ops)
      tf.logging.info('Retrieve params done.')
      self.save_only_checkpointer.MaybeSave(sess, py_utils.GetGlobalStep())
def _Loop(self):
  """Runs the executor loop inside the container and session contexts.

  Restores and compiles every program, runs the table and local-variable
  initializers, and then repeatedly runs a program schedule until either
  `_ShouldStop` returns true or the schedule itself reports completion.
  """
  with tf.container(self._container_id), self._GetSession(
      cluster_def=self._cluster_def,
      disable_meta_optimizer=FLAGS.disable_meta_optimizer_in_executor
  ) as sess:
    # Restore (if needed) and compile each program before running any step.
    for prog in self._programs:
      prog.RestoreIfNeeded(sess)
      prog.Compile(sess)
    sess.run(self._initialize_tables)
    sess.run(self._initialize_local_vars)

    while True:
      step_before_run = sess.run(py_utils.GetGlobalStep())
      if self._ShouldStop(sess, step_before_run):
        tf.logging.info('Training finished.')
        if not self._ml_perf_log:
          self.save_only_checkpointer.Save(sess, step_before_run)
        return

      # An explicitly selected task pins the schedule; otherwise sample one.
      if self._single_task_mode or self._model_task_name:
        tf.logging.info('Single task mode: %s', self._model_task_name)
        task_name = self._model_task_name
      else:
        task_name = self.task_scheduler.Sample(step_before_run)
        tf.logging.info('Sampled %s', task_name)
      schedule = self._program_schedule_dict[task_name]

      if schedule.Run(sess):
        tf.logging.info('Program schedule told us to stop.')
        return

      # TODO(blee): More complex saving rules. Currently, we assume
      # we save after every task's program schedule execution.
      #
      # `step_before_run` is a plain value fetched before the schedule ran,
      # so the global step variable is already ahead of it by
      # (train_executions_per_eval * train_steps_per_loop) steps. Hand the
      # checkpointer the global step tensor instead of the stale value.
      if not self._ml_perf_log:
        self.save_only_checkpointer.Save(sess, py_utils.GetGlobalStep())
def Value(self):
  """Returns `p.decay` raised to the number of completed decay periods.

  The number of periods is floor(global_step / p.num_steps_per_decay).
  """
  p = self.params
  step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
  periods_elapsed = tf.floor(tf.div(step, float(p.num_steps_per_decay)))
  return tf.pow(p.decay, periods_elapsed)
def Run(self, sess):
  """Runs one decode loop and writes the resulting summaries.

  Restores the checkpoint if needed, starts the infeed loop on the infeed
  pool, runs `self.metrics` for `self._steps_per_loop` steps while
  post-processing each result into the decoder metrics, and finally writes
  the metric summaries plus an 'examples/sec' rate.

  Args:
    sess: the session used to run the ops.
  """
  tf.logging.info('Executing decode program for %s.', self._task_name)
  self._checkpointer.RestoreIfNeeded(sess)
  step_at_start = sess.run(py_utils.GetGlobalStep())

  infeed = self._infeed_pool.apply_async(self._InfeedLoop, args=(sess,))
  dec_metrics = self._model_task.CreateDecoderMetrics()
  t0 = time.time()
  for step_idx in range(self._steps_per_loop):
    fetched = sess.run(self.metrics)
    self._model_task.PostProcessDecodeOut(fetched, dec_metrics)
    tf.logging.info(
        'step: %d %f' %
        (step_idx, dec_metrics['num_samples_in_batch'].total_value))
  infeed.wait()

  samples_metric = dec_metrics['num_samples_in_batch']
  summaries = {}
  for name, metric in six.iteritems(dec_metrics):
    summaries[name] = metric.Summary(name)
  elapsed_secs = time.time() - t0
  rate = samples_metric.total_value / elapsed_secs
  rate_value = tf.Summary.Value(tag='examples/sec', simple_value=rate)
  summaries['examples/sec'] = tf.Summary(value=[rate_value])
  self._WriteSummaries(
      os.path.basename(self._program_dir), step_at_start, summaries)
def Run(self, sess):
  """Runs one train loop, records metrics, and reports progress.

  Starts the infeed loop on the infeed pool, runs the TPU ops, waits for the
  infeed to finish, then records the step rate, summarizes every eval metric,
  logs a one-line progress message, and forwards the results to the task.

  Args:
    sess: the session used to run the ops.
  """
  tf.logging.info('Executing train program for %s.', self._task_name)
  gsteps = py_utils.GetGlobalStep()

  infeed_future = self._infeed_pool.apply_async(
      self._InfeedLoop, args=(sess,))
  ary = sess.run(self.tpu_ops)
  infeed_future.wait()

  values = ary[0]
  outfeeds = ary[1]
  eval_metrics = self._eval_metrics.PackMetricsValues(values)

  task = self._model.GetTask()
  global_step, total_examples = sess.run([gsteps, task.total_examples_var])
  self._RecordStepRate(global_step, total_examples)

  msg_parts = ['step:%6d' % global_step]
  for key, (val, _) in sorted(six.iteritems(eval_metrics)):
    msg_parts.append('%s:%.8g' % (key, val))
    self._SummarizeValue(global_step, key, val)
  # The progress message used to be assembled and then dropped; emit it.
  # Passed as an argument so '%' characters in metric names stay literal.
  tf.logging.info('%s', ' '.join(msg_parts))

  task.ProcessFPropResults(sess, global_step, eval_metrics, outfeeds)
def _GetSaveableVariablesDict(models):
  """Collects the variables of `models` that should be checkpointed.

  Args:
    models: a list of lingvo model objects.

  Returns:
    A dict mapping variable names (with a trailing `:0` stripped) to the
    variables, plus a 'global_step' entry holding the global step.

  Raises:
    RuntimeError: if there are variables with shared name.
  """
  merged = {}
  for model in models:
    merged = py_utils.MergeDictsWithValueCheck(merged,
                                               model.GetVariablesDict())

  # Strip ':0' from variable names to stay backwards compatible with graph
  # mode checkpoint keys.
  saveable = {
      (name[:-2] if name.endswith(':0') else name): merged[name]
      for name in merged
  }
  saveable['global_step'] = py_utils.GetGlobalStep()
  return saveable
def _verify_timestep_counts(self, num_splits):
  """Checks the gradient norm and accumulator counts of the dummy pipeline CNN.

  Args:
    num_splits: number of splits passed to `_BuildDummyPipelineCnn`.
  """
  num_micro_batches = 8
  batch_size = 16
  with self.session(graph=tf.Graph()) as sess:
    py_utils.GetGlobalStep()
    tf.set_random_seed(1245)
    inputs = tf.random_uniform([batch_size, 8, 8, 1])
    net = _BuildDummyPipelineCnn(
        num_splits=num_splits, num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    # FProp may return either bare logits or a (logits, aux_logits) pair.
    if not isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints, None
    else:
      logits, aux_logits = endpoints
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    self.assertNear(grad_norm_val, 0.269997, err=1.0e-6)

    # Accumulator values should be equal to number of time steps in pipeline.
    expected_ts = num_micro_batches if num_splits > 1 else 1
    for observed_ts in list(ts_vals):
      self.assertEqual(observed_ts, expected_ts)

    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
def Value(self):
  """Returns the value of the sub-schedule active at the current step.

  The global step is reduced modulo `self._period` and used to pick, via
  `py_utils.PiecewiseConstant`, one of the sub-schedule values according to
  `self._boundaries`.
  """
  schedule_values = [sched.Value() for sched in self.schedules]
  step_in_period = tf.math.mod(py_utils.GetGlobalStep(), self._period)
  return py_utils.PiecewiseConstant(step_in_period, self._boundaries,
                                    schedule_values,
                                    schedule_values[0].dtype)
def Run(self, sess):
  """Runs one train loop and summarizes step rate and eval metrics.

  Args:
    sess: the session used to run the ops.
  """
  tf.logging.info('Executing train program for %s.', self._task_name)
  gsteps = py_utils.GetGlobalStep()

  infeed = self._infeed_pool.apply_async(self._InfeedLoop, args=(sess,))
  tpu_results = sess.run(self.tpu_ops)
  infeed.wait()

  metric_values, outfeeds = tpu_results[0], tpu_results[1]
  eval_metrics = self._eval_metrics.PackMetricsValues(metric_values)

  step_now = sess.run(gsteps)
  batch_samples = eval_metrics['num_samples_in_batch'][0]
  step_rate, example_rate = self._step_rate_tracker.ComputeStepRate(
      step_now, batch_samples)
  self._SummarizeValue(step_now, 'global_step/sec', step_rate)
  self._SummarizeValue(step_now, 'examples/sec', example_rate)

  for name, (metric_val, _) in sorted(six.iteritems(eval_metrics)):
    self._SummarizeValue(step_now, name, metric_val)

  task = self._model.GetTask()
  task.ProcessFPropResults(sess, step_now, eval_metrics, outfeeds)
def Apply(self, lr, var_grad):
  """Accumulates gradients and applies them every `p.accum_steps` steps.

  Each gradient is added into a non-trainable 'grad_accumulator' variable
  created under the variable's own scope. On steps where
  `global_step % accum_steps == accum_steps - 1` the wrapped optimizer is
  applied to the accumulated gradients scaled by `1 / accum_steps`, and the
  accumulators are then reset to zero; on all other steps nothing is applied.

  Args:
    lr: learning rate forwarded to the wrapped optimizer.
    var_grad: structure of (variable, gradient) pairs supporting `Transform`
      and `Flatten`.

  Returns:
    The `tf.cond` op selecting between apply-and-reset and a no-op.
  """
  p = self.params

  def _Accumulate(var_and_grad):
    """Adds the gradient into the variable's accumulator."""
    var, grad = var_and_grad
    with tf.variable_scope(var.op.name):
      accum_params = py_utils.WeightParams(
          var.get_shape(), py_utils.WeightInit.Constant(0.0), p.dtype)
      _, accum = py_utils.CreateVariable(
          'grad_accumulator', accum_params, trainable=False)
      accum = tf.assign_add(accum, grad)
    return var, accum

  var_grad = var_grad.Transform(_Accumulate)

  def _ApplyAndReset():
    """Applies the averaged gradients, then zeroes the accumulators."""
    averaged = py_utils.ApplyGradMultiplier(var_grad, 1. / p.accum_steps)
    apply_op = self._opt.Apply(lr, averaged)
    with tf.control_dependencies([apply_op]):
      resets = [
          tf.assign(accum, tf.zeros_like(accum))
          for _, accum in var_grad.Flatten()
      ]
      return tf.group(*resets)

  def _Skip():
    return tf.group(tf.no_op())

  is_apply_step = tf.equal(
      tf.mod(py_utils.GetGlobalStep(), p.accum_steps), p.accum_steps - 1)
  return tf.cond(is_apply_step, _ApplyAndReset, _Skip)
def InstantiateVariables(self):
  """Create variables for this layer and child layers.

  DO NOT OVERRIDE. Override self._CreateLayerVariables instead.

  The call is a no-op unless the creation status is still NOT_CALLED. While
  it runs, this layer is pushed onto `_CREATE_VARIABLES_STACK`; the layer
  that leaves the stack empty afterwards runs `_VerifyVarsAndTheta`.
  """
  # Only the first call does any work; later calls return immediately.
  if self._create_variables_status != _CreateLayerVariablesStatus.NOT_CALLED:
    return
  self._create_variables_status = _CreateLayerVariablesStatus.IN_PROGRESS

  # Remember the stack depth so the finally-clause can check it is restored.
  stack_size = len(_CREATE_VARIABLES_STACK.stack)
  _CREATE_VARIABLES_STACK.stack.append(self)
  try:
    self._global_step = py_utils.GetGlobalStep()
    self._CreateChildrenVariables()

    if not self._is_variable_free:
      self.AddExtraTheta('global_step', self._global_step)
      with self._SelfVariableScope():
        # Iterate over a snapshot of the pending variables.
        for name, meta in list(self._variables_to_create.items()):
          self._CreateVariableInternal(name, meta)
        self._CreateLayerVariables()
  finally:
    # Pop this layer even on failure, checking the stack is balanced.
    assert _CREATE_VARIABLES_STACK.stack[-1] is self
    _CREATE_VARIABLES_STACK.stack.pop()
    assert len(_CREATE_VARIABLES_STACK.stack) == stack_size

  self._create_variables_status = _CreateLayerVariablesStatus.COMPLETED

  if not _CREATE_VARIABLES_STACK.stack:
    # Outermost layer just finished InstantiateVariables.
    self._VerifyVarsAndTheta()
def __init__(self, params):
  """Layer constructor.

  Sub-classes of BaseLayer should decorate their __init__ with
  @base_layer.initializer

  Args:
    params: A params used to construct this layer. Must have a non-empty
      `name`; the layer keeps its own copy.
  """
  assert params.name, (
      'Layer params for %s must have a "name"' % self.__class__.__name__)
  self._params = params.Copy()
  tf.logging.debug('Creating layer %s with params: \n %s \n',
                   self.__class__.__name__, str(params))

  # Variables owned by this layer, and the theta derived from them.
  self._private_vars = py_utils.NestedMap()
  self._private_theta = py_utils.NestedMap()

  # Children registered through CreateChild/CreateChildren.
  self._private_children = py_utils.NestedMap()
  # Every child layer created by this layer. In a well-formed layer this
  # matches self._private_children, i.e. all children were created through
  # CreateChild/CreateChildren.
  self._children_list = []

  # Extra thetas that do not directly correspond to any underlying variable,
  # e.g. the concatenation of sharded variables.
  self._extra_theta = py_utils.NestedMap()

  # All registered accumulators.
  self._private_accumulators = py_utils.NestedMap()

  # Layer-private functions. Add with AddFunction.
  self._private_fns = {}

  self.AddExtraTheta('global_step', py_utils.GetGlobalStep())
def testDecoderFPropDeterministicAttentionDropout(self):
  """Verify that attention dropout is deterministic given fixed seeds."""
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    tf.set_random_seed(8372749040)
    noise_params = py_utils.VariationalNoiseParams(
        None, True, False, seed=1792)
    p = self._DecoderParams(noise_params)
    p.use_while_loop_based_unrolling = False
    p.attention.atten_dropout_prob = 0.5
    p.attention.atten_dropout_deterministic = True

    loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
    global_step = py_utils.GetGlobalStep()
    tf.global_variables_initializer().run()

    def _RunAndCheck(want_loss, want_per_sequence_loss, want_step):
      """Runs one FProp and compares the results with the expected values."""
      loss_val, per_sequence_loss_val, global_steps_val = sess.run(
          [loss, per_sequence_loss, global_step])
      print('loss = ', loss_val, 'per sequence loss = ',
            per_sequence_loss_val)
      self.assertAllClose(want_loss, loss_val)
      self.assertAllClose(want_per_sequence_loss, per_sequence_loss_val)
      self.assertEqual(want_step, global_steps_val)

    _RunAndCheck([3.587372, 15.0],
                 [14.171288, 9.965696, 10.221684, 19.451914], 0)

    # Run another step to test global_step and time_step are incremented
    # correctly.
    sess.run(tf.assign_add(global_step, 1))
    _RunAndCheck([3.626164, 15.0],
                 [14.70993, 10.572938, 10.516836, 18.592758], 1)
def Run(self, sess):
  """Runs one decode loop, writes summaries, and finalizes decoder output.

  Decode outputs returned by `PostProcessDecodeOut` are buffered across the
  loop and handed to the task's `DecodeFinalize` together with an output
  path derived from the global step read at the start of the run.

  Args:
    sess: the session used to run the ops.
  """
  tf.logging.info('Executing decode program for %s.', self._task_name)
  step_at_start = sess.run(py_utils.GetGlobalStep())

  infeed = self._infeed_pool.apply_async(self._InfeedLoop, args=(sess,))
  dec_metrics = self._model_task.CreateDecoderMetrics()
  t0 = time.time()
  collected_out = []
  for step_idx in range(self._steps_per_loop):
    fetched = sess.run(self.metrics)
    step_out = self._model_task.PostProcessDecodeOut(fetched, dec_metrics)
    tf.logging.info(
        'step: %d %f' %
        (step_idx, dec_metrics['num_samples_in_batch'].total_value))
    if step_out:
      collected_out.extend(step_out)
  infeed.wait()

  samples_metric = dec_metrics['num_samples_in_batch']
  summaries = {}
  for name, metric in six.iteritems(dec_metrics):
    summaries[name] = metric.Summary(name)
  elapsed_secs = time.time() - t0
  rate = samples_metric.total_value / elapsed_secs
  rate_value = tf.Summary.Value(tag='examples/sec', simple_value=rate)
  summaries['examples/sec'] = tf.Summary(value=[rate_value])
  self._WriteSummaries(
      os.path.basename(self._program_dir), step_at_start, summaries)

  out_path = os.path.join(self._program_dir,
                          'decoder_out_%09d' % step_at_start)
  finalize_args = base_model.DecodeFinalizeArgs(
      decode_out_path=out_path, decode_out=collected_out)
  self._model_task.DecodeFinalize(finalize_args)
def __init__(self, params):
  """Layer constructor.

  Sub-classes of BaseLayer should decorate their __init__ with
  @base_layer.initializer

  Args:
    params: A params used to construct this layer. Must have a non-empty
      `name`; the layer keeps its own copy.
  """
  assert params.name, (
      'Layer params for %s must have a "name"' % self.__class__.__name__)

  # Build the module name: every run of characters outside [a-zA-Z0-9_] in
  # the layer name collapses to a single underscore.
  sanitized_name = re.sub('[^a-zA-Z0-9_]+', '_', params.name)
  tf_module_name = 'bbf_' + self.__class__.__name__ + '_' + sanitized_name
  py_utils.NestedMap.CheckKey(tf_module_name)

  # Initialize the base class.
  super(BaseLayer, self).__init__(tf_module_name)

  # Note AutoTracking doesn't work properly due to its inability to walk
  # through py_utils.NestedMap data structures which are used widely
  # throughout the Lingvo codebase. Also there seems to be some performance
  # hit in turning on auto-tracking in constructing graphs. For now, we
  # disable auto-tracking.
  # TODO(lingvo): Re-enable auto-tracking when fuller support is
  # added for key data structures used in Lingvo, and performance issue is
  # debugged more and understood better.
  self._setattr_tracking = False

  # The parent is the second entry from the top of the layer stack, if any.
  if len(_LAYER_STACK.layer_stack) > 1:
    self._parent = _LAYER_STACK.layer_stack[-2]
  else:
    self._parent = None
  assert self._parent is not self

  self._params = params.Copy()
  tf.logging.debug('Creating layer %s with params: \n %s \n',
                   self.__class__.__name__, str(params))

  # Variables owned by this layer, and the theta derived from them.
  self._private_vars = py_utils.NestedMap()
  self._private_theta = py_utils.NestedMap()

  # Children registered through CreateChild/CreateChildren.
  self._private_children = py_utils.NestedMap()
  # Every child layer created by this layer. In a well-formed layer this
  # matches self._private_children, i.e. all children were created through
  # CreateChild/CreateChildren.
  self._children_list = []

  # Extra thetas that do not directly correspond to any underlying variable,
  # e.g. the concatenation of sharded variables.
  self._extra_theta = py_utils.NestedMap()

  # All registered accumulators.
  self._private_accumulators = py_utils.NestedMap()

  # Layer-private functions. Add with AddFunction.
  self._private_fns = {}

  # Mapping from variable names to its symbolic shape.
  # self._var_symbolic_shape_map['var_name'] will be a tuple of integers or
  # symbolic expressions, one for each dimension of the variable.
  self._var_symbolic_shape_map = {}

  self.AddExtraTheta('global_step', py_utils.GetGlobalStep())
def Run(self, sess):
  """Runs one train loop and summarizes rates and eval metrics.

  Args:
    sess: the session used to run the ops.

  Returns:
    Always False.
  """
  gsteps = py_utils.GetGlobalStep()
  step_at_start = sess.run(gsteps)
  self.SetStatusMessage('Executing train program at step %d' % step_at_start)

  infeed = self._infeed_pool.apply_async(self._InfeedLoop, args=(sess,))
  tpu_results = sess.run(self.tpu_ops)
  infeed.wait()

  metric_values, outfeeds = tpu_results[0], tpu_results[1]
  self._eval_metrics.PackMetricsValues(metric_values)
  eval_metrics = self._eval_metrics.metrics

  # Re-read the step: the TPU ops above have advanced it.
  step_now = sess.run(gsteps)
  loop_samples = (
      eval_metrics['num_samples_in_batch'][0] * self._steps_per_loop)
  step_rate, example_rate, total_examples = (
      self._step_rate_tracker.ComputeStepRate(step_now, loop_samples))
  self._SummarizeValue(step_now, 'global_step/sec', step_rate)
  self._SummarizeValue(step_now, 'examples/sec', example_rate)
  self._SummarizeValue(step_now, 'total_samples', total_examples)

  for name, (metric_val, _) in sorted(six.iteritems(eval_metrics)):
    self._SummarizeValue(step_now, name, metric_val)

  task = self._model.GetTask()
  task.ProcessFPropResults(sess, step_now, eval_metrics, outfeeds)
  return False
def ApplyPruning(cls, pruning_hparams_dict, lstmobj, wm_pc, dtype, scope):  # pylint:disable=invalid-name
  """Applies customized LSTM matrix compression, setting up pruning lazily.

  `cls.Setup` is called with the global step only when `cls._pruning_obj`
  is not yet set.

  Args:
    pruning_hparams_dict: pruning hyper-parameters forwarded to `cls.Setup`.
    lstmobj: forwarded to `apply_customized_lstm_matrix_compression`.
    wm_pc: object whose `shape` is forwarded to the compression call.
    dtype: forwarded to the compression call.
    scope: forwarded to the compression call.

  Returns:
    The result of `apply_customized_lstm_matrix_compression`.
  """
  needs_setup = not cls._pruning_obj
  if needs_setup:
    cls.Setup(pruning_hparams_dict, global_step=py_utils.GetGlobalStep())
  weight_shape = wm_pc.shape
  return apply_customized_lstm_matrix_compression(
      cls._pruning_obj, py_utils.WeightParams, py_utils.WeightInit, lstmobj,
      weight_shape, dtype, scope)
def Value(self):
  """Returns `multiplier * rsqrt(max(step - offset, warmup_steps))`."""
  p = self.params
  step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
  effective_step = tf.maximum(step - p.offset, p.warmup_steps)
  return tf.math.rsqrt(effective_step) * p.multiplier
def Run(self, sess):
  """Runs one decode loop with optional MLPerf logging.

  Besides writing summaries and finalizing the buffered decoder output, this
  emits MLPerf 'eval_start', 'eval_stop' and 'eval_accuracy' records when
  `self._ml_perf_log` is set, and a 'run_stop' record once the configured
  decoder metric exceeds its success threshold.

  Args:
    sess: the session used to run the ops.
  """
  tf.logging.info('Executing decode program for %s.', self._task_name)
  step_at_start = sess.run(py_utils.GetGlobalStep())

  if self._ml_perf_log:
    steps_per_epoch = self._ml_perf.steps_per_epoch
    epoch = int(step_at_start) // steps_per_epoch
    mlp_log.mlperf_print(
        'eval_start', None, metadata={'epoch_num': (epoch + 1)})

  infeed = self._infeed_pool.apply_async(self._InfeedLoop, args=(sess,))
  dec_metrics = self._model_task.CreateDecoderMetrics()
  t0 = time.time()
  collected_out = []
  for step_idx in range(self._steps_per_loop):
    fetched = sess.run(self.metrics)
    step_out = self._model_task.PostProcessDecodeOut(fetched, dec_metrics)
    tf.logging.info(
        'step: %d %f' %
        (step_idx, dec_metrics['num_samples_in_batch'].total_value))
    if step_out:
      collected_out.extend(step_out)
  infeed.wait()

  if self._ml_perf_log:
    mlp_log.mlperf_print(
        'eval_stop', None, metadata={'epoch_num': (epoch + 1)})

  samples_metric = dec_metrics['num_samples_in_batch']
  summaries = {}
  for name, metric in six.iteritems(dec_metrics):
    summaries[name] = metric.Summary(name)
  elapsed_secs = time.time() - t0
  rate = samples_metric.total_value / elapsed_secs
  rate_value = tf.Summary.Value(tag='examples/sec', simple_value=rate)
  summaries['examples/sec'] = tf.Summary(value=[rate_value])
  self._WriteSummaries(
      os.path.basename(self._program_dir), step_at_start, summaries)

  out_path = os.path.join(self._program_dir,
                          'decoder_out_%09d' % step_at_start)
  finalize_args = base_model.DecodeFinalizeArgs(
      decode_out_path=out_path, decode_out=collected_out)
  self._model_task.DecodeFinalize(finalize_args)

  if self._ml_perf_log:
    metric_name = self._ml_perf.decoder_metric_name
    metric_value = dec_metrics[metric_name].value
    mlp_log.mlperf_print(
        'eval_accuracy', metric_value, metadata={'epoch_num': epoch})
    if metric_value > self._ml_perf.decoder_metric_success_threshold:
      tf.logging.info('ml_perf_final_threshold: %f exceeded',
                      self._ml_perf.decoder_metric_success_threshold)
      mlp_log.mlperf_print(
          'run_stop', None, metadata={'status': 'success'})
def Run(self, sess):
  """Runs one decode loop alongside the infeed and decode loop tasks.

  Args:
    sess: the session used to run the ops.

  Returns:
    Always False.
  """
  step_at_start = sess.run(py_utils.GetGlobalStep())
  self.SetStatusMessage(
      'Executing decode program at step %d' % step_at_start)

  # Both background loops are submitted to the infeed pool.
  infeed = self._infeed_pool.apply_async(self._InfeedLoop, args=(sess,))
  decode = self._infeed_pool.apply_async(self._DecodeLoop, args=(sess,))

  dec_metrics = self._model_task.CreateDecoderMetrics()
  t0 = time.time()
  for _ in range(self._steps_per_loop):
    fetched = sess.run(self.metrics)
    self._model_task.PostProcessDecodeOut(fetched, dec_metrics)
  decode.wait()
  infeed.wait()

  summaries = {}
  for name, metric in six.iteritems(dec_metrics):
    summaries[name] = metric.Summary(name)
  elapsed_secs = time.time() - t0
  samples_metric = dec_metrics['num_samples_in_batch']
  rate = samples_metric.total_value / elapsed_secs
  rate_value = tf.Summary.Value(tag='examples/sec', simple_value=rate)
  summaries['examples/sec'] = tf.Summary(value=[rate_value])
  self._WriteSummaries(
      os.path.basename(self._program_dir), step_at_start, summaries)
  return False
def Run(self, sess):
  """Runs one train loop, records metrics, and checkpoints.

  Restores the checkpoint if needed, starts the infeed loop on the infeed
  pool, runs the TPU ops, waits for the infeed to finish, records the step
  rate, summarizes every eval metric, logs a one-line progress message,
  forwards the results to the task, and finally saves a checkpoint
  (unconditionally when `p.always_checkpoint_after_execution` is set,
  otherwise via `MaybeSave`).

  Args:
    sess: the session used to run the ops.
  """
  p = self.params
  self._checkpointer.RestoreIfNeeded(sess)
  gsteps = py_utils.GetGlobalStep()

  infeed_future = self._infeed_pool.apply_async(
      self._InfeedLoop, args=(sess,))
  ary = sess.run(self.tpu_ops)
  # Wait for the infeed task so it cannot still be running when the next
  # Run() submits another one to the same pool.
  infeed_future.wait()

  values = ary[0]
  outfeeds = ary[1]
  eval_metrics = self._eval_metrics.PackMetricsValues(values)

  task = self._model.GetTask()
  global_step, total_examples = sess.run([gsteps, task.total_examples_var])
  self._RecordStepRate(global_step, total_examples)

  msg_parts = ['step:%6d' % global_step]
  for key, (val, _) in sorted(six.iteritems(eval_metrics)):
    msg_parts.append('%s:%.8g' % (key, val))
    self._SummarizeValue(global_step, key, val)
  # The progress message used to be assembled and then dropped; emit it.
  # Passed as an argument so '%' characters in metric names stay literal.
  tf.logging.info('%s', ' '.join(msg_parts))

  task.ProcessFPropResults(sess, global_step, eval_metrics, outfeeds)

  if p.always_checkpoint_after_execution:
    self._checkpointer.Save(sess, gsteps)
  else:
    self._checkpointer.MaybeSave(sess, gsteps)
def _Loop(self):
  """Runs the executor loop inside the container and session contexts.

  Restores every program if needed, runs the table and local-variable
  initializers, and then repeatedly runs a program schedule, saving a
  checkpoint after each schedule execution, until `_ShouldStop` returns
  true.
  """
  with tf.container(self._container_id), self._GetSession(
      cluster_def=self._cluster_def) as sess:
    # Initialize the variables first, if needed.
    for program in self._programs:
      program.RestoreIfNeeded(sess)
    sess.run(self._initialize_tables)
    sess.run(self._initialize_local_vars)

    while True:
      global_step = sess.run(py_utils.GetGlobalStep())
      if self._ShouldStop(sess, global_step):
        tf.logging.info('Training finished.')
        self.save_only_checkpointer.Save(sess, global_step)
        return

      # If a task is explicitly selected, only run the programs associated
      # with that task.
      if self._single_task_mode or self._model_task_name:
        tf.logging.info('Single task mode: %s', self._model_task_name)
        program_schedule = self._program_schedule_dict[self._model_task_name]
      else:
        # Otherwise, sample a task.
        model_task = self.task_scheduler.Sample(global_step)
        tf.logging.info('Sampled %s', model_task)
        program_schedule = self._program_schedule_dict[model_task]

      program_schedule.Run(sess)

      # TODO(blee): More complex saving rules. Currently, we assume
      # we save after every task's program schedule execution.
      #
      # `global_step` above is a plain value fetched before the schedule
      # ran, so it is stale by however many steps the schedule executed.
      # Re-read the step so the checkpoint is saved under the step the
      # variables are actually at.
      self.save_only_checkpointer.Save(
          sess, sess.run(py_utils.GetGlobalStep()))
def _LoopEnqueue(self, op, session_override=None):
  """Runs the enqueue op in a loop.

  The loop only returns once `self._dequeue_thread_complete` is set. While a
  stop condition holds (`_ShouldStop` is true or `p.train.enqueue_max_steps`
  has been reached) it sleeps for 15 seconds and re-checks instead of
  enqueuing.

  Args:
    op: the enqueue op, run once per loop iteration.
    session_override: optional session to use instead of `_GetSession()`.
  """
  p = self.params
  sess = session_override or self._GetSession()

  with tf.container(self._container_id), sess:
    if self._initialize_tables is not None:
      sess.run(self._initialize_tables)
    gsteps = py_utils.GetGlobalStep()
    # Number of enqueue iterations performed by this loop.
    local_enqueue_steps = 0

    # Global enqueue steps measures how many global steps have data enqueued
    # for already. We use this to terminate; note that the enqueue op may
    # hang in session.run if we do not terminate with this check.
    global_enqueue_steps = None

    tf.logging.info(
        'params.train.max_steps: %d, enqueue_max_steps: %d',
        p.train.max_steps, p.train.enqueue_max_steps)
    while True:
      if self._dequeue_thread_complete:
        tf.logging.info(
            'LoopEnqueue done since consuming thread is done.')
        return

      global_step = sess.run(gsteps)
      # Seed the enqueue counter from the first observed global step.
      if global_enqueue_steps is None:
        global_enqueue_steps = global_step
      # Log progress every 1000 local iterations.
      if local_enqueue_steps % 1000 == 0:
        tf.logging.info(
            'Current global_enqueue_steps: %d, '
            'local_enqueue_steps: %d, global_step: %d', global_enqueue_steps,
            local_enqueue_steps, global_step)

      if py_utils.use_tpu():
        # Round down to a whole multiple of tpu_steps_per_loop.
        global_steps_with_available_data = int(
            global_enqueue_steps // p.train.tpu_steps_per_loop *
            p.train.tpu_steps_per_loop)
      else:
        global_steps_with_available_data = global_enqueue_steps

      if (self._ShouldStop(sess, global_steps_with_available_data) or
          self._ShouldStop(sess, global_step)):
        tf.logging.info('Done. ShouldStop is True.')
        tf.logging.info('Enqueue loop sleeping')
        time.sleep(15)
        continue
      if (p.train.enqueue_max_steps > 0 and
          local_enqueue_steps >= p.train.enqueue_max_steps):
        tf.logging.info('Done. train.enqueue_max_steps reached.')
        tf.logging.info('Enqueue loop sleeping')
        time.sleep(15)
        continue
      local_enqueue_steps += 1

      # There are tpu_infeed_parallelism parallel threads enqueuing.
      # We account for all of them when updating global_enqueue_steps.
      global_enqueue_steps += p.input.tpu_infeed_parallelism

      sess.run([op])
def CreateVariables(self):
  """Create variables for this layer and child layers.

  DO NOT OVERRIDE. Override self._CreateVariables instead.

  Only the first call does any work; later calls return immediately.

  Raises:
    ValueError: if this layer is variable free but one of its children is
      not.
  """
  if self._create_variables_called:
    return
  self._create_variables_called = True

  self._global_step = py_utils.GetGlobalStep()

  if self._is_variable_free:
    # A variable-free layer must not contain children that own variables.
    for child in self._children_list:
      if not child._is_variable_free:  # pylint: disable=protected-access
        raise ValueError(
            'Variable free layer %s(%s) child %s(%s) has variables.' %
            (self.params.name, self.params.cls, child.params.name,
             child.params.cls))
  else:
    self.AddExtraTheta('global_step', self._global_step)
    self._CreateChildrenVariables()
    with tf.variable_scope(
        py_utils.SanitizeScopeKey(self.params.name),
        auxiliary_name_scope=False):
      # Iterate over a snapshot of the pending variables.
      for name, meta in list(self._variables_to_create.items()):
        self._CreateVariable(name, meta)
      self._CreateVariables()
  self._VerifyVarsAndTheta()
def BuildDataSource(self, data_source_from_file_pattern_fn):
  """Read and return input batch.

  The datasource is chosen by comparing the global step against
  `p.boundaries`: the i-th datasource is used while the step is below the
  i-th boundary, and the last datasource once every boundary is passed.

  Args:
    data_source_from_file_pattern_fn: a function to read and return input
      batch from a string file_pattern

  Returns:
    A NestedMap containing:
      data: a tuple of tf.Tensor or `.NestedMap` of tf.Tensor

  Raises:
    ValueError: inconsistent sizes between boundaries and datasource_params,
      specification of unsupported datasources, or out of order boundaries.
  """
  p = self.params
  num_sources = len(p.datasource_params)
  num_boundaries = len(p.boundaries)
  if num_sources != num_boundaries + 1:
    raise ValueError(
        'Expected p.datasource_params to have one more entry than '
        'p.boundaries. Found %d datasource_params, and %d boundaries' %
        (num_sources, num_boundaries))

  for source_params in p.datasource_params:
    if ('bprop_variable_filters' in source_params and
        any(source_params.bprop_variable_filters)):
      raise ValueError(
          'CurriculumDataSource does not support distinct '
          'bprop_variable_filters per stage.')

  adjacent_pairs = zip(p.boundaries, p.boundaries[1:])
  for position, (current, following) in enumerate(adjacent_pairs):
    if current > following:
      raise ValueError(
          'Expected p.boundaries to monotonically increase, but '
          'found %d > %d at position %d' % (current, following, position))

  global_step = py_utils.GetGlobalStep()
  datasources = [
      source_params.Instantiate() for source_params in p.datasource_params
  ]

  def GetDatasourceFn(idx):
    """Returns a builder bound to the datasource at `idx`."""

    def DatasourceFn():
      built = datasources[idx].BuildDataSource(
          data_source_from_file_pattern_fn)
      built.pop('bprop_variable_filters', None)
      return built

    return DatasourceFn

  cases = [(tf.less(global_step,
                    tf.constant(boundary, dtype=global_step.dtype)),
            GetDatasourceFn(idx))
           for idx, boundary in enumerate(p.boundaries)]

  ret = tf.case(cases, default=GetDatasourceFn(-1))
  ret.bprop_variable_filters = p.bprop_variable_filters
  return ret
def Run(self, sess):
  """Runs one combined train-and-decode loop with optional MLPerf logging.

  Args:
    sess: the session used to run the ops.

  Returns:
    True the first time the MLPerf decoder metric exceeds its success
    threshold (when the 'run_stop' record is emitted); False otherwise.
  """
  gsteps = py_utils.GetGlobalStep()
  global_step = sess.run(gsteps)
  self.dec_metrics = self._decode_model_task.CreateDecoderMetrics()
  # Start TPU program thread.
  train_future = self._train_pool.apply_async(self._TrainAndDecode,
                                              args=(sess, ))

  if self._warmup_seconds > 0:
    # The first execution of the TPU program has a warm-up
    # so we delay feeding data yet as that's when the MLPerf timing
    # starts. This way, when we actually infeed, the TPU program
    # is immediately ready to execute/dequeue data.
    tf.logging.info('Waiting before first infeed.')
    time.sleep(self._warmup_seconds)
    # Only the first Run() waits; later calls skip the warm-up delay.
    self._warmup_seconds = 0

  if self._ml_perf_log:
    # Emit 'init_stop'/'run_start' once, on the first logged run.
    if not self._run_start:
      mlp_log.mlperf_print(key='init_stop', value=None)
      self._run_start = mlp_log.mlperf_print(key='run_start', value=None)
    steps_per_epoch = self._ml_perf.steps_per_epoch
    epoch = int(global_step) // steps_per_epoch
    # A new 'block_start' record is emitted whenever the epoch advances.
    if epoch > self._ml_perf_epoch:
      self._ml_perf_epoch = epoch
      mlp_log.mlperf_print('block_start',
                           None,
                           metadata={
                               'first_epoch_num': epoch + 1,
                               'epoch_count': 1
                           })
      self.SetStatusMessage('MLPerf epoch: %d' % self._ml_perf_epoch)
  # Start infeed thread.
  infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                args=(sess, ))
  infeed_future.wait()
  train_future.wait()

  if self._ml_perf_log:
    mlp_log.mlperf_print('eval_stop',
                         None,
                         metadata={'epoch_num': (epoch + 1)})
    mlperf_metric = self._ml_perf.decoder_metric_name
    mlperf_metric_value = float(self.dec_metrics[mlperf_metric].value)
    mlp_log.mlperf_print('eval_accuracy',
                         mlperf_metric_value,
                         metadata={'epoch_num': epoch})
    if mlperf_metric_value > self._ml_perf.decoder_metric_success_threshold:
      tf.logging.info('ml_perf_final_threshold: %f exceeded',
                      self._ml_perf.decoder_metric_success_threshold)
      # 'run_stop' is recorded only once per process.
      if not self._run_stop:
        self._run_stop = mlp_log.mlperf_print(
            'run_stop', None, metadata={'status': 'success'})
        self.SetStatusMessage('MLPerf run_time: %.2f' %
                              (self._run_stop - self._run_start))
        return True
  return False
def Value(self):
  """Returns the current learning rate decay.

  Computed as `model_dim**-0.5 * min(1, step / warmup) *
  rsqrt(max(step, warmup))`.
  """
  p = self.params
  step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
  warmup = tf.cast(p.warmup_steps, tf.float32)
  warmup_ramp = tf.minimum(1.0, step / warmup)
  inverse_sqrt = tf.math.rsqrt(tf.maximum(step, warmup))
  dim_scale = p.model_dim**-0.5
  return dim_scale * warmup_ramp * inverse_sqrt
def Value(self):
  """Returns the cosine-interpolated value for the current global step.

  The value moves from `p.initial_value` at step 0 to `p.final_value` at
  `p.total_steps`, and stays at `p.final_value` afterwards.
  """
  p = self.params
  assert p.total_steps > 0
  with tf.name_scope(p.name):
    decay_gap = p.initial_value - p.final_value
    step = tf.cast(py_utils.GetGlobalStep(), tf.float32)
    # Fraction of the schedule completed, capped at 1.0.
    progress = tf.minimum(1.0, step / p.total_steps)
    cosine = tf.cos(math.pi * progress)
    return p.final_value + 0.5 * decay_gap * (1 + cosine)
def Value(self):
  """Returns the current schedule value.

  Computed as `peak * min(step / warmup, sqrt(warmup / step))`, with the
  step clamped to at least 1.
  """
  p = self.params
  clamped_step = tf.maximum(py_utils.GetGlobalStep(), 1)
  step = tf.cast(clamped_step, tf.float32)
  warmup = tf.cast(p.warmup_steps, tf.float32)
  ramp_up = step / warmup
  decay = tf.sqrt(warmup / step)
  return p.peak * tf.minimum(ramp_up, decay)
def GetOverWriteGlobalStep(graph=None):
  """Returns the overwrite global step tensor, or the regular global step.

  Args:
    graph: graph whose overwrite collection is inspected; the default graph
      is used when this is not given.

  Returns:
    The single tensor in the `_OVERWRITE_GLOBAL_STEP_COLLECTION` collection
    when it holds exactly one entry, otherwise `py_utils.GetGlobalStep()`.
  """
  target_graph = graph or tf.get_default_graph()
  overrides = target_graph.get_collection_ref(
      _OVERWRITE_GLOBAL_STEP_COLLECTION)
  if len(overrides) != 1:
    return py_utils.GetGlobalStep()
  return overrides[0]