def __init__(self, params, model_task_name, logdir, tf_master, trial=base_trial.NoOpTrial()):
  """Construct a new BaseRunner.

  Args:
    params: Params object containing model configuration.
    model_task_name: String name of the task this runner should execute for
      multitask models only. See flag for details.
    logdir: String path to the log directory to output to.
    tf_master: String path to the master job, e.g. 'local'.
    trial: An optional hyperparameter trial. Used by Vizier studies.
  """
  # NOTE(review): the default `base_trial.NoOpTrial()` is evaluated once at
  # def time and shared across all calls — OK only if NoOpTrial is stateless;
  # confirm.
  # Work on a copy so the caller's params object is never mutated.
  p = params.Copy()
  # Set in subclasses.
  self._job_name = ''
  # The trial may override model hyperparameters (e.g. during Vizier tuning).
  self._params = trial.OverrideModelParams(p)
  # Log the fully-resolved params for debuggability/reproducibility.
  tf.logging.info('=' * 60)
  for line in self.params.ToText().split('\n'):
    tf.logging.info('%s', line)
  tf.logging.info('=' * 60)
  self._logdir = logdir
  self._tf_master = tf_master
  self._model_task_name = model_task_name
  self._trial = trial
  # If the runner is conducting a Vizier trial, scope all the variables
  # (e.g., global_step) by the trial id so that we do not share states across
  # trials.
  self._container_id = self._trial.Name()
  self._should_report_metrics = False
  # To early terminate a runner, we set max_steps here and that will trigger
  # appropriate ShouldStop behavior in the threads. This is used by Vizier
  # to early stop a trial.
  self._max_steps = None
  self.params.cluster.logdir = logdir
  self._cluster = cluster_factory.Cluster(self.params.cluster)
  self._train_dir = os.path.join(self._logdir, 'train')
  # makedirs is a no-op if the directory already exists.
  tf.io.gfile.makedirs(self._train_dir)
  # Each runner builds its ops in its own private graph.
  self._graph = tf.Graph()
  self._summary_writer = None
  self._initialize_tables = None
  self._dequeue_thread_complete = False
  early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
  # Early stopping is only enabled when a window is configured.
  self._early_stop = None
  if p.train.early_stop and p.train.early_stop.window:
    self._early_stop = early_stop.EarlyStop(p.train.early_stop)
    # Build the early-stop subgraph inside this runner's private graph.
    with self._graph.as_default():
      self._early_stop.FProp(None)
  self._init_input_ops = []
  self._SetStatusMessage('Starting ...')
def testEarlyStopDefaultIsNoOp(self):
  """With default Params, EarlyStop is fully disabled (a no-op)."""
  params = early_stop.EarlyStop.Params()
  stopper = early_stop.EarlyStop(params)
  stopper.FProp(None)
  history_cls = early_stop.MetricHistory
  # Appending to an unconfigured history must be rejected.
  appended = history_cls.ConditionalAppend(
      stopper.params.metric_history.jobname,
      stopper.params.metric_history.metric, 1, 10.0)
  stopped = stopper.Stop(None)
  self.assertFalse(appended)
  self.assertFalse(stopped)
  # No graph node is built and no histories are registered.
  self.assertIsNone(stopper._node)
  self.assertEqual(len(early_stop.MetricHistory._metric_histories_map), 0)
def __init__(self, params, model_task_name, logdir, tf_master, trial=base_trial.NoOpTrial()):
  """Construct a new BaseRunner.

  Args:
    params: Params object containing model configuration.
    model_task_name: String name of the task this runner should execute for
      multitask models only. See flag for details.
    logdir: String path to the log directory to output to.
    tf_master: String path to the master job, e.g. 'local'.
    trial: An optional hyperparameter trial. Used by Vizier studies.
  """
  # NOTE(review): the default `base_trial.NoOpTrial()` is evaluated once at
  # def time and shared across all calls — OK only if NoOpTrial is stateless;
  # confirm.
  # Work on a copy so the caller's params object is never mutated.
  p = params.Copy()
  p.add_summary = FLAGS.add_summary
  # The trial may override model hyperparameters (e.g. during Vizier tuning).
  self._params = trial.OverrideModelParams(p)
  # Log the fully-resolved params for debuggability/reproducibility.
  tf.logging.info('=' * 60)
  for line in self.params.ToText().split('\n'):
    tf.logging.info('%s', line)
  tf.logging.info('=' * 60)
  self._logdir = logdir
  self._tf_master = tf_master
  self._model_task_name = model_task_name
  self._trial = trial
  # If the runner is conducting a Vizier trial, scope all the variables
  # (e.g., global_step) by the trial id so that we do not share states across
  # trials.
  self._container_id = self._trial.Name()
  self._cluster = cluster_factory.Cluster(self.params.cluster)
  self._train_dir = os.path.join(self._logdir, 'train')
  # Each runner builds its ops in its own private graph.
  self._graph = tf.Graph()
  self._summary_writer = None
  # NOTE(review): public name here (no leading underscore), unlike the
  # sibling variant that uses `_initialize_tables` — confirm which spelling
  # external callers rely on before unifying.
  self.initialize_tables = None
  early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
  # Early stopping is only enabled when a window is configured.
  self._early_stop = None
  if p.train.early_stop and p.train.early_stop.window:
    self._early_stop = early_stop.EarlyStop(p.train.early_stop)
    # Build the early-stop subgraph inside this runner's private graph.
    with self._graph.as_default():
      self._early_stop.FProp(None)
  self._SetStatusMessage('Starting ...')
def testEarlyStoppingAscendingTfEvents(self):
  """Stop() fires from an ascending metric read out of tfevent test data."""
  event_dir = test_helper.test_src_dir_path('core/ops')
  params = early_stop.EarlyStop.Params()
  params.tolerance = 0.0
  params.window = 1000
  # Maximize 'bleu/dev' recorded under the 'testdata' job, sourced from
  # tfevent files on the local filesystem.
  params.metric_history.jobname = 'testdata'
  params.metric_history.metric = 'bleu/dev'
  params.metric_history.minimize = False
  params.metric_history.local_filesystem = True
  params.metric_history.tfevent_file = True
  early_stop.MetricHistory.SetLogdirInMetricHistories(params, event_dir)
  stopper = early_stop.EarlyStop(params)
  stopper.FProp(None)
  with self.session() as sess:
    # The canned events show no improvement within the window, so we stop.
    self.assertTrue(stopper.Stop(sess))
    self.assertEqual(stopper.best_step, 102600)
    self.assertEqual(stopper.last_step, 185200)
def testEarlyStoppingAscendingMetric(self):
  """Tests Stop() for a maximized metric with window=2 and tolerance=1.0.

  Stop() should only fire once the best step is more than `window` steps old
  and no value within `tolerance` of the best has been seen since.
  """
  logdir = tf.test.get_temp_dir()
  # Use tf.io.gfile.makedirs (non-deprecated, consistent with the runner
  # code) instead of tf.gfile.MkDir: makedirs is a no-op when the directory
  # already exists, so the test is not flaky when the shared temp dir is
  # reused within a process.
  tf.io.gfile.makedirs(os.path.join(logdir, 'decoder_dev'))
  p = early_stop.EarlyStop.Params()
  p.window = 2
  p.tolerance = 1.0
  p.metric_history.local_filesystem = True
  p.metric_history.minimize = False
  p.metric_history.jobname = 'decoder_dev'
  p.metric_history.metric = 'canonical_bleu'
  early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
  es = early_stop.EarlyStop(p)
  es.FProp(None)
  with self.session() as sess:
    jobname = es.metric_history.params.jobname
    metric = es.metric_history.params.metric
    mh = early_stop.MetricHistory

    def _Record(step, value):
      """Appends (step, value) to the history and returns Stop()."""
      mh.ConditionalAppend(jobname, metric, step, value)
      return es.Stop(sess)

    # First observation becomes the best.
    self.assertFalse(_Record(1, 0.0))
    self.assertEqual(es.best_step, 1)
    self.assertEqual(es.last_step, 1)
    # 1.0 is within tolerance of the best (0.0), so best_step is unchanged.
    self.assertFalse(_Record(2, 1.0))
    self.assertEqual(es.best_step, 1)
    self.assertEqual(es.last_step, 2)
    # 2.5 beats 0.0 by more than tolerance: new best.
    self.assertFalse(_Record(3, 2.5))
    self.assertEqual(es.best_step, 3)
    self.assertEqual(es.last_step, 3)
    # Within tolerance of the best; window not yet exceeded.
    self.assertFalse(_Record(5, 2.0))
    self.assertEqual(es.best_step, 3)
    self.assertEqual(es.last_step, 5)
    # best_step is now window+1 steps old with no improvement: stop.
    self.assertTrue(_Record(6, 1.0))
    self.assertEqual(es.best_step, 3)
    self.assertEqual(es.last_step, 6)
def testEarlyStopping(self):
  """Tests Stop() for a minimized metric with window=2 and tolerance=1.0.

  Stop() should only fire once the best step is more than `window` steps old
  and no value within `tolerance` of the best has been seen since.
  """
  logdir = tf.test.get_temp_dir()
  # Use tf.io.gfile.makedirs (non-deprecated, consistent with the runner
  # code) instead of tf.gfile.MkDir: makedirs is a no-op when the directory
  # already exists, so the test is not flaky when the shared temp dir is
  # reused within a process.
  tf.io.gfile.makedirs(os.path.join(logdir, 'eval_dev'))
  p = early_stop.EarlyStop.Params()
  p.window = 2
  p.tolerance = 1.0
  p.metric_history.local_filesystem = True
  early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
  es = early_stop.EarlyStop(p)
  es.FProp(None)
  with self.session() as sess:
    jobname = es.metric_history.params.jobname
    metric = es.metric_history.params.metric
    mh = early_stop.MetricHistory

    def _Record(step, value):
      """Appends (step, value) to the history and returns Stop()."""
      mh.ConditionalAppend(jobname, metric, step, value)
      return es.Stop(sess)

    # First observation becomes the best.
    self.assertFalse(_Record(1, 10.0))
    self.assertEqual(es.best_step, 1)
    self.assertEqual(es.last_step, 1)
    # 5.0 improves on 10.0 by more than tolerance: new best.
    self.assertFalse(_Record(2, 5.0))
    self.assertEqual(es.best_step, 2)
    self.assertEqual(es.last_step, 2)
    # 4.0 is within tolerance of the best (5.0): best_step unchanged.
    self.assertFalse(_Record(3, 4.0))
    self.assertEqual(es.best_step, 2)
    self.assertEqual(es.last_step, 3)
    # best_step is now window+1 steps old with no improvement: stop.
    self.assertTrue(_Record(5, 4.0))
    self.assertEqual(es.best_step, 2)
    self.assertEqual(es.last_step, 5)