Beispiel #1
0
    def __init__(self,
                 params,
                 model_task_name,
                 logdir,
                 tf_master,
                 trial=base_trial.NoOpTrial()):
        """Set up the state shared by every runner.

        Copies `params`, lets `trial` override the copy, logs the resulting
        configuration, builds the cluster, creates the `<logdir>/train`
        directory and, when an early-stop window is configured, builds the
        early-stop op inside the runner's graph.

        Args:
          params:  Params object containing model configuration.
          model_task_name:  String name of the task this runner should
            execute for multitask models only.  See flag for details.
          logdir:  String path to the log directory to output to.
          tf_master:  String path to the master job, e.g. 'local'.
          trial:   An optional hyperparameter trial. Used by Vizier studies.
        """
        model_params = params.Copy()
        # Subclasses are expected to overwrite the job name.
        self._job_name = ''

        self._params = trial.OverrideModelParams(model_params)

        # Dump the effective configuration between two separator lines.
        banner = '=' * 60
        tf.logging.info(banner)
        for text_line in self.params.ToText().split('\n'):
            tf.logging.info('%s', text_line)
        tf.logging.info(banner)

        self._logdir = logdir
        self._tf_master = tf_master
        self._model_task_name = model_task_name
        self._trial = trial
        # When running a Vizier trial, variables (e.g., global_step) are scoped
        # by the trial id so that state is never shared across trials.
        self._container_id = trial.Name()
        self._should_report_metrics = False

        # Setting max_steps is how a runner gets terminated early: it triggers
        # the appropriate ShouldStop behavior in the threads. Vizier relies on
        # this to early stop a trial.
        self._max_steps = None

        cluster_params = self.params.cluster
        cluster_params.logdir = logdir
        self._cluster = cluster_factory.Cluster(cluster_params)

        train_dir = os.path.join(logdir, 'train')
        self._train_dir = train_dir
        tf.io.gfile.makedirs(train_dir)

        self._graph = tf.Graph()
        self._summary_writer = None
        self._initialize_tables = None
        self._dequeue_thread_complete = False

        early_stop.MetricHistory.SetLogdirInMetricHistories(
            model_params, logdir)
        self._early_stop = None
        early_stop_params = model_params.train.early_stop
        if early_stop_params and early_stop_params.window:
            stopper = early_stop.EarlyStop(early_stop_params)
            self._early_stop = stopper
            with self._graph.as_default():
                stopper.FProp(None)

        self._init_input_ops = []

        self._SetStatusMessage('Starting ...')
  def testEarlyStopDefaultIsNoOp(self):
    """An EarlyStop built from default params never appends nor stops."""
    stopper = early_stop.EarlyStop(early_stop.EarlyStop.Params())
    stopper.FProp(None)

    history_cls = early_stop.MetricHistory
    appended = history_cls.ConditionalAppend(
        stopper.params.metric_history.jobname,
        stopper.params.metric_history.metric,
        1,
        10.0)
    should_stop = stopper.Stop(None)

    self.assertFalse(appended)
    self.assertFalse(should_stop)
    # No op node was built and no history was registered.
    self.assertIsNone(stopper._node)
    self.assertEqual(len(early_stop.MetricHistory._metric_histories_map), 0)
Beispiel #3
0
    def __init__(self,
                 params,
                 model_task_name,
                 logdir,
                 tf_master,
                 trial=base_trial.NoOpTrial()):
        """Set up the state shared by every runner.

        Copies `params`, applies the `add_summary` flag and the `trial`
        overrides to the copy, logs the resulting configuration, builds the
        cluster and, when an early-stop window is configured, builds the
        early-stop op inside the runner's graph.

        Args:
          params:  Params object containing model configuration.
          model_task_name:  String name of the task this runner should
            execute for multitask models only.  See flag for details.
          logdir:  String path to the log directory to output to.
          tf_master:  String path to the master job, e.g. 'local'.
          trial:   An optional hyperparameter trial. Used by Vizier studies.
        """
        model_params = params.Copy()
        model_params.add_summary = FLAGS.add_summary

        self._params = trial.OverrideModelParams(model_params)

        # Dump the effective configuration between two separator lines.
        banner = '=' * 60
        tf.logging.info(banner)
        for text_line in self.params.ToText().split('\n'):
            tf.logging.info('%s', text_line)
        tf.logging.info(banner)

        self._logdir = logdir
        self._tf_master = tf_master
        self._model_task_name = model_task_name
        self._trial = trial
        # When running a Vizier trial, variables (e.g., global_step) are scoped
        # by the trial id so that state is never shared across trials.
        self._container_id = trial.Name()

        self._cluster = cluster_factory.Cluster(self.params.cluster)
        self._train_dir = os.path.join(logdir, 'train')
        self._graph = tf.Graph()
        self._summary_writer = None
        self.initialize_tables = None

        early_stop.MetricHistory.SetLogdirInMetricHistories(
            model_params, logdir)
        self._early_stop = None
        early_stop_params = model_params.train.early_stop
        if early_stop_params and early_stop_params.window:
            stopper = early_stop.EarlyStop(early_stop_params)
            self._early_stop = stopper
            with self._graph.as_default():
                stopper.FProp(None)

        self._SetStatusMessage('Starting ...')
Beispiel #4
0
    def testEarlyStoppingAscendingTfEvents(self):
        """Early stop triggers on a maximized metric read from tfevent data.

        The history for job 'testdata' / metric 'bleu/dev' is looked up under
        the 'core/ops' test source directory.
        """
        events_root = test_helper.test_src_dir_path('core/ops')

        stop_params = early_stop.EarlyStop.Params()
        stop_params.window = 1000
        stop_params.tolerance = 0.0

        history_params = stop_params.metric_history
        history_params.local_filesystem = True
        history_params.minimize = False
        history_params.jobname = 'testdata'
        history_params.metric = 'bleu/dev'
        history_params.tfevent_file = True
        early_stop.MetricHistory.SetLogdirInMetricHistories(
            stop_params, events_root)

        stopper = early_stop.EarlyStop(stop_params)
        stopper.FProp(None)
        with self.session() as sess:
            self.assertTrue(stopper.Stop(sess))
            self.assertEqual(stopper.best_step, 102600)
            self.assertEqual(stopper.last_step, 185200)
Beispiel #5
0
    def testEarlyStoppingAscendingMetric(self):
        """Walks a maximized metric through a rise and fall until Stop fires.

        Each scenario row appends one (step, value) point to the metric
        history and then checks the Stop() verdict, best_step and last_step.
        """
        logdir = tf.test.get_temp_dir()
        tf.gfile.MkDir(os.path.join(logdir, 'decoder_dev'))

        stop_params = early_stop.EarlyStop.Params()
        stop_params.window = 2
        stop_params.tolerance = 1.0
        history_params = stop_params.metric_history
        history_params.local_filesystem = True
        history_params.minimize = False
        history_params.jobname = 'decoder_dev'
        history_params.metric = 'canonical_bleu'
        early_stop.MetricHistory.SetLogdirInMetricHistories(
            stop_params, logdir)

        stopper = early_stop.EarlyStop(stop_params)
        stopper.FProp(None)

        # Rows are (step, metric value, expected Stop(), expected best_step).
        # last_step is always expected to equal the step just appended.
        scenario = (
            (1, 0.0, False, 1),
            (2, 1.0, False, 1),
            (3, 2.5, False, 3),
            (5, 2.0, False, 3),
            (6, 1.0, True, 3),
        )
        with self.session() as sess:
            jobname = stopper.metric_history.params.jobname
            metric = stopper.metric_history.params.metric
            history_cls = early_stop.MetricHistory

            for step, value, expect_stop, expect_best in scenario:
                history_cls.ConditionalAppend(jobname, metric, step, value)
                check = self.assertTrue if expect_stop else self.assertFalse
                check(stopper.Stop(sess))
                self.assertEqual(stopper.best_step, expect_best)
                self.assertEqual(stopper.last_step, step)
Beispiel #6
0
    def testEarlyStopping(self):
        """Walks the default metric through a plateau until Stop fires.

        Each scenario row appends one (step, value) point to the metric
        history and then checks the Stop() verdict, best_step and last_step.
        """
        logdir = tf.test.get_temp_dir()
        tf.gfile.MkDir(os.path.join(logdir, 'eval_dev'))

        stop_params = early_stop.EarlyStop.Params()
        stop_params.window = 2
        stop_params.tolerance = 1.0
        stop_params.metric_history.local_filesystem = True
        early_stop.MetricHistory.SetLogdirInMetricHistories(
            stop_params, logdir)

        stopper = early_stop.EarlyStop(stop_params)
        stopper.FProp(None)

        # Rows are (step, metric value, expected Stop(), expected best_step).
        # last_step is always expected to equal the step just appended.
        scenario = (
            (1, 10.0, False, 1),
            (2, 5.0, False, 2),
            (3, 4.0, False, 2),
            (5, 4.0, True, 2),
        )
        with self.session() as sess:
            jobname = stopper.metric_history.params.jobname
            metric = stopper.metric_history.params.metric
            history_cls = early_stop.MetricHistory

            for step, value, expect_stop, expect_best in scenario:
                history_cls.ConditionalAppend(jobname, metric, step, value)
                check = self.assertTrue if expect_stop else self.assertFalse
                check(stopper.Stop(sess))
                self.assertEqual(stopper.best_step, expect_best)
                self.assertEqual(stopper.last_step, step)