def CreateRunners(self, jobs, logdir, trial=base_trial.NoOpTrial()):
  """Creates a list of runners based on `FLAGS.mode`.

  Args:
    jobs: a list of runner jobs.
    logdir: the directory used for logging, usually on CNS.
    trial: optional `Trial` object, used for reporting measures and early
      stopping.

  Returns:
    A list of `.BaseRunner`, one per job in `jobs`.
  """
  all_runners = []
  is_training = 'trainer' in jobs or 'trainer_client' in jobs
  for j in jobs:
    tf_master = FLAGS.tf_master
    # Ensure that decoder or evaler threads do not clobber variables being
    # updated by trainer by forcing them to use independent sessions.
    if is_training and (j.startswith('decoder') or j.startswith('evaler')):
      tf_master = ''
    runner = self._CreateRunner(j, FLAGS.model_task_name, logdir, tf_master,
                                trial)
    all_runners.append(runner)
  return all_runners
def __init__(self,
             params,
             model_task_name,
             logdir,
             tf_master,
             trial=base_trial.NoOpTrial()):
  """Construct a new BaseRunner.

  Args:
    params: Params object containing model configuration.
    model_task_name: String name of the task this runner should execute for
      multitask models only. See flag for details.
    logdir: String path to the log directory to output to.
    tf_master: String path to the master job, e.g. 'local'.
    trial: An optional hyperparameter trial. Used by Vizier studies.
  """
  p = params.Copy()

  # Set in subclasses.
  self._job_name = ''

  self._params = trial.OverrideModelParams(p)
  tf.logging.info('=' * 60)
  for line in self.params.ToText().split('\n'):
    tf.logging.info('%s', line)
  tf.logging.info('=' * 60)

  self._logdir = logdir
  self._tf_master = tf_master
  self._model_task_name = model_task_name
  self._trial = trial
  # If the runner is conducting a Vizier trial, scope all the variables
  # (e.g., global_step) by the trial id so that we do not share states across
  # trials.
  self._container_id = self._trial.Name()
  self._should_report_metrics = False

  # To early terminate a runner, we set max_steps here and that will trigger
  # appropriate ShouldStop behavior in the threads. This is used by Vizier
  # to early stop a trial.
  self._max_steps = None

  self.params.cluster.logdir = logdir
  self._cluster = cluster_factory.Cluster(self.params.cluster)
  self._train_dir = os.path.join(self._logdir, 'train')
  tf.io.gfile.makedirs(self._train_dir)
  self._graph = tf.Graph()
  self._summary_writer = None
  self._initialize_tables = None
  self._dequeue_thread_complete = False

  early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
  self._early_stop = None
  if p.train.early_stop and p.train.early_stop.window:
    self._early_stop = early_stop.EarlyStop(p.train.early_stop)
    with self._graph.as_default():
      self._early_stop.FProp(None)

  self._init_input_ops = []

  self._SetStatusMessage('Starting ...')
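# For orientation, the sketch below illustrates the minimal Trial contract
# this constructor relies on (OverrideModelParams and Name). It is a
# hand-written approximation of a no-op trial like base_trial.NoOpTrial,
# not the library's actual class; treat the names and behavior as
# illustrative assumptions.
class NoOpTrialSketch:
  """A trial that changes nothing; stands in when no Vizier study runs."""

  def OverrideModelParams(self, params):
    # A real Vizier trial would rewrite hyperparameters here before the
    # runner snapshots them into self._params.
    return params

  def Name(self):
    # An empty name means self._container_id is '', so variables land in
    # the default container instead of a per-trial container.
    return ''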
@classmethod
def CreateRunners(cls, jobs, model_name, logdir, trial=base_trial.NoOpTrial()):
  """Creates a list of runners based on `FLAGS.mode`.

  Args:
    jobs: a list of runner jobs.
    model_name: name of a registered `ModelParams` class.
    logdir: the directory used for logging, usually on CNS.
    trial: optional `Trial` object, used for reporting measures and early
      stopping.

  Returns:
    A list of `.BaseRunner`, one per job in `jobs`.
  """
  runners = []
  for j in jobs:
    tf_master = FLAGS.tf_master
    # Ensure that decoder or evaler threads do not clobber variables being
    # updated by trainer by forcing them to use independent sessions.
    if ('trainer' in jobs and
        (j.startswith('decoder') or j.startswith('evaler'))):
      tf_master = ''
    runner = cls._CreateRunner(j, model_name, FLAGS.model_task_name, logdir,
                               tf_master, trial)
    runners.append(runner)
  return runners
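# A minimal driver sketch showing how CreateRunners might be invoked and
# each runner started on its own thread. RunnerManager, the FLAGS.job /
# FLAGS.model / FLAGS.logdir flags, and the Start() method are assumptions
# drawn from how the runners above are constructed, not the codebase's
# exact entry point.
import threading


def main(unused_argv):
  jobs = FLAGS.job.split(',')  # Hypothetical flag, e.g. 'trainer,evaler_dev'.
  runners = RunnerManager.CreateRunners(jobs, FLAGS.model, FLAGS.logdir)
  threads = []
  for runner in runners:
    # Each runner blocks in Start(), so give it a daemon thread.
    t = threading.Thread(target=runner.Start)
    t.daemon = True
    t.start()
    threads.append(t)
  for t in threads:
    t.join()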
def __init__(self,
             params,
             model_task_name,
             logdir,
             tf_master,
             trial=base_trial.NoOpTrial()):
  """Construct a new BaseRunner.

  Args:
    params: Params object containing model configuration.
    model_task_name: String name of the task this runner should execute for
      multitask models only. See flag for details.
    logdir: String path to the log directory to output to.
    tf_master: String path to the master job, e.g. 'local'.
    trial: An optional hyperparameter trial. Used by Vizier studies.
  """
  p = params.Copy()
  p.add_summary = FLAGS.add_summary

  self._params = trial.OverrideModelParams(p)
  tf.logging.info('=' * 60)
  for line in self.params.ToText().split('\n'):
    tf.logging.info('%s', line)
  tf.logging.info('=' * 60)

  self._logdir = logdir
  self._tf_master = tf_master
  self._model_task_name = model_task_name
  self._trial = trial
  # If the runner is conducting a Vizier trial, scope all the variables
  # (e.g., global_step) by the trial id so that we do not share states across
  # trials.
  self._container_id = self._trial.Name()

  self._cluster = cluster_factory.Cluster(self.params.cluster)
  self._train_dir = os.path.join(self._logdir, 'train')
  self._graph = tf.Graph()
  self._summary_writer = None
  self.initialize_tables = None

  early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
  self._early_stop = None
  if p.train.early_stop and p.train.early_stop.window:
    self._early_stop = early_stop.EarlyStop(p.train.early_stop)
    with self._graph.as_default():
      self._early_stop.FProp(None)

  self._SetStatusMessage('Starting ...')
def __init__(self, *args, **kwargs):
  super(BaseTrainerTest, self).__init__(*args, **kwargs)
  self._trial = base_trial.NoOpTrial()
def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)
  self._trial = base_trial.NoOpTrial()
def __init__(self,
             params,
             model_task_name='',
             logdir='',
             tf_master='',
             trial=base_trial.NoOpTrial()):
  """A job runner.

  Args:
    params: Params object containing model configuration.
    model_task_name: String name of the task this runner should execute for
      multitask models only. See flag for details.
    logdir: String path to the log directory to output to.
    tf_master: String path to the master job, e.g. 'local'.
    trial: An optional hyperparameter trial. Used by Vizier studies.
  """
  self._params = trial.OverrideModelParams(params.Copy())
  p = self.params
  self._model_task_name = model_task_name
  self._logdir = logdir
  self._train_dir = os.path.join(self._logdir, 'train')
  tf.io.gfile.makedirs(self._train_dir)
  self._tf_master = tf_master
  self._trial = trial
  # If the runner is conducting a Vizier trial, scope all the variables
  # (e.g., global_step) by the trial id so that we do not share states across
  # trials.
  self._container_id = self._trial.Name()

  # Set in subclasses.
  self._job_name = ''
  self._daemon = False
  self._verbose_enqueue_logging = False

  self._checkpointer = None
  self._should_report_metrics = False

  if py_utils.IsEagerMode():
    self._graph = None
  else:
    self._graph = tf.Graph()
  self._summary_writer = None
  self._initialize_tables = None
  self._dequeue_thread_complete = False

  self._early_stop = None  # The actual EarlyStop object.
  if p.train.early_stop and p.train.early_stop.window:
    early_stop.MetricHistory.SetLogdirInMetricHistories(p, self._logdir)
    self._early_stop = p.train.early_stop.Instantiate()
    self._verbose_enqueue_logging = True

  # Merged TF scalar summaries for training related input data stats.
  self._merged_input_data_summary_op = None

  # To early terminate a runner, we set max_steps here and that will trigger
  # appropriate ShouldStop behavior in the threads. This is used by Vizier
  # to early stop a trial and also EarlyStop to stop training based on
  # metrics.
  self._max_steps_for_early_stop = None

  self.enqueue_ops = None

  tf.logging.info('=' * 60)
  for line in self.params.ToText().split('\n'):
    tf.logging.info('%s', line)
  tf.logging.info('=' * 60)

  self._SetStatusMessage('Starting ...')

  self.params.cluster.logdir = logdir
  self._cluster = cluster_factory.Cluster(self.params.cluster)
  self._worker_cluster_def = self._cluster.worker_cluster_def
  if py_utils.IsEagerMode():
    self._cluster.InitDevicesEager()
  else:
    self._cluster.InitDevices(self._GetSession())

  # Ensure global step tensor is created.
  with contextlib.ExitStack() as stack:
    if not py_utils.IsEagerMode():
      stack.enter_context(self._graph.as_default())
      stack.enter_context(tf.device(self._cluster.GetPlacer()))
    # It is important that we enter the tf.container scope *after*
    # the graph scope. If we reverse the ordering, the tf.container
    # basically has no-effect which is a tricky silent error.
    stack.enter_context(tf.container(self._container_id))
    self._global_step_var = py_utils.GetOrCreateGlobalStepVar()
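# The ordering caveat in the comment above is easy to reproduce in
# isolation: tf.container() scopes the *current default* graph, so entering
# it before graph.as_default() silently scopes the wrong graph. A minimal,
# standalone sketch using the TF1 compat API; the graph and container names
# are hypothetical.
import tensorflow.compat.v1 as tf

g = tf.Graph()

# Correct ordering: the graph scope is entered first, so the container
# scope applies to `g` and the variable is placed in container 'trial_7'.
with g.as_default(), tf.container('trial_7'):
  v = tf.get_variable('step', shape=[], dtype=tf.int64,
                      initializer=tf.zeros_initializer())
assert v.graph is g

# Reversed ordering: tf.container() binds to whatever graph is default
# *here* (the outer one), so it has no effect on variables subsequently
# created inside `h` -- they end up in h's default container.
h = tf.Graph()
with tf.container('trial_7'), h.as_default():
  w = tf.get_variable('step', shape=[], dtype=tf.int64,
                      initializer=tf.zeros_initializer())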