Ejemplo n.º 1
0
  def Params(cls):
    """Builds the params for a MLPerfProgramSchedule.

    Returns:
      The `hyperparams.InstantiableParams` created for `cls`, holding the
      schedule-level settings and a nested `ml_perf` params group.
    """
    schedule_params = hyperparams.InstantiableParams(cls)

    # Schedule-level settings as (name, default, description), listed in the
    # order they are defined.
    schedule_fields = (
        ('task_dict', None, 'dataset_name -> task params'),
        ('task_name', None, 'High level task name'),
        ('logdir', None, 'Log directory'),
        ('train_program', None, 'Train program params'),
        ('train_executions_per_eval', 1, ''),
        ('dataset_names', [], 'List of all dataset names.'),
        ('num_splits_per_client', None, ''),
    )
    for field_name, default_value, description in schedule_fields:
      schedule_params.Define(field_name, default_value, description)

    schedule_params.Define('ml_perf', hyperparams.Params(),
                           'MlPerf configuration.')

    # Every MLPerf setting defaults to None, so only (name, description) is
    # listed for each of them.
    ml_perf_fields = (
        ('benchmark_name', 'Benchmark name for compliance log.'),
        ('decoder_metric_name',
         'Name of the decoder metric to report for compliance log.'),
        ('decoder_metric_success_threshold',
         'Benchmark run must exceed this value to succeeed.'),
        ('steps_per_epoch', 'Number of training steps per epoch.'),
        ('global_batch_size', 'Global batch size.'),
        ('max_sequence_length', 'Maximum sequence length.'),
        ('optimizer_name', 'Optimizer used.'),
        ('opt_adam_beta_1', 'beta_1 used by Adam optimizer.'),
        ('opt_adam_beta_2', 'beta_2 used by Adam optimizer.'),
        ('opt_adam_epsilon', 'epsilon used by Adam optimizer.'),
        ('base_learning_rate', 'Base learning rate.'),
        ('warmup_steps', 'Number of warm-up steps.'),
        ('train_samples', 'Number of train samples.'),
        ('eval_samples', 'Number of eval samples.'),
    )
    ml_perf_params = schedule_params.ml_perf
    for field_name, description in ml_perf_fields:
      ml_perf_params.Define(field_name, None, description)

    return schedule_params
Ejemplo n.º 2
0
 def _JobSpec(cls, replicas):
     """Builds the params describing one job with the given replica count.

     Args:
       replicas: Default value stored in the `replicas` param (the number of
         tasks of the job).

     Returns:
       A `hyperparams.Params` with the job-spec fields defined.
     """
     job_params = hyperparams.Params()

     # Job-spec settings as (name, default, description), listed in the
     # order they are defined.
     job_fields = (
         # The default name is /job:localhost so that most tests just work
         # out of the box; trainer.py then sets job names accordingly.
         ('name', '/job:localhost',
          'TensorFlow job spec, e.g., /job:trainer, /job:ps'),
         ('replicas', replicas, 'The number of tasks of a job.'),
         ('targets', '',
          'The target network address(es) to which we can '
          'create tf sessions. E.g., a single ip:port, or a list of '
          'comma-separated grpc://ip:port, etc.'),
         ('cpus_per_replica', 1,
          'The number of CPU devices to use per '
          'replica.'),
         ('gpus_per_replica', 0,
          'The number of GPU devices to use per '
          'replica.'),
         ('devices_per_split', 1,
          'Devices of a replica are grouped into '
          'splits. Each split contains these many devices. One split is a '
          'group of devices on which the computation nodes of a graph is '
          'placed upon.E.g., one can place the forward lstm on device 0 of '
          'a split and place the backward lstm on device 1. etc.'),
         ('tpus_per_replica', 0,
          'The number of tpu cores to use per replica.'),
         ('num_tpu_hosts', 0, 'The number of tpu hosts.'),
     )
     for field_name, default_value, description in job_fields:
         job_params.Define(field_name, default_value, description)

     return job_params
Ejemplo n.º 3
0
 def Params(cls):
     """Builds the params controlling early stopping.

     Returns:
       A `hyperparams.Params` with the early-stop settings, including the
       result of `MetricHistory.Params()` stored under `metric_history`.
     """
     early_stop_params = hyperparams.Params()
     early_stop_params.Define('name', 'EarlyStop', '')

     # Built after `name` is defined, matching the definition order.
     history_params = MetricHistory.Params()
     early_stop_params.Define('metric_history', history_params,
                              'Metric history params.')

     # Remaining settings as (name, default, description), in definition
     # order.
     stop_fields = (
         ('tolerance', 0.0,
          'Minimum significant difference in metric; '
          'useful if progress is asymptotic.'),
         ('window', 0,
          'Maximum number of steps between best and current.'),
         ('verbose', True, 'Log early-stop checks.'),
         ('min_steps', 0, 'Minimum number of steps before stopping.'),
     )
     for field_name, default_value, description in stop_fields:
         early_stop_params.Define(field_name, default_value, description)

     return early_stop_params
Ejemplo n.º 4
0
 def Params(cls):
     """Builds the default params for a trial.

     Returns:
       A `hyperparams.Params` with the reporting interval, the objective
       metric key and the report-during-training flag defined.
     """
     trial_params = hyperparams.Params()

     # Trial settings as (name, default, description), in definition order.
     trial_fields = (
         ('report_interval_seconds', 600,
          'Interval between reporting trial results and checking for early '
          'stopping.'),
         ('vizier_objective_metric_key', 'loss',
          'Which eval metric to use as the "objective value" for tuning.'),
         ('report_during_training', False,
          'Whether to report objective metrics during the training process.'),
     )
     for field_name, default_value, description in trial_fields:
         trial_params.Define(field_name, default_value, description)

     return trial_params
Ejemplo n.º 5
0
  def Params(cls):
    """Builds the params for a SimpleProgramSchedule.

    Returns:
      The `hyperparams.InstantiableParams` created for `cls`, holding the
      schedule-level settings and a nested `ml_perf` params group.
    """
    schedule_params = hyperparams.InstantiableParams(cls)

    # Schedule-level settings as (name, default, description), listed in the
    # order they are defined.
    schedule_fields = (
        ('task_dict', None, 'dataset_name -> task params'),
        ('task_name', None, 'High level task name'),
        ('logdir', None, 'Log directory'),
        ('train_program', None, 'Train program params'),
        ('train_executions_per_eval', 1, ''),
        ('eval_programs', [], 'List of eval program params.'),
        ('num_splits_per_client', None, ''),
        ('dataset_names', [], 'List of all dataset names.'),
    )
    for field_name, default_value, description in schedule_fields:
      schedule_params.Define(field_name, default_value, description)

    # TODO(blee): Clean these up.
    schedule_params.Define('ml_perf', hyperparams.Params(),
                           'MlPerf configuration.')
    ml_perf_params = schedule_params.ml_perf
    ml_perf_params.Define('benchmark_name', None,
                          'Benchmark name for compliance log.')

    return schedule_params
Ejemplo n.º 6
0
def BuildData():
    """Collects build information of this py binary into a hyperparam.

    Returns:
      A `hyperparams.Params` with one param per `build_data` query, each
      holding the value that query returned.
    """
    build_params = hyperparams.Params()

    # (param name, build_data accessor, description). Each accessor is only
    # called when its param is defined, in the order listed here.
    build_fields = (
        ('timestamp', build_data.TimestampAscii,
         'Build timestamp as a string.'),
        ('info', build_data.BuildInfo,
         'User, host, and directory of builder.'),
        ('target', build_data.Target, 'Build target.'),
        ('id', build_data.BuildID, 'Build id.'),
        ('changelist', build_data.Changelist, 'Build CL.'),
        ('client_info', build_data.ClientInfo,
         'Perforce client changelist and status as descriptive string.'),
        ('label', build_data.BuildLabel,
         'Build label (passed to make-{opt,dbg} -l).'),
        ('platform', build_data.Platform, 'Google platform.'),
        ('tool', build_data.BuildTool, 'Build tool.'),
        ('paropts', build_data.ParOptions, 'Par options.'),
    )
    for field_name, accessor, description in build_fields:
        build_params.Define(field_name, accessor(), description)

    return build_params
Ejemplo n.º 7
0
 def Params(cls):
     """Builds the params describing a metric history.

     Returns:
       A `hyperparams.Params` naming the job and metric to record, whether
       the metric is minimized, and where/how its history is read.
     """
     history_params = hyperparams.Params()

     # Settings as (name, default, description), in definition order.
     history_fields = (
         ('name', 'MetricHistory', 'Used by SetLogdirInMetricHistories.'),
         ('jobname', 'eval_dev',
          'Job and dataset to which metric applies.'),
         ('metric', 'log_pplx', 'Metric to record.'),
         ('minimize', True,
          'If True, training minimizes the metric. If False, training '
          'maximizes the metric.'),
         ('logdir', '', 'Root dir for BF logs.'),
         ('tfevent_file', False,
          'If True, read the metric from '
          'events.out.tfevents.* files in the job dir instead of '
          'maintaining a history file.'),
         ('local_filesystem', False,
          'Logdir is on local filesystem (needed for unit test).'),
     )
     for field_name, default_value, description in history_fields:
         history_params.Define(field_name, default_value, description)

     return history_params
    def Params(cls):
        """Builds the default params for input generators.

        Starts from the parent class's params, sets `name` to 'input', and
        adds batch/sample sizing, TPU infeed tuning and a nested `remote`
        params group.

        Returns:
          The params object obtained from the parent class, extended with
          the settings above.
        """
        input_params = super(BaseInputGenerator, cls).Params()
        input_params.name = 'input'

        # Top-level settings as (name, default, description), listed in the
        # order they are defined.
        input_fields = (
            ('batch_size', 0,
             'Batch size for a device split. This will be '
             'scaled to match the accelarator hardware topology.'),
            ('num_samples', 0,
             'If non-zero, the dataset contains these many samples. '
             'For test/eval dataset, if we want the test/evel job evaluate '
             'the whole dataset, this param must be set precisely. '
             'Otherwise, this param is optional.'),
            # TPU related infeed tuning.
            ('use_per_host_infeed', False,
             'Whether run infeed op on each host.'),
            ('tpu_infeed_parallelism', 1,
             'Uses these many python threads to drive infeed concurrently.'),
            ('use_partitioned_infeed_queue', False,
             'Use partitioned infeed'),
            ('num_partitions', None, 'Num partitions'),
        )
        for field_name, default_value, description in input_fields:
            input_params.Define(field_name, default_value, description)

        input_params.Define('remote', hyperparams.Params(),
                            'Params to configure remote input policy.')

        # Settings of the nested `remote` group, in definition order.
        remote_fields = (
            ('shardable_batch', True,
             'True if and only if this input generates simple batches '
             'whose 1st dimension of every tensor in a batch is the batch '
             'dimension, and other dimensions are always the same.'),
            ('max_inflights_per_target', 32,
             'The maximum number of '
             'concurrent inflight remote input fetches per remote target.'),
        )
        remote_params = input_params.remote
        for field_name, default_value, description in remote_fields:
            remote_params.Define(field_name, default_value, description)

        return input_params
 def Test(self):
     """Builds the Params for the testing dataset.

     Returns:
       A newly constructed `hyperparams.Params()`; no params are defined on
       it here.
     """
     test_params = hyperparams.Params()
     return test_params
 def Dev(self):
     """Builds the Params for the development dataset.

     Returns:
       A newly constructed `hyperparams.Params()`; no params are defined on
       it here.
     """
     dev_params = hyperparams.Params()
     return dev_params
 def Train(self):
     """Builds the Params for the training dataset.

     Returns:
       A newly constructed `hyperparams.Params()`; no params are defined on
       it here.
     """
     train_params = hyperparams.Params()
     return train_params
Ejemplo n.º 12
0
 def Params(cls):
     """Extends the parent class's params with a `qdomain` container.

     Returns:
       The params object obtained from the parent class, with a nested
       `qdomain` params group that defines a `default` entry (None).
     """
     layer_params = super(QuantizableLayer, cls).Params()

     layer_params.Define(
         'qdomain',
         hyperparams.Params(),
         'Container for quantization domains.',
     )
     # The nested entry is defined through attribute access on the params
     # object rather than on a local handle to the container.
     layer_params.qdomain.Define(
         'default',
         None,
         'Default quantization domain.',
     )

     return layer_params