Ejemplo n.º 1
0
  def Params(cls):
    """Builds the params for this class on top of the parent class's params.

    Top-level entries are defined directly on the returned object; the
    training-related entries are defined on the nested `train` params.

    Returns:
      The params object obtained from the parent class's `Params()`, with the
      entries below defined on it.
    """
    p = super(BaseModel, cls).Params()

    # Top-level entries.
    p.Define(
        'model', None, 'Which python function generates the param. It includes '
        'the file name and lineno where the function is defined.')
    p.Define(
        'cluster', cluster_factory.Cluster.Params(),
        'The training cluster. Individual layer may config differently'
        ' based on training cluster it is running under.')
    p.Define('input', None, 'Input generator Params.')
    p.Define('build_data', build_data.BuildData(), 'Build data of this binary.')
    p.Define('train', hyperparams.Params(),
             'Params to control how this model should be trained.')

    # `tp` aliases the nested `train` params; everything defined through it
    # ends up under `p.train`.
    tp = p.train
    tp.Define(
        'start_up_delay_steps', 200, 'i-th replica starts training after '
        'i*(i+1)/2*start_up_delay_steps steps')
    tp.Define('max_steps', 4 * 10**6, 'Training max of 4M steps.')
    tp.Define('tpu_steps_per_loop', 100, 'The number of training steps per '
              'training loop for TPUs.')
    tp.Define(
        'ema_decay', 0.0,
        'If > 0, enable ExponentialMovingAverage during training '
        'with the give decay. '
        'Must be < 1. Disabled if <= 0.')
    tp.Define('init_from_checkpoint_rules', {},
              'See BaseTask documentation for details.')
    tp.Define('early_stop', None,
              'Early stopping based on dev-set performance.')
    tp.Define('save_interval_seconds', 60 * 10,
              'Generates a checkpoint roughly once every this many seconds.')
    # This interval governs summaries, not checkpoints (checkpoint cadence is
    # `save_interval_seconds` above).
    tp.Define('summary_interval_steps', 100,
              'Generates a summary roughly once every this many steps.')

    return p
Ejemplo n.º 2
0
  def Params(cls):
    """Builds the params for this class on top of the parent class's params.

    Top-level entries are defined directly on the returned object; the
    training-related entries are defined on the nested `train` params.

    Returns:
      The params object obtained from the parent class's `Params()`, with the
      entries below defined on it.
    """
    p = super().Params()

    # Top-level entries.
    p.Define(
        'model', None, 'Which python function generates the param. It includes '
        'the file name and lineno where the function is defined.')
    p.Define(
        'cluster', cluster_factory.Cluster.Params(),
        'The training cluster. Individual layer may config differently'
        ' based on training cluster it is running under.')
    p.Define('input', None, 'Input generator Params.')
    p.Define('build_data', build_data.BuildData(), 'Build data of this binary.')
    p.Define('train', hyperparams.Params(),
             'Params to control how this model should be trained.')
    p.Define('reporting_job', 'evaler',
             'Name of job that reports trial results.')

    # `tp` aliases the nested `train` params; everything defined through it
    # ends up under `p.train`.
    tp = p.train
    tp.Define(
        'start_up_delay_steps', 200, 'i-th replica starts training after '
        'i*(i+1)/2*start_up_delay_steps steps')
    tp.Define('max_steps', 4 * 10**6, 'Training max of 4M steps.')
    tp.Define('tpu_steps_per_loop', 1000, 'The number of training steps per '
              'training loop for TPUs.')
    tp.Define(
        'tpu_device_order_mode', None,
        'A device_assignment_lib.DeviceOrderMode enum that determines whether '
        'to assign devices in a way that the order of replicas or '
        'model-parallel cores will form a ring or mesh, or let the library to '
        'choose. Default None to AUTO.')
    tp.Define(
        'ema_decay', 0.0,
        'If > 0, enable ExponentialMovingAverage during training '
        'with the give decay. '
        'Must be < 1. Disabled if <= 0. '
        'Must be set consistent across all tasks.')
    tp.Define(
        'ema_decay_moving_vars', None,
        'If True, include variables from collection "moving_vars" in ema. '
        'Must be set consistent across all tasks.')
    tp.Define('init_from_checkpoint_rules', {},
              'See BaseTask documentation for details.')
    tp.Define('early_stop', None,
              'Early stopping based on dev-set performance.')
    tp.Define(
        'enqueue_max_steps', -1, 'Max enqueue steps. -1 meaning no limit.'
        ' This flag should be set for unit-test only.')

    # Checkpoint cadence and retention.
    tp.Define('save_interval_seconds', 60 * 10,
              'Generates a checkpoint roughly once every this many seconds.')
    tp.Define('save_max_to_keep', 100,
              'Maximum number of recent checkpoints to keep.')
    tp.Define('save_keep_checkpoint_every_n_hours', 0.5,
              'How often to keep a checkpoint.')
    # This interval governs summaries, not checkpoints (checkpoint cadence is
    # `save_interval_seconds` above).
    tp.Define('summary_interval_steps', 100,
              'Generates a summary roughly once every this many steps.')

    return p