Code example #1
def sample_adagrad_wide_grid(seed):
  """Sample a random configuration from a wide grid for adagrad."""
  rng = np.random.RandomState(seed)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
      "initial_accumulator_value": utils.sample_log_float(rng, 1e-10, 1e3),
  }
  return cfg
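All of the samplers on this page assume numpy (imported as np) and a project-specific utils module. As a point of reference, here is a minimal stand-in sketch of what utils.sample_log_float presumably does, assuming it draws a value log-uniformly between the two bounds; the real helper lives in the surrounding project and may differ:

import numpy as np

def sample_log_float(rng, low, high):
  """Hypothetical stand-in for utils.sample_log_float: log-uniform draw from [low, high]."""
  return float(np.exp(rng.uniform(np.log(low), np.log(high))))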
Code example #2
def sample_adam8p_wide_grid(seed):
  """Sample a random configuration from a wide grid for adam8p."""
  rng = np.random.RandomState(seed)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
      "beta1": 1 - utils.sample_log_float(rng, 1e-4, 1e0),
      "beta2": 1 - utils.sample_log_float(rng, 1e-6, 1e0),
      "epsilon": utils.sample_log_float(rng, 1e-10, 1e3),
  }
  return cfg
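Note that beta1 and beta2 are drawn as 1 minus a log-uniform value, which concentrates samples near 1.0 instead of spreading them uniformly. A small illustration, reusing the hypothetical sample_log_float stub above:

rng = np.random.RandomState(0)
betas = [1 - sample_log_float(rng, 1e-4, 1e0) for _ in range(5)]
print(betas)  # roughly half the draws land above 0.99, since the exponent is uniform in [-4, 0]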
Code example #3
def sample_nadamw_grid(seed):
    """Sample a random configuration from a wide grid for nadamw."""
    rng = np.random.RandomState(seed + 14358)
    cfg = {
        "learning_rate": utils.sample_log_float(rng, 1e-5, 1e0),
        "beta1": 1 - utils.sample_log_float(rng, 1e-3, 1e0),
        "beta2": 1 - utils.sample_log_float(rng, 1e-5, 1e0),
        "epsilon": utils.sample_log_float(rng, 1e-8, 1e4),
        "use_nesterov": rng.uniform(0., 1.) > 0.5,
    }

    # Weight decay / l2 regularization often comes in 2 forms: added to the loss
    # or "AdamW" style where the decay is only used to modify the weights and
    # not also accumulated in the rolling averages.
    # We have 3 configurations -- only adamw style, only l2, and the sum of both.
    # Values are picked in a wide range somewhat arbitrarily.
    rand_idx = rng.uniform(0, 1)
    if rand_idx < 0.3333:
        cfg["adamw_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)
        cfg["l2_weight_decay"] = 0.0
    elif rand_idx < 0.6666:
        cfg["adamw_weight_decay"] = 0.0
        cfg["l2_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)
    else:
        cfg["adamw_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)
        cfg["l2_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)

    # With probability 50% use a learning rate warmup. Warmups should be short,
    # so we choose a fraction < 0.1 of total training.
    if rng.uniform(0, 1) > 0.5:
        cfg["warmup_fraction"] = utils.sample_log_float(rng, 1e-5, 1e-1)
    else:
        cfg["warmup_fraction"] = 0.0

    # This optimizer family uses a cosine learning rate schedule decaying to some
    # fixed value. Many works simply decay to zero, which we do 50% of the time
    # here. The other times we use a variable final value, ranging from no decay
    # to five orders of magnitude smaller than the base learning rate.
    if rng.uniform(0, 1) > 0.5:
        cfg["min_learning_rate_mult"] = 0.0
    else:
        cfg["min_learning_rate_mult"] = utils.sample_log_float(rng, 1e-5, 1e0)

    # Determines how long a constant learning rate should be held.
    # A value of 0 means the decay starts immediately and 1 means no decay
    # will occur.
    cfg["constant_fraction"] = rng.uniform(0., 1.)

    return cfg
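A quick usage sketch (hypothetical call sites, not from the project), assuming the project's utils helper (or the stub above standing in for it) is importable. Because the RandomState is derived from the seed, the same seed always returns the same configuration, and every branch above contributes a key to the returned dict:

cfg_a = sample_nadamw_grid(seed=7)
cfg_b = sample_nadamw_grid(seed=7)
assert cfg_a == cfg_b  # sampling is deterministic in the seed
print(sorted(cfg_a))
# ['adamw_weight_decay', 'beta1', 'beta2', 'constant_fraction', 'epsilon',
#  'l2_weight_decay', 'learning_rate', 'min_learning_rate_mult',
#  'use_nesterov', 'warmup_fraction']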
Code example #4
def sample_adam1p_wide_grid(seed):
  """Sample a random configuration from a wide grid for adam8p."""
  rng = np.random.RandomState(seed + 4123)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
  }
  return cfg
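With only the learning rate sampled, this family is convenient for a quick random search; a hypothetical loop over seeds (the remaining Adam hyperparameters are presumably left at the implementation's defaults):

configs = [sample_adam1p_wide_grid(seed) for seed in range(5)]
for cfg in configs:
  print(cfg["learning_rate"])  # log-uniform in [1e-8, 1e1]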
Code example #5
File: adam8p.py  Project: LONG-9621/Stackedcapsule
def sample_adam6p_wide_grid(seed):
    """Sample a random configuration from a wide grid for adam6p."""
    rng = np.random.RandomState(seed + 123455)
    cfg = {
        "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
        "beta1": 1 - utils.sample_log_float(rng, 1e-4, 1e0),
        "beta2": 1 - utils.sample_log_float(rng, 1e-6, 1e0),
        "epsilon": utils.sample_log_float(rng, 1e-10, 1e3),
        "linear_decay": utils.sample_log_float(rng, 1e-7, 1e-4),
        "exponential_decay": utils.sample_log_float(rng, 1e-3, 1e-6),
    }
    return cfg
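The two extra parameters here, linear_decay and exponential_decay, look like per-step learning-rate decay rates given their ranges. The sketch below is only one plausible way such a pair could be combined into a schedule; it is an assumption for illustration, not taken from the project's adam6p implementation:

import numpy as np

def decayed_lr(base_lr, step, linear_decay, exponential_decay):
  """Hypothetical schedule combining a linear and an exponential per-step decay."""
  return base_lr * max(0.0, 1.0 - linear_decay * step) * np.exp(-exponential_decay * step)

print(decayed_lr(1e-3, step=10_000, linear_decay=1e-5, exponential_decay=1e-4))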