def sample_adagrad_wide_grid(seed):
  """Sample a random configuration from a wide grid for adagrad."""
  rng = np.random.RandomState(seed)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
      "initial_accumulator_value": utils.sample_log_float(rng, 1e-10, 1e3),
  }
  return cfg
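# All of the samplers in this module draw values with `utils.sample_log_float`,
# i.e. log-uniformly between two bounds. The standalone helper below is a
# minimal illustrative sketch of what such a function might look like; the real
# `utils` implementation in this repository may differ, so treat the name and
# body here as assumptions rather than the actual API.
import numpy as np


def sample_log_float(rng, low, high):
  """Draw a float log-uniformly from [low, high] using the given RandomState."""
  return float(np.exp(rng.uniform(np.log(low), np.log(high))))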
def sample_adam8p_wide_grid(seed):
  """Sample a random configuration from a wide grid for adam8p."""
  rng = np.random.RandomState(seed)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
      "beta1": 1 - utils.sample_log_float(rng, 1e-4, 1e0),
      "beta2": 1 - utils.sample_log_float(rng, 1e-6, 1e0),
      "epsilon": utils.sample_log_float(rng, 1e-10, 1e3),
  }
  return cfg
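# The beta parameters above are sampled as 1 minus a log-uniform draw, which
# concentrates values near 1: beta1 covers [0.0, 0.9999], but 0.9, 0.99, and
# 0.999 are all roughly equally likely. The helper below is a hypothetical
# illustration of that bias and is not part of the original module.
def _demo_beta_sampling_bias(num_samples=10000):
  """Show that 1 - log_uniform(1e-4, 1e0) concentrates beta1 samples near 1."""
  rng = np.random.RandomState(0)
  betas = [1 - utils.sample_log_float(rng, 1e-4, 1e0) for _ in range(num_samples)]
  frac_above_09 = sum(b > 0.9 for b in betas) / float(num_samples)
  # About three quarters of samples fall above 0.9, because each of the four
  # decades of (1 - beta1) in [1e-4, 1e0] is equally likely.
  return frac_above_09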
def sample_nadamw_grid(seed):
  """Sample a random configuration from a wide grid for nadamw."""
  rng = np.random.RandomState(seed + 14358)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-5, 1e0),
      "beta1": 1 - utils.sample_log_float(rng, 1e-3, 1e0),
      "beta2": 1 - utils.sample_log_float(rng, 1e-5, 1e0),
      "epsilon": utils.sample_log_float(rng, 1e-8, 1e4),
      "use_nesterov": rng.uniform(0., 1.) > 0.5,
  }

  # Weight decay / l2 regularization often comes in 2 forms: added to the loss,
  # or "AdamW" style where the decay only modifies the weights and is not also
  # accumulated in the rolling averages.
  # We use 3 configurations -- only AdamW-style decay, only l2, and the sum of
  # both. Values are picked from a wide range somewhat arbitrarily.
  rand_idx = rng.uniform(0, 1)
  if rand_idx < 0.3333:
    cfg["adamw_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)
    cfg["l2_weight_decay"] = 0.0
  elif rand_idx < 0.6666:
    cfg["adamw_weight_decay"] = 0.0
    cfg["l2_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)
  else:
    cfg["adamw_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)
    cfg["l2_weight_decay"] = utils.sample_log_float(rng, 1e-5, 1e-1)

  # With probability 50% use a learning rate warmup. Warmups should be short,
  # so we choose a fraction < 0.1 of all of training.
  if rng.uniform(0, 1) > 0.5:
    cfg["warmup_fraction"] = utils.sample_log_float(rng, 1e-5, 1e-1)
  else:
    cfg["warmup_fraction"] = 0.0

  # This optimizer family uses a cosine learning rate schedule that decays to
  # some fixed value. Many works simply decay to zero, which we do 50% of the
  # time here. The other times we use a variable decay ranging from no decay
  # down to 5 orders of magnitude smaller than the initial learning rate.
  if rng.uniform(0, 1) > 0.5:
    cfg["min_learning_rate_mult"] = 0.0
  else:
    cfg["min_learning_rate_mult"] = utils.sample_log_float(rng, 1e-5, 1e0)

  # Determines how long a constant learning rate should be held. A value of 0
  # means the decay starts immediately and 1 means no decay will occur.
  cfg["constant_fraction"] = rng.uniform(0., 1.)
  return cfg
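# A small, hypothetical usage sketch: because each sampler is a pure function
# of its integer seed, a hyperparameter sweep is just a range of seeds. The
# helper below is illustrative only and is not part of the original module.
def _demo_sample_nadamw_sweep(num_configs=5):
  """Draw a few nadamw configurations to illustrate how the samplers are used."""
  configs = [sample_nadamw_grid(seed) for seed in range(num_configs)]
  for cfg in configs:
    print(cfg["learning_rate"], cfg["warmup_fraction"], cfg["constant_fraction"])
  return configs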
def sample_adam1p_wide_grid(seed):
  """Sample a random configuration from a wide grid for adam1p."""
  rng = np.random.RandomState(seed + 4123)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
  }
  return cfg
def sample_adam6p_wide_grid(seed):
  """Sample a random configuration from a wide grid for adam6p."""
  rng = np.random.RandomState(seed + 123455)
  cfg = {
      "learning_rate": utils.sample_log_float(rng, 1e-8, 1e1),
      "beta1": 1 - utils.sample_log_float(rng, 1e-4, 1e0),
      "beta2": 1 - utils.sample_log_float(rng, 1e-6, 1e0),
      "epsilon": utils.sample_log_float(rng, 1e-10, 1e3),
      "linear_decay": utils.sample_log_float(rng, 1e-7, 1e-4),
      "exponential_decay": utils.sample_log_float(rng, 1e-6, 1e-3),
  }
  return cfg
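# Hypothetical convenience mapping from optimizer-family name to its sampling
# function, so callers can draw a configuration by name. The dictionary and the
# helper below are illustrative assumptions, not part of the original module.
_WIDE_GRID_SAMPLERS = {
    "adagrad_wide_grid": sample_adagrad_wide_grid,
    "adam8p_wide_grid": sample_adam8p_wide_grid,
    "nadamw_grid": sample_nadamw_grid,
    "adam1p_wide_grid": sample_adam1p_wide_grid,
    "adam6p_wide_grid": sample_adam6p_wide_grid,
}


def _demo_sample_by_name(name, seed):
  """Draw one configuration for the named optimizer family (illustrative)."""
  return _WIDE_GRID_SAMPLERS[name](seed)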