def _new_update_deltas(self, network, parameter_vws, grads):
    """Quickprop-style updates.

    For each parameter, scales the previous update by the ratio of the
    current gradient to the change in gradient since the last step.

    Generalization: epsilon was a hard-coded constant (the original had
    a "TODO paramerize" note); it is now a hyperparameter with the same
    default (1e-6), so existing behavior is unchanged.
    """
    epsilon = network.find_hyperparameter(["quickprop_epsilon", "epsilon"],
                                          1e-6)
    update_deltas = treeano.UpdateDeltas()
    for parameter_vw, grad in zip(parameter_vws, grads):
        # previous step's gradient (state maintained by update_utils)
        prev_grad, _ = update_utils.update_previous(
            network,
            update_deltas,
            grad,
            "grad(%s)" % parameter_vw.name,
            parameter_vw.shape)
        # previous update, initialized to 1 so the first step is
        # proportional to the gradient
        prev_update = network.create_vw(
            "quickprop_prev_update(%s)" % parameter_vw.name,
            shape=parameter_vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[treeano.inits.ConstantInit(1)],
        ).variable
        denom = prev_grad - grad
        # keep the denominator away from zero, preserving its sign
        denom = denom + treeano.utils.sign_non_zero(denom) * epsilon
        parameter_delta = prev_update * grad / denom
        parameter = parameter_vw.variable
        update_deltas[parameter] = parameter_delta
        update_deltas[prev_update] = parameter_delta - prev_update
    return update_deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """Adam updates (Kingma & Ba, 2015).

    Fix: the iteration counter ``t`` was maintained as shared state but
    never read. It now drives the paper's bias-correction terms, which
    account for the zero initialization of the moment estimates — this
    also makes the node consistent with the other adam variants in this
    file.
    """
    # alpha / stepsize / learning rate are all the same thing
    # using alpha because that is what is used in the paper
    alpha = network.find_hyperparameter(
        ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.001)
    beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.9)
    beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
    epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"], 1e-8)
    update_deltas = treeano.UpdateDeltas()
    # keep count state only once
    t_vw = network.create_vw(
        "adam_count",
        shape=(),
        is_shared=True,
        tags={"state"},
        default_inits=[],
    )
    t = t_vw.variable
    new_t = t + 1
    update_deltas[t] = new_t - t
    # unbias terms to take into account initializing with 0
    # NOTE: unbias terms assume constant beta1/beta2
    m_unbias_term = 1 - beta1 ** new_t
    v_unbias_term = T.sqrt(1 - beta2 ** new_t)
    epsilon_hat = epsilon * v_unbias_term
    alpha_t = alpha * v_unbias_term / m_unbias_term
    for parameter_vw, grad in zip(parameter_vws, grads):
        # biased 1st moment estimate
        # moving average of gradient
        m_vw = network.create_vw(
            "adam_m(%s)" % parameter_vw.name,
            shape=parameter_vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        # 2nd moment
        # moving average of squared gradient
        v_vw = network.create_vw(
            "adam_v(%s)" % parameter_vw.name,
            shape=parameter_vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        m = m_vw.variable
        v = v_vw.variable
        # new value for 1st moment estimate
        new_m = beta1 * m + (1 - beta1) * grad
        # new value for 2nd moment estimate
        new_v = beta2 * v + (1 - beta2) * T.sqr(grad)
        parameter_delta = -alpha_t * new_m / (T.sqrt(new_v) + epsilon_hat)
        update_deltas[m] = new_m - m
        update_deltas[v] = new_v - v
        update_deltas[parameter_vw.variable] = parameter_delta
    return update_deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """SGD where each parameter's step is additionally scaled by the
    square of that parameter's initial standard deviation."""
    learning_rate = network.find_hyperparameter(
        ["sgd_learning_rate", "learning_rate"], 0.1)
    # HACK changes the rest of this node... mostly restructuring
    update_dict = {}
    for param_vw, param_grad in zip(parameter_vws, grads):
        init_std = np.std(param_vw.value)
        # prevent multiplying by 0 std
        if init_std == 0:
            init_std = 1.0
        scale = treeano.utils.as_fX(-learning_rate * init_std ** 2)
        update_dict[param_vw.variable] = scale * param_grad
    return treeano.UpdateDeltas(update_dict)
def _new_update_deltas(self, network, parameter_vws, grads):
    """RMSProp variant that divides a momentum-smoothed gradient by a
    running standard-deviation estimate of the gradient."""
    learning_rate = network.find_hyperparameter(["learning_rate"], 1e-2)
    momentum = network.find_hyperparameter(["momentum"], 0.9)
    rho = network.find_hyperparameter(["rho"], 0.95)
    epsilon = network.find_hyperparameter(
        ["std_rmsprop_epsilon", "epsilon"], 1e-8)

    def _state(fmt, vw):
        # zero-initialized per-parameter shared state
        return network.create_vw(
            fmt % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        ).variable

    deltas = treeano.UpdateDeltas()
    for vw, grad in zip(parameter_vws, grads):
        # exponential moving average of gradients for the numerator
        mu = _state("std_rmsprop_gradients_momentum(%s)", vw)
        # exponential moving average of gradients for the denominator
        avg_g = _state("std_rmsprop_gradients(%s)", vw)
        # exponential moving average of squared gradients
        avg_g2 = _state("std_rmsprop_gradients_squared(%s)", vw)
        # updated state
        next_mu = momentum * mu + (1 - momentum) * grad
        next_avg_g = rho * avg_g + (1 - rho) * grad
        next_avg_g2 = rho * avg_g2 + (1 - rho) * T.sqr(grad)
        # std-like normalizer: sqrt(E[g^2] - E[g]^2 + eps)
        std = T.sqrt(next_avg_g2 - T.sqr(next_avg_g) + epsilon)
        deltas[mu] = next_mu - mu
        deltas[avg_g] = next_avg_g - avg_g
        deltas[avg_g2] = next_avg_g2 - avg_g2
        deltas[vw.variable] = -learning_rate * next_mu / std
    return deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """Equilibrated SGD (ESGD) updates.

    Normalizes the gradient by a running estimate built from squared
    Hessian-vector products with random normal vectors.
    """
    # NOTE: in the paper, learning_rate is referred to as epsilon;
    # not doing that here as it would be confusing
    learning_rate = network.find_hyperparameter(["learning_rate"], 0.01)
    # NOTE: this is referred to as lambda in the paper, where
    # hyperparameter selection is over 1e-4, 1e-5, 1e-6
    damping_factor = network.find_hyperparameter(["damping_factor"], 1e-2)
    deltas = treeano.UpdateDeltas()
    count_vw = network.create_vw(
        "esgd_count",
        shape=(),
        is_shared=True,
        tags={"state"},
        default_inits=[],
    )
    count = count_vw.variable
    next_count = count + 1
    deltas[count] = next_count - count
    for vw, grad in zip(parameter_vws, grads):
        D_vw = network.create_vw(
            "esgd_D(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        # TODO ESGD update should only occur every 20 iterations
        # to amortize cost
        param = vw.variable
        D = D_vw.variable
        # TODO save this state so that we can seed the rng
        srng = MRG_RandomStreams()
        # random noise vector for the Hessian-vector product
        noise = srng.normal(size=param.shape)
        # Hessian-vector product via the R-operator
        Hv = T.Rop(grad, param, noise)
        D_increment = T.sqr(Hv)
        # (D + increment) / next_count is essentially a running mean
        denominator = damping_factor + T.sqrt((D + D_increment) / next_count)
        deltas[param] = -learning_rate * grad / denominator
        deltas[D] = D_increment
    return deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """Graves-style RMSProp with a momentum term on the update."""
    learning_rate = network.find_hyperparameter(["learning_rate"], 1e-4)
    rho = network.find_hyperparameter(["rho"], 0.95)
    momentum = network.find_hyperparameter(["momentum"], 0.9)
    epsilon = network.find_hyperparameter(["epsilon"], 1e-4)

    def _state(fmt, vw):
        # zero-initialized per-parameter shared state
        return network.create_vw(
            fmt % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        ).variable

    deltas = treeano.UpdateDeltas()
    for vw, grad in zip(parameter_vws, grads):
        # momentum term
        step = _state("graves_rmsprop_delta(%s)", vw)
        # exponential moving average of gradients
        avg_g = _state("graves_rmsprop_gradients(%s)", vw)
        # exponential moving average of gradients squared
        avg_g2 = _state("graves_rmsprop_gradients_squared(%s)", vw)
        next_avg_g = rho * avg_g + (1 - rho) * grad
        next_avg_g2 = rho * avg_g2 + (1 - rho) * T.sqr(grad)
        # std-like normalizer: sqrt(E[g^2] - E[g]^2 + eps)
        std = T.sqrt(next_avg_g2 - T.sqr(next_avg_g) + epsilon)
        next_step = momentum * step - learning_rate * grad / std
        deltas[avg_g] = next_avg_g - avg_g
        deltas[avg_g2] = next_avg_g2 - avg_g2
        deltas[step] = next_step - step
        deltas[vw.variable] = next_step
    return deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """SMORMS3 updates: a step size clipped by a signal-to-noise ratio,
    with an adaptively-decayed memory of gradient statistics."""
    learning_rate = network.find_hyperparameter(["learning_rate"], 0.001)
    epsilon = network.find_hyperparameter(["epsilon"], 1e-16)
    deltas = treeano.UpdateDeltas()
    for vw, grad in zip(parameter_vws, grads):
        # memory length; initialized to 1
        mem_vw = network.create_vw(
            "smorms3_mem(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[treeano.inits.ConstantInit(1)],
        )
        # moving average of gradients
        g_vw = network.create_vw(
            "smorms3_g(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        # moving average of squared gradients
        g2_vw = network.create_vw(
            "smorms3_g2(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        param = vw.variable
        mem = mem_vw.variable
        g = g_vw.variable
        g2 = g2_vw.variable
        # decay rate derived from the memory length
        decay = 1 / (mem + 1)
        next_g = (1 - decay) * g + decay * grad
        next_g2 = (1 - decay) * g2 + decay * grad ** 2
        # squared signal-to-noise ratio, in [0, 1]
        snr = (next_g ** 2) / (next_g2 + epsilon)
        denom = T.sqrt(next_g2) + epsilon
        deltas[param] = -grad * T.minimum(learning_rate, snr) / denom
        # reset memory toward 1 when snr is high
        deltas[mem] = (1 + mem * (1 - snr)) - mem
        deltas[g] = next_g - g
        deltas[g2] = next_g2 - g2
    return deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """Adam variant whose denominator is a gradient-variance estimate.

    In addition to adam's moving averages of the gradient (m) and
    squared gradient (v), keeps a second moving average of the gradient
    (g, decayed with beta2) and normalizes by sqrt(v - g^2) instead of
    sqrt(v).
    """
    # alpha / stepsize / learning rate are all the same thing
    # using alpha because that is what is used in the paper
    alpha = network.find_hyperparameter(
        ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.001)
    beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.9)
    beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
    epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"], 1e-8)
    deltas = treeano.UpdateDeltas()
    # iteration counter, shared across all parameters
    count_vw = network.create_vw(
        "adam_count",
        shape=(),
        is_shared=True,
        tags={"state"},
        default_inits=[],
    )
    count = count_vw.variable
    next_count = count + 1
    deltas[count] = next_count - count
    # bias-correction terms, computed once
    # NOTE: assumes constant beta1/beta2
    m_unbias = 1 - beta1 ** next_count
    v_unbias = T.sqrt(1 - beta2 ** next_count)
    epsilon_hat = epsilon * v_unbias
    alpha_t = alpha * v_unbias / m_unbias
    for vw, grad in zip(parameter_vws, grads):
        def _state(fmt):
            # zero-initialized per-parameter shared state
            return network.create_vw(
                fmt % vw.name,
                shape=vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable

        # biased 1st moment estimate (moving average of gradient)
        m = _state("adam_m(%s)")
        # 2nd moment (moving average of squared gradient)
        v = _state("adam_v(%s)")
        # another moving average of the gradient, decayed with beta2
        g = _state("adam_g(%s)")
        next_m = beta1 * m + (1 - beta1) * grad
        next_v = beta2 * v + (1 - beta2) * T.sqr(grad)
        next_g = beta2 * g + (1 - beta2) * grad
        step = -alpha_t * next_m / (
            T.sqrt(next_v - T.sqr(next_g)) + epsilon_hat)
        deltas[m] = next_m - m
        deltas[v] = next_v - v
        deltas[g] = next_g - g
        deltas[vw.variable] = step
    return deltas
def _new_update_deltas(self, network, parameter_vws, grads):
    """Adam variant ("adaadam") with an adjustable root in the
    denominator.

    Instead of dividing by sqrt(v), divides by v ** (1 / w), where w is
    either a constant (``constant_root``) or shared state that grows
    geometrically each batch.

    Fix: removed the dead ``if 1: ... else: ...`` construct — the first
    branch was always taken; the unreachable alternative is kept as a
    comment. Behavior is unchanged.
    """
    # alpha / stepsize / learning rate are all the same thing
    # using alpha because that is what is used in the paper
    alpha = network.find_hyperparameter(["learning_rate"], 0.001)
    beta1 = network.find_hyperparameter(["beta1"], 0.9)
    beta2 = network.find_hyperparameter(["beta2"], 0.999)
    epsilon = network.find_hyperparameter(["epsilon"], 1e-8)
    constant_root = network.find_hyperparameter(["constant_root"], None)
    normalize_denominator = network.find_hyperparameter(
        ["normalize_denominator"], True)
    update_deltas = treeano.UpdateDeltas()
    # keep count state only once
    t_vw = network.create_vw(
        "adaadam_count",
        shape=(),
        is_shared=True,
        tags={"state"},
        default_inits=[],
    )
    t = t_vw.variable
    new_t = t + 1
    update_deltas[t] = new_t - t
    # compute some values only once
    # unbias terms to take into account initializing with 0
    # NOTE: unbias terms assume constant beta1/beta2
    m_unbias_term = 1 - beta1 ** new_t
    v_unbias_term = T.sqrt(1 - beta2 ** new_t)
    epsilon_hat = epsilon * v_unbias_term
    alpha_t = alpha * v_unbias_term / m_unbias_term
    if constant_root is None:
        h = network.find_hyperparameter(["half_life_batches"])
        # heuristic: set as half_life_batches by default
        c = network.find_hyperparameter(["clipped_batches"], h)
        f = 2.0 ** (1. / h)
        w0 = 2.0 * (1 / f) ** c
        w_state = network.create_vw(
            "adaadam_w",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[treeano.inits.ConstantInit(w0)],
        ).variable
        # grow w geometrically each batch
        update_deltas[w_state] = w_state * f - w_state
        # TODO parameterize bounds
        w = T.clip(w_state, 2.0, 10000.0)
    else:
        w = constant_root
    for parameter_vw, grad in zip(parameter_vws, grads):
        # biased 1st moment estimate
        # moving average of gradient
        m_vw = network.create_vw(
            "adaadam_m(%s)" % parameter_vw.name,
            shape=parameter_vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        # 2nd moment
        # moving average of squared gradient
        v_vw = network.create_vw(
            "adaadam_v(%s)" % parameter_vw.name,
            shape=parameter_vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        m = m_vw.variable
        v = v_vw.variable
        # new value for 1st moment estimate
        new_m = beta1 * m + (1 - beta1) * grad
        # new value for 2nd moment estimate
        new_v = beta2 * v + (1 - beta2) * T.sqr(grad)
        orig_denom = T.sqrt(new_v)
        denom = T.pow(new_v, 1. / w)
        # rescale the new denominator to approximately match the total
        # magnitude of the standard sqrt denominator
        # FIXME try w/ and w/o normalizer
        if normalize_denominator:
            denom_normalizer = ((orig_denom.sum() + 1e-8) /
                                (denom.sum() + 1e-8))
        else:
            denom_normalizer = 1
        parameter_delta = -alpha_t * new_m / (
            (denom + epsilon_hat) * denom_normalizer)
        # alternative (previously the dead `else` branch):
        # parameter_delta = -alpha_t * new_m / (
        #     denom * denom_normalizer + epsilon_hat)
        update_deltas[m] = new_m - m
        update_deltas[v] = new_v - v
        update_deltas[parameter_vw.variable] = parameter_delta
    return update_deltas
def _new_update_deltas(self, network, vws, grads):
    # Returns the same delta for every parameter variable; `grads` is
    # intentionally ignored.
    # NOTE(review): `const` is not defined anywhere in this function's
    # scope — presumably it should come from a hyperparameter or an
    # enclosing scope. Confirm this node actually runs before relying
    # on it.
    return treeano.UpdateDeltas({vw.variable: const for vw in vws})
def _new_update_deltas(self, network, parameter_vws, grads):
    """Nesterov-accelerated Adam (NAdam) updates."""
    # alpha / stepsize / learning rate are all the same thing
    # using alpha because that is what is used in the paper
    alpha = network.find_hyperparameter(
        ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.002)
    beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.975)
    beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
    epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"], 1e-8)
    deltas = treeano.UpdateDeltas()
    # iteration counter, shared across all parameters
    count_vw = network.create_vw(
        "adam_count",
        shape=(),
        is_shared=True,
        tags={"state"},
        default_inits=[],
    )
    count = count_vw.variable
    next_count = count + 1
    deltas[count] = next_count - count
    # unbias terms computed once; they account for zero initialization
    # NOTE: assumes constant beta1/beta2
    m_unbias_now = 1 - beta1 ** next_count
    m_unbias_next = 1 - beta1 ** (next_count + 1)
    v_unbias = T.sqrt(1 - beta2 ** next_count)
    for vw, grad in zip(parameter_vws, grads):
        # biased 1st moment estimate (moving average of gradient)
        m_var = network.create_vw(
            "adam_m(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        ).variable
        # 2nd moment (moving average of squared gradient)
        v_var = network.create_vw(
            "adam_v(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        ).variable
        next_m = beta1 * m_var + (1 - beta1) * grad
        next_v = beta2 * v_var + (1 - beta2) * T.sqr(grad)
        # nesterov-style lookahead on the 1st moment
        numer = (beta1 * next_m / m_unbias_next +
                 (1 - beta1) * grad / m_unbias_now)
        # NOTE: nadam paper has epsilon inside sqrt, but leaving it
        # outside for consistency with adam
        denom = T.sqrt(beta2 * next_v / v_unbias) + epsilon
        deltas[m_var] = next_m - m_var
        deltas[v_var] = next_v - v_var
        deltas[vw.variable] = -alpha * numer / denom
    return deltas
def new_update_deltas(self, network):
    """Increments the shared batch index counter by one each update."""
    counter = network.get_vw("batch_idx").variable
    return treeano.UpdateDeltas({counter: treeano.utils.as_fX(1)})
def _new_update_deltas(self, network, parameter_vws, grads):
    """Adam updates where each parameter's step is additionally scaled
    by a function of that parameter's initial standard deviation."""
    # alpha / stepsize / learning rate are all the same thing
    # using alpha because that is what is used in the paper
    alpha = network.find_hyperparameter(
        ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.001)
    beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.9)
    beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
    epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"], 1e-8)
    # HACK part 1: different from adam
    scale_fn = network.find_hyperparameter(["scale_function"],
                                           treeano.utils.identity)
    deltas = treeano.UpdateDeltas()
    # iteration counter, shared across all parameters
    count_vw = network.create_vw(
        "adam_count",
        shape=(),
        is_shared=True,
        tags={"state"},
        default_inits=[],
    )
    count = count_vw.variable
    next_count = count + 1
    deltas[count] = next_count - count
    # bias-correction terms, computed once
    # NOTE: assumes constant beta1/beta2
    m_unbias = 1 - beta1 ** next_count
    v_unbias = T.sqrt(1 - beta2 ** next_count)
    epsilon_hat = epsilon * v_unbias
    alpha_t = alpha * v_unbias / m_unbias
    for vw, grad in zip(parameter_vws, grads):
        # biased 1st moment estimate (moving average of gradient)
        m_var = network.create_vw(
            "adam_m(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        ).variable
        # 2nd moment (moving average of squared gradient)
        v_var = network.create_vw(
            "adam_v(%s)" % vw.name,
            shape=vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=[],
        ).variable
        next_m = beta1 * m_var + (1 - beta1) * grad
        next_v = beta2 * v_var + (1 - beta2) * T.sqr(grad)
        step = -alpha_t * next_m / (T.sqrt(next_v) + epsilon_hat)
        # HACK part 2: different from standard adam
        initial_std = treeano.utils.as_fX(np.std(vw.value))
        # prevent multiplying by 0 std
        if initial_std > 0:
            step *= scale_fn(initial_std)
        deltas[m_var] = next_m - m_var
        deltas[v_var] = next_v - v_var
        deltas[vw.variable] = step
    return deltas