def _apply_gradients_to_shared_memory_vars(self, grads, shared_vars):
    opt_st = self.opt_st
    self.flat_grads = np.empty(shared_vars.size, dtype=ctypes.c_float)

    # Flatten grads
    offset = 0
    for g in grads:
        self.flat_grads[offset:offset + g.size] = g.reshape(-1)
        offset += g.size
    g = self.flat_grads

    shared_vars.step.value += 1
    T = shared_vars.step.value

    if self.optimizer_type == "adam" and self.optimizer_mode == "shared":
        p = np.frombuffer(shared_vars.vars, ctypes.c_float)
        p_size = shared_vars.size
        m = np.frombuffer(opt_st.ms, ctypes.c_float)
        v = np.frombuffer(opt_st.vs, ctypes.c_float)
        opt_st.lr.value = 1.0 * opt_st.lr.value * (1 - self.b2**T)**0.5 / (1 - self.b1**T)

        apply_grads_adam(m, v, g, p, p_size, opt_st.lr.value,
                         self.b1, self.b2, self.e)

    elif self.optimizer_type == "adamax" and self.optimizer_mode == "shared":
        beta_1 = .9
        beta_2 = .999
        lr = opt_st.lr.value
        p = np.frombuffer(shared_vars.vars, ctypes.c_float)
        p_size = shared_vars.size
        m = np.frombuffer(opt_st.ms, ctypes.c_float)
        u = np.frombuffer(opt_st.vs, ctypes.c_float)

        apply_grads_adamax(m, u, g, p, p_size, lr, beta_1, beta_2, T)

    else:  # local or shared rmsprop/momentum
        lr = self.decay_lr()
        if self.optimizer_mode == "local":
            m = opt_st
        else:  # shared
            m = np.frombuffer(opt_st.vars, ctypes.c_float)

        p = np.frombuffer(shared_vars.vars, ctypes.c_float)
        p_size = shared_vars.size
        _type = 0 if self.optimizer_type == "momentum" else 1

        apply_grads_mom_rmsprop(m, g, p, p_size, _type, lr,
                                self.alpha, self.e)
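
# --- Illustrative sketch (not part of the original source) -----------------
# The apply_grads_* helpers called above are defined elsewhere in the project
# (presumably compiled update kernels).  As a hedged, NumPy-only sketch of
# what the shared Adam branch is assumed to compute in place -- with the
# bias-corrected learning rate already folded into `lr` by the caller -- one
# could write the following.  The name is hypothetical and relies on `np`
# being imported as in the rest of this module.
def _sketch_apply_grads_adam(m, v, g, p, p_size, lr, b1, b2, e):
    # m, v: first/second moment estimates; g: flattened gradients;
    # p: flattened parameters.  All arrays are views over shared memory,
    # so these in-place updates are visible to every worker process.
    m[:p_size] = b1 * m[:p_size] + (1.0 - b1) * g[:p_size]
    v[:p_size] = b2 * v[:p_size] + (1.0 - b2) * g[:p_size] ** 2
    p[:p_size] -= lr * m[:p_size] / (np.sqrt(v[:p_size]) + e)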
def _apply_gradients_to_shared_memory_vars(self, grads, opt_st):
    # Flatten grads
    offset = 0
    for g in grads:
        self.flat_grads[offset:offset + g.size] = g.reshape(-1)
        offset += g.size
    g = self.flat_grads

    if self.optimizer_type == "adam" and self.optimizer_mode == "shared":
        p = np.frombuffer(self.learning_vars.vars, ctypes.c_float)
        p_size = self.learning_vars.size
        m = np.frombuffer(opt_st.ms, ctypes.c_float)
        v = np.frombuffer(opt_st.vs, ctypes.c_float)
        T = self.global_step.value()
        # Fold the Adam bias-correction factor for step T into the shared lr
        opt_st.lr.value = 1.0 * opt_st.lr.value * (1 - self.b2**T)**0.5 / (1 - self.b1**T)

        apply_grads_adam(m, v, g, p, p_size, opt_st.lr.value,
                         self.b1, self.b2, self.e)

    elif self.optimizer_type == "adamax" and self.optimizer_mode == "shared":
        beta_1 = .9
        beta_2 = .999
        lr = opt_st.lr.value
        p = np.frombuffer(self.learning_vars.vars, ctypes.c_float)
        p_size = self.learning_vars.size
        m = np.frombuffer(opt_st.ms, ctypes.c_float)
        u = np.frombuffer(opt_st.vs, ctypes.c_float)
        T = self.global_step.value()

        apply_grads_adamax(m, u, g, p, p_size, lr, beta_1, beta_2, T)

    else:  # local or shared rmsprop/momentum
        lr = self.decay_lr()
        if self.optimizer_mode == "local":
            m = opt_st
        else:  # shared
            m = np.frombuffer(opt_st.vars, ctypes.c_float)

        p = np.frombuffer(self.learning_vars.vars, ctypes.c_float)
        p_size = self.learning_vars.size
        _type = 0 if self.optimizer_type == "momentum" else 1

        apply_grads_mom_rmsprop(m, g, p, p_size, _type, lr,
                                self.alpha, self.e)
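
# --- Illustrative sketch (not part of the original source) -----------------
# How the shared buffers consumed above (learning_vars.vars, opt_st.ms, ...)
# are assumed to be wired up: a flat multiprocessing array of c_float holds
# the values, and each worker wraps it with np.frombuffer to get a zero-copy
# NumPy view that it updates in place, Hogwild-style, without locks.  The
# function name is hypothetical; RawArray and np.frombuffer are real APIs.
def _sketch_make_shared_params(n_params):
    import multiprocessing as mp
    raw = mp.RawArray(ctypes.c_float, n_params)   # lock-free shared storage
    view = np.frombuffer(raw, ctypes.c_float)     # NumPy view over the same memory
    view[:] = 0.0                                 # e.g. zero-initialise
    return raw, view
# Usage sketch: every worker process calling np.frombuffer on the same RawArray
# sees the same underlying floats, which is what makes the in-place
# apply_grads_* updates above visible across processes.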