def get_updates(self, params, cost):
    grads = self.get_gradients(cost, params)
    accumulators = [shared_zeros(p.get_value().shape) for p in params]
    delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
    updates = []

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
        new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
        updates.append((a, new_a))

        # use the new accumulator and the *old* delta accumulator
        update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

        new_p = p - self.lr * update
        updates.append((p, new_p))

        # update delta accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
        updates.append((d_a, new_d_a))
    return updates

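# Not from the source: a minimal NumPy sketch of one Adadelta step mirroring
# the update rule above. The rho/lr/epsilon values and the toy gradient are
# illustrative assumptions, not taken from the original class.
import numpy as np

rho, lr, epsilon = 0.95, 1.0, 1e-6
p = np.array([0.5, -0.3])  # parameter
g = np.array([0.1, -0.2])  # gradient of the cost w.r.t. p
a = np.zeros_like(p)       # accumulated squared gradients
d_a = np.zeros_like(p)     # accumulated squared updates

new_a = rho * a + (1 - rho) * g ** 2  # update accumulator
update = g * np.sqrt(d_a + epsilon) / np.sqrt(new_a + epsilon)
p = p - lr * update
d_a = rho * d_a + (1 - rho) * update ** 2  # update delta accumulator
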
def __init__(self, input_shape, epsilon=1e-6, weights=None):
    self.init = initializations.get("uniform")
    self.input_shape = input_shape
    self.epsilon = epsilon

    self.gamma = self.init(self.input_shape)    # learned scale
    self.beta = shared_zeros(self.input_shape)  # learned shift

    self.params = [self.gamma, self.beta]
    if weights is not None:
        self.set_weights(weights)

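# Not from the source: a hedged NumPy sketch of how the gamma/beta created
# above are typically applied at forward time (normalize per feature, then
# scale and shift). The batch X and the per-feature statistics are assumptions.
import numpy as np

X = np.random.randn(8, 4)  # (batch, features)
gamma, beta, epsilon = np.ones(4), np.zeros(4), 1e-6

mean, var = X.mean(axis=0), X.var(axis=0)
X_norm = (X - mean) / np.sqrt(var + epsilon)  # zero mean, unit variance
out = gamma * X_norm + beta                   # learned scale and shift
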
def __init__(self, input_dim, output_dim=128,
             init='uniform', inner_init='orthogonal',
             activation='tanh', inner_activation='hard_sigmoid',
             truncate_gradient=-1, weights=None, return_sequences=False):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.truncate_gradient = truncate_gradient
    self.return_sequences = return_sequences

    self.init = initializations.get(init)
    self.inner_init = initializations.get(inner_init)
    self.activation = activations.get(activation)
    self.inner_activation = activations.get(inner_activation)
    self.input = T.matrix()

    # input gate
    self.W_i = self.init((self.input_dim, self.output_dim))
    self.U_i = self.inner_init((self.output_dim, self.output_dim))
    self.b_i = shared_zeros((self.output_dim,))

    # forget gate
    self.W_f = self.init((self.input_dim, self.output_dim))
    self.U_f = self.inner_init((self.output_dim, self.output_dim))
    self.b_f = shared_zeros((self.output_dim,))

    # candidate cell state
    self.W_c = self.init((self.input_dim, self.output_dim))
    self.U_c = self.inner_init((self.output_dim, self.output_dim))
    self.b_c = shared_zeros((self.output_dim,))

    # output gate
    self.W_o = self.init((self.input_dim, self.output_dim))
    self.U_o = self.inner_init((self.output_dim, self.output_dim))
    self.b_o = shared_zeros((self.output_dim,))

    self.params = [
        self.W_i, self.U_i, self.b_i,
        self.W_c, self.U_c, self.b_c,
        self.W_f, self.U_f, self.b_f,
        self.W_o, self.U_o, self.b_o,
    ]

    if weights is not None:
        self.set_weights(weights)

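# Not from the source: one LSTM step in plain NumPy using the same gate
# naming as the constructor above (W_*/U_*/b_* for input, forget, cell,
# output). Shapes and the sigmoid/tanh choices are illustrative assumptions.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

input_dim, output_dim = 3, 5
rng = np.random.RandomState(0)
W = {k: rng.uniform(-0.05, 0.05, (input_dim, output_dim)) for k in 'ifco'}
U = {k: rng.uniform(-0.05, 0.05, (output_dim, output_dim)) for k in 'ifco'}
b = {k: np.zeros(output_dim) for k in 'ifco'}

x_t = rng.randn(input_dim)
h_tm1, c_tm1 = np.zeros(output_dim), np.zeros(output_dim)

i = sigmoid(x_t @ W['i'] + h_tm1 @ U['i'] + b['i'])  # input gate
f = sigmoid(x_t @ W['f'] + h_tm1 @ U['f'] + b['f'])  # forget gate
o = sigmoid(x_t @ W['o'] + h_tm1 @ U['o'] + b['o'])  # output gate
c = f * c_tm1 + i * np.tanh(x_t @ W['c'] + h_tm1 @ U['c'] + b['c'])
h = o * np.tanh(c)  # new hidden state
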
def get_updates(self, params, cost):
    grads = self.get_gradients(cost, params)
    accumulators = [shared_zeros(p.get_value().shape) for p in params]
    updates = []

    for p, g, a in zip(params, grads, accumulators):
        new_a = a + g ** 2  # update accumulator
        updates.append((a, new_a))

        new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
        updates.append((p, new_p))
    return updates

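# Not from the source: one Adagrad step in NumPy matching the rule above.
# The accumulator only grows, so the effective step size shrinks over time;
# lr/epsilon and the toy values are illustrative.
import numpy as np

lr, epsilon = 0.01, 1e-6
p, g, a = np.array([1.0, -1.0]), np.array([0.3, 0.1]), np.zeros(2)

a = a + g ** 2                         # accumulate squared gradients
p = p - lr * g / np.sqrt(a + epsilon)  # per-parameter scaled step
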
def __init__(self, input_dim, output_dim, init='uniform', activation='linear', weights=None):
    self.init = initializations.get(init)
    self.activation = activations.get(activation)
    self.input_dim = input_dim
    self.output_dim = output_dim

    self.input = T.matrix()
    self.W = self.init((self.input_dim, self.output_dim))
    self.b = shared_zeros((self.output_dim,))

    self.params = [self.W, self.b]
    if weights is not None:
        self.set_weights(weights)

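# Not from the source: the forward pass this layer presumably computes,
# activation(X.dot(W) + b), sketched in NumPy with the default 'linear'
# (identity) activation. Shapes are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(8, 3)                   # (batch, input_dim)
W = rng.uniform(-0.05, 0.05, (3, 4))  # (input_dim, output_dim)
b = np.zeros(4)
out = X @ W + b                       # 'linear' activation is the identity
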
def get_updates(self, loss, params, t=10):
    grads = self.get_gradients(loss, params)
    # clip each gradient to an L2 norm of at most t before adapting
    grads_clipped = [clip_l2(g, t) for g in grads]

    accumulators = [shared_zeros(p.get_value().shape) for p in params]
    delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
    self.updates = []

    for p, g, a, d_a in zip(params, grads_clipped, accumulators, delta_accumulators):
        new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
        self.updates.append((a, new_a))

        # use the new accumulator and the *old* delta accumulator
        delta = -g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

        new_p = p + self.lr * delta
        self.updates.append((p, new_p))

        # update delta accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * delta ** 2
        self.updates.append((d_a, new_d_a))
    return self.updates

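# clip_l2 is not defined in these snippets. A common reading is global L2-norm
# clipping: rescale g so its norm never exceeds the threshold t. This is a
# hypothetical NumPy stand-in, not the source's implementation.
import numpy as np

def clip_l2(g, t):
    norm = np.sqrt(np.sum(g ** 2))
    return g * (t / np.maximum(norm, t))  # identity when norm <= t

print(clip_l2(np.array([30.0, 40.0]), 10.0))  # norm 50 -> rescaled to [6., 8.]
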
def init_params(self):
    # input-to-hidden and hidden-to-hidden weights for the four LSTM gates,
    # stacked side by side
    self.W = np.concatenate([
        ortho_weight((self.in_dim, self.h_dim)),
        ortho_weight((self.in_dim, self.h_dim)),
        ortho_weight((self.in_dim, self.h_dim)),
        ortho_weight((self.in_dim, self.h_dim))
    ], axis=1)
    self.W = sharedX(self.W)
    self.U = np.concatenate([
        ortho_weight((self.h_dim, self.h_dim)),
        ortho_weight((self.h_dim, self.h_dim)),
        ortho_weight((self.h_dim, self.h_dim)),
        ortho_weight((self.h_dim, self.h_dim))
    ], axis=1)
    self.U = sharedX(self.U)
    self.b = shared_zeros((4 * self.h_dim,))

    # attention params
    # e_i = Ua . tanh(Wpc.dot(context) + Uph.dot(h_tm1) + bc)
    self.Wpc = self.init((self.ctx_dim, self.pctx_dim))
    self.Uph = self.init((self.h_dim, self.pctx_dim))
    self.bc = shared_zeros((self.pctx_dim,))
    self.Ua = self.init((self.pctx_dim, 1))
    self.ba = shared_zeros((1,))
    self.Wc = self.init((self.ctx_dim, self.h_dim * 4))

    if self.selector:
        # a scalar gate on the context; with shape (h_dim, h_dim) this could
        # instead select different features for different samples
        self.Wsel = self.init((self.h_dim, 1))
        self.bsel = shared_zeros((1,))
    self.pack_params()

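# Not from the source: a NumPy sketch of the attention score described in the
# comment above, e_i = Ua . tanh(Wpc.dot(ctx_i) + Uph.dot(h_tm1) + bc) + ba,
# followed by a softmax. All dimensions are illustrative assumptions.
import numpy as np

n_ctx, ctx_dim, h_dim, pctx_dim = 6, 4, 5, 3
rng = np.random.RandomState(0)
context = rng.randn(n_ctx, ctx_dim)  # one annotation vector per location
h_tm1 = rng.randn(h_dim)
Wpc, Uph = rng.randn(ctx_dim, pctx_dim), rng.randn(h_dim, pctx_dim)
bc = np.zeros(pctx_dim)
Ua, ba = rng.randn(pctx_dim, 1), np.zeros(1)

pre = np.tanh(context @ Wpc + h_tm1 @ Uph + bc)    # (n_ctx, pctx_dim)
e = (pre @ Ua).ravel() + ba                        # unnormalized energies
alpha = np.exp(e - e.max()); alpha /= alpha.sum()  # softmax attention weights
ctx_t = alpha @ context                            # expected context vector
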
def get_updates(self, params, cost):
    grads = self.get_gradients(cost, params)
    lr = self.lr - self.decay * self.iterations  # linear learning-rate decay
    updates = [(self.iterations, self.iterations + 1.)]

    for p, g in zip(params, grads):
        m = shared_zeros(p.get_value().shape)  # momentum
        v = self.momentum * m - lr * g         # velocity
        updates.append((m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v
        updates.append((p, new_p))
    return updates

def get_updates(self, params, cost):
    grads = self.get_gradients(cost, params)
    lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))  # inverse-time decay
    updates = [(self.iterations, self.iterations + 1.)]

    for p, g in zip(params, grads):
        m = shared_zeros(p.get_value().shape)  # momentum
        v = self.momentum * m - lr * g         # velocity
        updates.append((m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v
        updates.append((p, new_p))
    return updates

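# Not from the source: one SGD step with classical momentum and the Nesterov
# variant used above, in NumPy. lr/momentum and the toy values are
# illustrative; m is the velocity state carried between steps.
import numpy as np

lr, momentum = 0.01, 0.9
p, g = np.array([1.0, 2.0]), np.array([0.5, -0.5])
m = np.zeros_like(p)                    # velocity state

v = momentum * m - lr * g               # new velocity
p_classical = p + v                     # standard momentum step
p_nesterov = p + momentum * v - lr * g  # Nesterov lookahead step
m = v                                   # carry velocity to the next step
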
def __init__(self, input_dim, output_dim,
             init='uniform', inner_init='orthogonal',
             activation='sigmoid', weights=None,
             truncate_gradient=-1, return_sequences=False):
    self.init = initializations.get(init)
    self.inner_init = initializations.get(inner_init)
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.truncate_gradient = truncate_gradient
    self.activation = activations.get(activation)
    self.return_sequences = return_sequences
    self.input = T.matrix()

    self.W = self.init((self.input_dim, self.output_dim))
    self.U = self.inner_init((self.output_dim, self.output_dim))
    self.b = shared_zeros((self.output_dim,))
    self.params = [self.W, self.U, self.b]

    if weights is not None:
        self.set_weights(weights)

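# Not from the source: the recurrence this simple RNN presumably computes,
# h_t = activation(x_t.dot(W) + h_tm1.dot(U) + b), sketched in NumPy with the
# default sigmoid. Shapes and the sequence length are illustrative.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
input_dim, output_dim = 3, 4
W = rng.uniform(-0.05, 0.05, (input_dim, output_dim))
U = rng.uniform(-0.05, 0.05, (output_dim, output_dim))
b = np.zeros(output_dim)

h = np.zeros(output_dim)
for x_t in rng.randn(5, input_dim):  # a length-5 input sequence
    h = sigmoid(x_t @ W + h @ U + b)
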
def __init__(self, nb_filter, stack_size, nb_row, nb_col,
             init='uniform', activation='linear', weights=None,
             image_shape=None, border_mode='valid', subsample=(1, 1)):
    self.init = initializations.get(init)
    self.activation = activations.get(activation)
    self.subsample = subsample
    self.border_mode = border_mode
    self.image_shape = image_shape
    self.input = T.tensor4()

    self.W_shape = (nb_filter, stack_size, nb_row, nb_col)
    self.W = self.init(self.W_shape)
    self.b = shared_zeros((nb_filter,))
    self.params = [self.W, self.b]

    if weights is not None:
        self.set_weights(weights)

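# Not from the source: the output spatial size implied by border_mode and
# subsample for the (nb_filter, stack_size, nb_row, nb_col) kernel above,
# assuming Theano-style 'valid'/'full' convolution semantics.
def conv_output_length(input_length, filter_size, border_mode, stride):
    if border_mode == 'valid':
        length = input_length - filter_size + 1
    elif border_mode == 'full':
        length = input_length + filter_size - 1
    else:
        raise ValueError(border_mode)
    return (length + stride - 1) // stride  # ceil division for strides > 1

print(conv_output_length(28, 3, 'valid', 1))  # -> 26
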
def get_updates(self, loss, params, t=10):
    grads = self.get_gradients(loss, params)
    # clip each gradient to an L2 norm of at most t before the update
    grads_clipped = [clip_l2(g, t) for g in grads]

    lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
    self.updates = [(self.iterations, self.iterations + 1.)]

    for p, g in zip(params, grads_clipped):
        m = shared_zeros(p.get_value().shape)  # momentum
        v = self.momentum * m - lr * g         # velocity
        self.updates.append((m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v
        self.updates.append((p, new_p))
    return self.updates

def init_params(self):
    # stacked input-to-hidden and hidden-to-hidden weights for the four LSTM gates
    self.W = np.concatenate([
        ortho_weight((self.in_dim, self.h_dim)),
        ortho_weight((self.in_dim, self.h_dim)),
        ortho_weight((self.in_dim, self.h_dim)),
        ortho_weight((self.in_dim, self.h_dim))
    ], axis=1)
    self.W = sharedX(self.W)
    self.U = np.concatenate([
        ortho_weight((self.h_dim, self.h_dim)),
        ortho_weight((self.h_dim, self.h_dim)),
        ortho_weight((self.h_dim, self.h_dim)),
        ortho_weight((self.h_dim, self.h_dim))
    ], axis=1)
    self.U = sharedX(self.U)
    self.b = shared_zeros((4 * self.h_dim,))
    self.pack_params()

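# ortho_weight is not defined in these snippets. A common recipe (hypothetical
# stand-in, not the source's implementation) draws a Gaussian matrix and keeps
# an orthonormal factor of its SVD.
import numpy as np

def ortho_weight(shape):
    a = np.random.randn(*shape)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    return u if u.shape == shape else v  # pick the factor matching the shape
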
def zero(shape):
    return shared_zeros(shape)

def __init__(self, input_shape):
    self.alphas = shared_zeros(input_shape)  # one learnable slope per unit
    self.params = [self.alphas]

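# Not from the source: the PReLU forward pass these per-unit alphas presumably
# parameterize, identity for positive inputs and a learned slope otherwise.
# The input values are illustrative.
import numpy as np

alphas = np.full(4, 0.25)  # one learned slope per unit
X = np.array([[-2.0, -0.5, 0.5, 2.0]])
out = np.maximum(X, 0) + alphas * np.minimum(X, 0)  # -> [[-0.5, -0.125, 0.5, 2.0]]
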
def init_params(self):
    self.W = self.init((self.in_dim, self.out_dim))
    self.b = shared_zeros((self.out_dim,))
    self.pack_params()