def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, **kwargs):
    super(Adam, self).__init__(**kwargs)
    self.lr = sharedX(learning_rate)
    self.iter = sharedX(0)
    self.beta_1 = sharedX(beta_1)
    self.beta_2 = sharedX(beta_2)
    self.eps = sharedX(eps)
def __init__(self, learning_rate=0.9, momentum=0., k=1.0, lr_decay_factor=0.9, decay_batch=10000):
    """
    dx = -learning_rate / sqrt(k + sum(gparam^2)) * gparam
    ref : Chris Dyer : Notes on AdaGrad
    """
    self.lr = sharedX(learning_rate)
    self.mom = sharedX(momentum)
    self.k = sharedX(k)
    # decay settings, stored the same way as in the SGD constructor below
    self.batch = sharedX(0)
    self.decay_batch = sharedX(decay_batch)
    self.lr_decay_factor = asfloatX(lr_decay_factor)
def __init__(self, learning_rate=0.9, momentum=0., k=1.0, **kwargs):
    """
    dx = -learning_rate / sqrt(k + sum(gparam^2)) * gparam
    ref : Chris Dyer : Notes on AdaGrad
    """
    super(AdaGrad, self).__init__(**kwargs)
    self.lr = sharedX(learning_rate)
    self.mom = sharedX(momentum)
    self.k = sharedX(k)
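# The constructor above only stores the AdaGrad hyperparameters; the rule in its
# docstring is applied in the optimizer's update() method. A minimal sketch of what
# that method could look like, assuming the (deltas, params, gparams) interface and
# the decay() hook used elsewhere in this collection (the accumulator name `acc`
# is illustrative, not the library's own):
def update(self, deltas, params, gparams):
    updates = []
    for delta, param, gparam in zip(deltas, params, gparams):
        acc = sharedX(param.get_value() * 0.)   # running sum of squared gradients
        acc_t = acc + gparam ** 2
        # dx = -lr / sqrt(k + sum(gparam^2)) * gparam, with a momentum term;
        # only delta is updated here, mirroring the plain SGD update below.
        updates.append((acc, acc_t))
        updates.append((delta, self.mom * delta
                        - self.lr / T.sqrt(self.k + acc_t) * gparam))
    updates += self.decay()
    return updates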
def __init__(self, eps=1e-6, rho=0.95):
    """
    dx_t = -rms(dx_{t-1}) / rms(gparam_t) * gparam_t
    rms(dx) = sqrt(E_t(dx^2) + eps)
    E_t(dx^2) = rho E_{t-1}(dx^2) + (1-rho) dx^2
    ref : Matthew D. Zeiler: ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
    """
    self.eps = sharedX(eps)
    self.rho = sharedX(rho)
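# As with AdaGrad, only the hyperparameters are stored above; a minimal sketch of an
# update() method applying the two running averages from the docstring, assuming the
# same (deltas, params, gparams) interface as the Adam update below (accumulator
# names E_g2 / E_dx2 are illustrative):
def update(self, deltas, params, gparams):
    updates = []
    for delta, param, gparam in zip(deltas, params, gparams):
        E_g2 = sharedX(param.get_value() * 0.)    # running average of gparam^2
        E_dx2 = sharedX(param.get_value() * 0.)   # running average of dx^2
        E_g2_t = self.rho * E_g2 + (1 - self.rho) * gparam ** 2
        dx = -T.sqrt(E_dx2 + self.eps) / T.sqrt(E_g2_t + self.eps) * gparam
        E_dx2_t = self.rho * E_dx2 + (1 - self.rho) * dx ** 2
        updates.append((E_g2, E_g2_t))
        updates.append((E_dx2, E_dx2_t))
        updates.append((param, param + dx))
    return updates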
def update(self, deltas, params, gparams):
    t = self.iter + 1
    # bias-corrected learning rate for step t
    lr_t = self.lr * T.sqrt(1 - self.beta_2**t) / (1 - self.beta_1**t)
    updates = []
    for delta, param, gparam in zip(deltas, params, gparams):
        m = sharedX(param.get_value() * 0.)
        v = sharedX(param.get_value() * 0.)
        m_t = (self.beta_1 * m) + (1 - self.beta_1) * gparam
        v_t = (self.beta_2 * v) + (1 - self.beta_2) * gparam**2
        param_t = param - lr_t * m_t / (T.sqrt(v_t) + self.eps)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((param, param_t))
    # advance the timestep so the bias correction changes across updates
    updates.append((self.iter, t))
    updates += self.decay()
    return updates
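# For context: the (shared variable, new expression) pairs returned by update() are
# meant to be passed to theano.function as its `updates` argument. A minimal,
# hypothetical wiring with one weight matrix and a squared-error cost -- the model
# setup here is illustrative and not part of the library:
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
W = theano.shared(np.zeros((3, 2), dtype=floatX), name='W')
delta_W = theano.shared(np.zeros((3, 2), dtype=floatX), name='delta_W')
x = T.matrix('x')
y = T.matrix('y')
cost = T.mean((T.dot(x, W) - y) ** 2)
gW = T.grad(cost, W)

adam = Adam(learning_rate=0.001)
train = theano.function([x, y], cost, updates=adam.update([delta_W], [W], [gW]))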
def __call__(self, dim, name='W', **kwargs):
    if len(dim) != 2 or dim[0] != dim[1]:
        raise Exception("Identity matrix initialization can only be used for 2D square matrices")
    else:
        return sharedX(self.scale * np.identity(dim[0]), name=name, **kwargs)
def decay(self):
    # symbolically reset the batch counter and decay the learning rate once
    # decay_batch batches have been seen
    updates = []
    new_batch = ifelse(T.gt(self.batch, self.decay_batch), sharedX(0), self.batch + 1)
    new_lr = ifelse(T.gt(self.batch, self.decay_batch), self.lr * self.lr_decay_factor, self.lr)
    updates.append((self.batch, new_batch))
    updates.append((self.lr, new_lr))
    return updates
def update(self, delta, gparam):
    # T.gt on shared variables builds a symbolic expression, so the decay check is
    # done on concrete values instead; the counter is reset in place.
    self.batch.set_value(self.batch.get_value() + 1)
    if self.batch.get_value() > self.decay_batch.get_value():
        self.lr.set_value(self.lr.get_value() * self.lr_decay_factor)
        self.batch.set_value(asfloatX(0))
    return [(delta, self.mom * delta - self.lr * gparam)]
def __init__(self, input_dim, output_dim, init=UniformWeight(scale=0.1), weights=None):
    self.input_dim = input_dim
    self.output_dim = output_dim
    if weights is None:
        self.W = init((input_dim, output_dim))
    else:
        self.W = sharedX(weights)
    self.params = [self.W]
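# The Linear layer above only registers its weight matrix; a minimal sketch of the
# matching forward pass, assuming the _train_fprop(state_below) convention used by
# the layer-stats method at the end of this collection:
def _train_fprop(self, state_below):
    # affine map without a bias term, since only W is registered as a parameter
    return T.dot(state_below, self.W)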
def __init__(self, dim, alpha=0.2):
    '''
    y = wx + b
    if y > 0 then z = y else z = alpha * y
    return z
    alpha: slope applied to negative inputs, learned by backpropagation
    '''
    self.alpha = sharedX(np.ones(dim) * alpha, name='PRELU_gradient')
    self.params = [self.alpha]
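# A minimal sketch of the PReLU forward pass described in the docstring, assuming
# the same _train_fprop(state_below) convention (the method name is an assumption):
def _train_fprop(self, state_below):
    # identity for positive inputs, learned per-unit slope alpha for negative inputs
    return T.switch(state_below > 0, state_below, self.alpha * state_below)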
def __call__(self, dim, name='W', **kwargs):
    '''
    From Lasagne
    '''
    flat_shape = (dim[0], np.prod(dim[1:]))
    a = np.random.normal(0.0, 1.0, flat_shape)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    # pick the one with the correct shape
    q = u if u.shape == flat_shape else v
    q = q.reshape(dim)
    return sharedX(name=name, value=self.scale * q[:dim[0], :dim[1]], borrow=True, **kwargs)
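# Quick illustrative use of the orthogonal initializer. The class name
# OrthogonalWeight and its no-argument constructor are assumptions for this example;
# the near-orthogonality check is only a demonstration:
import numpy as np

init = OrthogonalWeight()
W = init((128, 64), name='W_ortho')
w = W.get_value()
# columns should be orthonormal up to the scale factor
print(np.allclose(w.T.dot(w), (init.scale ** 2) * np.eye(64), atol=1e-5))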
def __call__(self, dim, name='W', **kwargs):
    W_values = np.random.normal(loc=self.mean, scale=self.std, size=dim)
    return sharedX(name=name, value=W_values, borrow=True, **kwargs)
def __init__(self, alpha=0.01):
    self.alpha = sharedX(alpha)
    self.params = []
def __call__(self, dim, name='W', **kwargs):
    fan_in, fan_out = get_fans(dim)
    W_values = np.random.uniform(low=-4 * np.sqrt(6. / (fan_in + fan_out)),
                                 high=4 * np.sqrt(6. / (fan_in + fan_out)),
                                 size=dim)
    return sharedX(name=name, value=W_values, borrow=True, **kwargs)
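# get_fans() is assumed to be a small helper defined elsewhere in the library; a
# common implementation for dense and convolutional weight shapes looks roughly like
# this (a sketch, not necessarily the library's own version):
def get_fans(dim):
    if len(dim) == 2:
        # dense weights: the two axes are fan_in and fan_out
        fan_in, fan_out = dim[0], dim[1]
    else:
        # conv kernels: multiply channels by the receptive-field size
        receptive_field = np.prod(dim[2:])
        fan_in = dim[1] * receptive_field
        fan_out = dim[0] * receptive_field
    return fan_in, fan_out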
def __init__(self, learning_rate=0.01, momentum=0.9, lr_decay_factor=0.9, decay_batch=10000):
    self.lr = sharedX(learning_rate)
    self.mom = sharedX(momentum)
    self.batch = sharedX(0)
    self.decay_batch = sharedX(decay_batch)
    self.lr_decay_factor = asfloatX(lr_decay_factor)
def __call__(self, dim, name='W', **kwargs):
    W_values = np.random.uniform(low=-self.scale, high=self.scale, size=dim)
    return sharedX(name=name, value=W_values, borrow=True, **kwargs)
def __init__(self, learning_rate=0.01, momentum=0.9, **kwargs):
    super(SGD, self).__init__(**kwargs)
    self.lr = sharedX(learning_rate)
    self.mom = sharedX(momentum)
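# A minimal sketch of the matching SGD update() method, following the momentum rule
# from the pre-refactor update above and the decay() hook used by Adam (the exact
# interface is an assumption, not confirmed by this snippet):
def update(self, deltas, params, gparams):
    updates = []
    for delta, param, gparam in zip(deltas, params, gparams):
        new_delta = self.mom * delta - self.lr * gparam
        updates.append((delta, new_delta))
        updates.append((param, param + new_delta))
    updates += self.decay()
    return updates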
def __init__(self, lr_decay_factor=1.0, decay_batch=10000):
    self.batch = sharedX(0)
    self.decay_batch = sharedX(decay_batch)
    self.lr_decay_factor = asfloatX(lr_decay_factor)
def __init__(self, learning_rate=0.01, eps=1e-6, rho=0.9, **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    self.lr = sharedX(learning_rate)
    self.eps = sharedX(eps)
    self.rho = sharedX(rho)
def _layer_stats(self, state_below, layer_output):
    return [('moving_mean', T.mean(self.moving_mean)),
            ('moving_var', T.mean(self.moving_var)),
            ('gamma_mean', T.mean(self.gamma)),
            ('beta_mean', T.mean(self.beta)),
            ('memory', sharedX(self.mem))]