def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b, h, m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b, k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i" % i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i" % i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2:
            last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:, idx:idx+H*m];    idx += H*m;    k_bHm = k_bHm.reshape([b, H, m])
    beta_bH = last_out[:, idx:idx+H];    idx += H
    g_bH = last_out[:, idx:idx+H];       idx += H
    s_bH3 = last_out[:, idx:idx+3*H];    idx += 3*H;    s_bH3 = s_bH3.reshape([b, H, 3])
    gamma_bH = last_out[:, idx:idx+H];   idx += H
    e_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    e_bhm = e_bhm.reshape([b, h, m])
    a_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    a_bhm = a_bhm.reshape([b, h, m])
    y_bp = last_out[:, idx:idx+p];       idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH) + 1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b, H, m)
    assert infer_shape(beta_bH) == (b, H)
    assert infer_shape(g_bH) == (b, H)
    assert infer_shape(s_bH3) == (b, H, 3)
    assert infer_shape(gamma_bH) == (b, H)
    assert infer_shape(e_bhm) == (b, h, m)
    assert infer_shape(a_bhm) == (b, h, m)
    assert infer_shape(y_bp) == (b, p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])

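# Note on the broadcast pattern above: the string "xx,1x" tells cgt.broadcast
# which axes of each operand are full ("x") versus singleton ("1"), so the bias
# row is repeated across the batch. In NumPy terms this is ordinary row
# broadcasting; a minimal standalone illustration (plain NumPy, not cgt code):
import numpy as np
X = np.arange(6.).reshape(2, 3)     # "xx": both axes full
bias = np.array([[10., 20., 30.]])  # "1x": singleton leading axis
assert np.allclose(X + bias, np.array([[10., 21., 32.], [13., 24., 35.]]))
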
def make_ntm_initial_states(opt):
    n, m, h, b = opt.n, opt.m, opt.h, opt.b
    M_1nm = cgt.shared(.1*nr.randn(1, n, m))
    winit_1Hn = cgt.shared(.1*nr.rand(1, 2*h, n))
    winit_1Hn = sum_normalize2(cgt.exp(winit_1Hn))
    rinit_1hm = cgt.shared(np.zeros((1, h, m)))
    return [cgt.repeat(arr, b, axis=0) for arr in (M_1nm, winit_1Hn, rinit_1hm)]

def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX

    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")

    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)

    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])

def test_devices():
    N = 10
    K = 3
    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")

    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)

    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):
        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")
        print "bval", bval

        ypred = cgt.dot(cgt.square(X_nk), w_k) + b
        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])

        outputs = [err] + g
        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(results[0], np.sin(np.square(Xval).dot(wval) + bval - yval).sum())

def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """
    Adadelta updates

    Each step is scaled by the ratio of the RMS of accumulated update steps to
    the RMS of accumulated gradients.

    Math:

    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoids division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, param_new),
    (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        update = grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon)
        updates.append((param, param - learning_rate * update))
        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))
    return updates

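# A minimal usage sketch for the updates above (hypothetical toy loss; mirrors
# the pattern the optimizer tests below use with cgt.function):
def demo_adadelta():
    w = cgt.shared(1.0)
    loss = cgt.square(w)  # toy loss with minimum at 0
    do_update = cgt.function([], [], updates=adadelta(loss, [w]))
    for _ in range(10):
        do_update()
    print w.op.get_value()
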
def test_nesterov_momentum():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.nesterov_momentum(f(A, scale) + f(B, scale), [A, B], learning_rate=0.1, mu=0.5)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()
        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())
    assert np.allclose(results, torch_values['nesterov_momentum'])

def test_sgd():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.sgd(f(A, scale) + f(B, scale), [A, B], learning_rate=0.1)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()
        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())
    assert np.allclose(results, torch_values['sgd'])

def test_adadelta():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.adadelta(f(A, scale) + f(B, scale), [A, B])
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()
        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())
    assert np.allclose(results, torch_values['adadelta'])

def test_rmsprop():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.rmsprop(f(A, scale) + f(B, scale), [A, B], learning_rate=0.01)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()
        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())
    assert np.allclose(results, torch_values['rmsprop'])

def momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with momentum

    Math:

    * ``velocity := mu * velocity - learning_rate * grad``
    * ``param := param + velocity``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (param, new_param) and (velocity, new_velocity)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param + new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))
    return updates

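# To make the recurrence above concrete: a standalone NumPy trace of the same
# two rules on a fixed gradient (pure illustration, no cgt involved):
def demo_momentum_recurrence():
    import numpy as np
    lr, mu = 0.1, 0.9
    grad = np.array([1.0, -2.0])              # pretend gradient, held fixed
    param, velocity = np.zeros(2), np.zeros(2)
    for _ in range(3):
        velocity = mu * velocity - lr * grad  # velocity := mu*velocity - lr*grad
        param = param + velocity              # param := param + velocity
    # after 3 steps: velocity == -lr*grad*(1 + mu + mu**2)
    assert np.allclose(velocity, -lr * grad * (1 + mu + mu ** 2))
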
def nesterov_momentum(cost, params, learning_rate, momentum=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:

    * ``velocity_new := momentum * velocity - learning_rate * grad``
    * ``param := momentum * velocity_new + param - learning_rate * grad``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    momentum : float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (param, param_update), (velocity, velocity_update)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        velocity = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        x = momentum * velocity - learning_rate * grad
        updates.append((velocity, x))
        updates.append((param, momentum * x + param - learning_rate * grad))
    return updates

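# Note: this formulation is algebraically identical to the mu-parameterized
# nesterov_momentum further below. With
#   velocity_new = momentum * velocity - learning_rate * grad,
# we have
#   momentum * velocity_new + param - learning_rate * grad
#     = param - momentum * velocity + (1 + momentum) * velocity_new,
# which is exactly the "peeked-ahead" update used there.
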
def parameter(val, name=None, device=None):
    fixed_shape_mask = "all"
    out = cgt.shared(val, name=name, device=device, fixed_shape_mask=fixed_shape_mask)
    out.props["is_parameter"] = True
    return out

def test_update():
    with cgt.scoped_update_config(parallel=True):
        xval = np.array(1.5)
        x = cgt.shared(xval)
        f = cgt.function([], x.sum(), updates=[(x, x + 1)])
        before = x.op.get_value().copy()
        f()
        after = x.op.get_value()
        assert np.allclose(after, before + 1)

def adagrad_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    # Note: despite the name, this follows the Adadelta scheme above -- it
    # accumulates decayed squared gradients *and* decayed squared updates.
    grads = cgt.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        delta_accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        update = grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon)
        updates.append((param, param - stepsize * update))
        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))
    return updates

def test_update():
    with cgt.scoped_update_config(parallel=True, backend="native"):
        xval = np.array(1.5)
        x = cgt.shared(xval)
        f = cgt.function([], x.sum(), updates=[(x, x + 1)])
        before = x.op.get_value().copy()
        f()
        after = x.op.get_value()
        assert np.allclose(after, before + 1)

def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates

def shared(val, name=None, broadcastable=None, borrow=False):
    if is_theano():
        return theano.shared(val, name=name, broadcastable=broadcastable)
    elif is_cgt():
        return cgt.shared(val, name=name)
    else:
        var = tf.Variable(val.astype(floatX), name=name)
        var._tensorfuse_shape_template = val.shape
        var._tensorfuse_shared = True
        compat.tf_add_blank_var(var)
        return var

def __init__(self, xdim, args, dec="bernoulli"):
    self.xdim = xdim
    self.hdim = args.hdim
    self.zdim = args.zdim
    self.lmbda = args.lmbda  # weight decay coefficient * 2
    self.x = cgt.matrix("x", dtype=cgt.floatX)
    self.eps = cgt.matrix("eps", dtype=cgt.floatX)

    self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
    if dec == "bernoulli":
        # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
        self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    elif dec == "gaussian":
        self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    else:
        raise RuntimeError("unrecognized decoder %s" % dec)

    self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
    self.params = self.enc_mlp.params + self.dec_mlp.params
    # L2 regularization
    self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params]
    self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params]

    # XXX replace w/ adagrad update from nn
    ADAGRAD_EPS = 1e-10  # for stability
    self.updates = [
        (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS))
        for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
    ]
    self.updates += [
        (gaccum, gaccum + cgt.square(gparam))
        for gaccum, gparam in zip(self.gaccums, self.gparams)
    ]

    self.train = cgt.function([self.x, self.eps], self.cost, updates=self.updates)
    self.test = cgt.function([self.x, self.eps], self.cost, updates=None)
    # can be used for semi-supervised learning, for example
    self.encode = cgt.function([self.x, self.eps], self.enc_mlp.out)

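# A hedged usage sketch for the class above. `model`, `args`, and the data
# batches are hypothetical names; `eps` supplies the reparameterization noise
# and must match the (batch_size, zdim) shape expected by the encoder:
def demo_vae_step(model, args, x_batch, x_val_batch):
    eps = np.random.randn(args.batch_size, args.zdim).astype(cgt.floatX)
    cost = model.train(x_batch, eps)           # one gradient step
    fresh = np.random.randn(args.batch_size, args.zdim).astype(cgt.floatX)
    val_cost = model.test(x_val_batch, fresh)  # evaluate, no updates
    z = model.encode(x_val_batch, fresh)       # latent codes
    return cost, val_cost, z
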
def test_array_wrapper():
    xval = np.zeros(10)
    x = cgt.shared(xval)
    f = cgt.function([], [], updates=[(x, x + 1)])
    f()
    g = cgt.function([], x.sum())
    assert np.allclose(x.op.get_value(), xval + 1)
    xval2 = np.arange(10)
    x.op.set_value(xval2)
    print x.op.get_value()
    assert np.allclose(x.op.get_value(), xval2)
    assert g() == xval2.sum()
    f()
    assert np.allclose(x.op.get_value(), xval2 + 1)
    assert g() == (xval2 + 1).sum()

def __init__(self, input, n_in, n_out, W=None, b=None, activation=cgt.tanh, prefix=""):
    self.n_in = n_in
    self.n_out = n_out

    if W is None:
        # XXX replace with nn init
        W_values = np.asarray(
            rng.uniform(
                low=-np.sqrt(6. / (n_in + n_out)),
                high=np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=cgt.floatX
        )
        if activation == cgt.sigmoid:
            W_values *= 4
        W = cgt.shared(W_values, name=prefix + "_W")

    if b is None:
        b_values = np.zeros((n_out,), dtype=cgt.floatX)
        b = cgt.shared(b_values, name=prefix + "_b")

    self.W = W
    self.b = b

    # XXX broadcast api may change
    lin_output = cgt.broadcast("+", cgt.dot(input, self.W), cgt.dimshuffle(self.b, ["x", 0]), "xx,1x")
    self.output = lin_output if activation is None else activation(lin_output)
    # parameters of the model
    self.params = [self.W, self.b]

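# The uniform range above is the Glorot & Bengio (2010) initialization,
# limit = sqrt(6 / (n_in + n_out)), with the classic 4x widening for sigmoid
# units. A standalone NumPy equivalent of that init block:
def glorot_uniform(n_in, n_out, rng=np.random, sigmoid=False):
    limit = np.sqrt(6. / (n_in + n_out))
    W = rng.uniform(low=-limit, high=limit, size=(n_in, n_out))
    return 4 * W if sigmoid else W
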
def test_incsubtensor0():
    # First let's test fancy slice along zeroth dimension
    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)

    inds = cgt.vector(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, inds, inc)}
    f = cgt.function([inds, inc], [], updates=updates)
    f([1, 2, 4], incval)
    assert np.allclose(
        W.op.get_value(),
        np.array([[0., 0., 0.],
                  [0., 1., 2.],
                  [3., 4., 5.],
                  [0., 0., 0.],
                  [6., 7., 8.]]))

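# For reference, the fancy-index form of inc_subtensor matches NumPy's
# unbuffered in-place add, which is how the expected array above is produced:
def demo_incsubtensor0_numpy():
    W = np.zeros((5, 3))
    np.add.at(W, [1, 2, 4], np.arange(9).reshape(3, 3))
    assert np.allclose(W[[1, 2, 4]], np.arange(9).reshape(3, 3))
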
def runtest(backend, precision):
    with cgt.scoped_update_config(backend=backend, precision=precision):
        xval = np.zeros(10)
        x = cgt.shared(xval)
        f = cgt.function([], [], updates=[(x, x + 1)])
        f()
        g = cgt.function([], x.sum())
        assert np.allclose(x.op.get_value(), xval + 1)
        xval2 = np.arange(10)
        x.op.set_value(xval2)
        print x.op.get_value()
        assert np.allclose(x.op.get_value(), xval2)
        assert g() == xval2.sum()
        f()
        assert np.allclose(x.op.get_value(), xval2 + 1)
        assert g() == (xval2 + 1).sum()

def __init__(self, input_sizes, mem_size, name_prefix=""):
    # update gate (z)
    Wiz_vals = [normc(randnf(input_size, mem_size)) for input_size in input_sizes]
    self.Wizs = [cgt.shared(Wiz_val, name=name_prefix + "Wiz") for Wiz_val in Wiz_vals]
    Wmz_val = normc(randnf(mem_size, mem_size))
    self.Wmz = cgt.shared(Wmz_val, name=name_prefix + "Wmz")
    bz = np.zeros((1, mem_size), cgt.floatX)
    self.bz = cgt.shared(bz, name=name_prefix + "bz")

    # reset gate (r)
    Wir_vals = [normc(randnf(input_size, mem_size)) for input_size in input_sizes]
    self.Wirs = [cgt.shared(Wir_val, name=name_prefix + "Wir") for Wir_val in Wir_vals]
    Wmr_val = normc(randnf(mem_size, mem_size))
    self.Wmr = cgt.shared(Wmr_val, name=name_prefix + "Wmr")
    br = np.zeros((1, mem_size), cgt.floatX)
    self.br = cgt.shared(br, name=name_prefix + "br")

    # candidate memory (m)
    Wim_vals = [normc(randnf(input_size, mem_size)) for input_size in input_sizes]
    self.Wims = [cgt.shared(Wim_val, name=name_prefix + "Wim") for Wim_val in Wim_vals]
    Wmm_val = normc(np.eye(mem_size, dtype=cgt.floatX))
    self.Wmm = cgt.shared(Wmm_val, name=name_prefix + "Wmm")
    bm = np.zeros((1, mem_size), cgt.floatX)
    self.bm = cgt.shared(bm, name=name_prefix + "bm")

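# These parameters line up with a standard GRU step. A hedged sketch of the
# forward computation they presumably support (`inputs` is a list matching
# input_sizes, M is the previous memory; not the original cell code):
def gru_step(cell, inputs, M):
    def gate(Wis, Wm, b, prev):
        pre = sum(x.dot(W) for x, W in zip(inputs, Wis)) + prev.dot(Wm)
        # biases are stored as (1, mem_size), so broadcast across the batch
        return cgt.broadcast("+", pre, b, "xx,1x")
    z = cgt.sigmoid(gate(cell.Wizs, cell.Wmz, cell.bz, M))    # update gate
    r = cgt.sigmoid(gate(cell.Wirs, cell.Wmr, cell.br, M))    # reset gate
    m = cgt.tanh(gate(cell.Wims, cell.Wmm, cell.bm, r * M))   # candidate memory
    return (1 - z) * M + z * m                                # new memory
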
def test_lrn():
    if not get_compile_info()["CGT_ENABLE_CUDA"]:
        raise SkipTest("Skipping because CUDA disabled")

    nr.seed(0)
    Xval = nr.randn(4, 8, 16, 16)
    X = cgt.shared(Xval, name="X", fixed_shape_mask="all")
    # X = cgt.tensor4(name='X')
    y = cross_channel_lrn(X, localsize=4, alpha=.1, beta=.5)
    f = cgt.function([], y)
    print f().sum()
    print f().sum()
    print f().sum()
    assert np.isfinite(f().sum())
    # print f(Xval).sum()
    a = nr.rand(*cgt.infer_shape(y))
    loss = (y * a).sum()
    gradcheck_model(loss, [X], eps=1e-5)

def test_incsubtensor2():
    W = cgt.shared(np.zeros((5, 3)), name="W")
    i0 = cgt.vector(dtype='i8')
    i1 = cgt.vector(dtype='i8')
    inc = cgt.vector()
    updates2 = {W: cgt.inc_subtensor(W, (i0, i1), inc)}
    f2 = cgt.function([i0, i1, inc], [], updates=updates2)
    f2([0, 1, 2, 2], [0, 1, 2, 2], [1, 2, 3, 4])
    assert np.allclose(
        W.op.get_value(),
        np.array([[1., 0., 0.],
                  [0., 2., 0.],
                  [0., 0., 7.],
                  [0., 0., 0.],
                  [0., 0., 0.]]))

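# The coordinate-pair form accumulates at repeated (row, col) positions, which
# is why the (2, 2) entry above ends up as 3 + 4 = 7. NumPy counterpart:
def demo_incsubtensor2_numpy():
    W = np.zeros((5, 3))
    np.add.at(W, ([0, 1, 2, 2], [0, 1, 2, 2]), [1., 2., 3., 4.])
    assert W[2, 2] == 7.  # repeated index accumulates
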
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates

    Scales the step for each parameter by the RMS of a decaying moving average
    of its squared gradients. See [1].

    Math:

    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoids division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, param_new),
    (accumulated_RMS_grads, accumulated_RMS_grads_new)

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex
           optimization. arXiv:1502.04390, http://arxiv.org/abs/1502.04390
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - learning_rate * grad / cgt.sqrt(accu_new + epsilon)))
    return updates

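# A one-step numeric check of the rule above (scalar parameter, fixed gradient;
# illustrates the math only, not the cgt graph):
def demo_rmsprop_step():
    import numpy as np
    rho, lr, eps = 0.9, 1.0, 1e-6
    grad, accu, param = 2.0, 0.0, 5.0
    accu_new = rho * accu + (1 - rho) * grad ** 2  # = 0.4
    param_new = param - lr * grad / np.sqrt(accu_new + eps)
    assert np.isclose(accu_new, 0.4)
    assert np.isclose(param_new, 5.0 - 2.0 / np.sqrt(0.4 + 1e-6))
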
def test_incsubtensor1():
    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)

    start = cgt.scalar(dtype='i8')
    stop = cgt.scalar(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, slice(start, stop), inc)}
    f = cgt.function([start, stop, inc], [], updates=updates)
    f(0, 3, incval)
    assert np.allclose(
        W.op.get_value(),
        np.array([[0., 1., 2.],
                  [3., 4., 5.],
                  [6., 7., 8.],
                  [0., 0., 0.],
                  [0., 0., 0.]]))

def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates

    The learning rate is scaled by dividing it by the square root of the
    sum of accumulated squared gradients.

    Math:

    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoids division close to zero. Small float.

    Returns
    -------
    list of tuples of the form (param, param_new),
    (accumulated_grads, accumulated_grads_new)

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))
    return updates

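# Because accu only grows, Adagrad's effective step size decays monotonically;
# a small standalone trace makes that visible (fixed unit gradient):
def demo_adagrad_decay():
    import numpy as np
    lr, eps = 1.0, 1e-6
    accu, grad = 0.0, 1.0
    steps = []
    for _ in range(4):
        accu = accu + grad ** 2
        steps.append(lr * grad / np.sqrt(accu + eps))
    # 1, 1/sqrt(2), 1/sqrt(3), 1/2 -- strictly decreasing
    assert all(a > b for a, b in zip(steps, steps[1:]))
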
def nesterov_momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:

    * ``new_velocity := mu * velocity - learning_rate * grad``
    * ``param := param - mu * velocity + (1 + mu) * new_velocity``

    See http://arxiv.org/abs/1212.0901v2, first part of eq 7.
    At each step we return the "peeked-ahead" parameters.

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (param, param_update), (velocity, velocity_update)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param - mu * velocity + (mu + 1) * new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))
    return updates

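# The "peeked-ahead" claim can be checked numerically: on f(x) = x**2/2 the
# iterate below stays exactly mu*velocity ahead of classic Nesterov momentum
# (standalone NumPy, no cgt):
def demo_peeked_ahead_equivalence():
    import numpy as np
    mu, lr = 0.9, 0.1
    grad = lambda x: x                   # gradient of f(x) = x**2 / 2
    theta, v = 1.0, 0.0                  # classic Nesterov state
    phi, w = 1.0, 0.0                    # peeked-ahead state (equal while v = 0)
    for _ in range(20):
        v = mu * v - lr * grad(theta + mu * v)
        theta = theta + v
        w_new = mu * w - lr * grad(phi)  # gradient taken at the stored params
        phi = phi - mu * w + (1 + mu) * w_new
        w = w_new
        assert np.isclose(phi, theta + mu * v)
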
def init_weights(*shape):
    return cgt.shared(np.random.randn(*shape) * 0.01, fixed_shape_mask='all')

    # (fragment of the caffe-to-cgt layer conversion loop; the Data branch opens above)
        crop_size = tp.crop_size
        chans = len(tp.mean_value)
        dp = layer.data_param
        batch_size = dp.batch_size
        output = [cgt.tensor(dtype=cgt.floatX, ndim=4, name=layer.name,
                             fixed_shape=(batch_size, chans, crop_size, crop_size)),
                  cgt.tensor(dtype='i8', ndim=2, name=layer.name,
                             fixed_shape=(batch_size, 1))]
    elif layer.type == "Convolution":
        X = inputs[0]
        param = layer.convolution_param
        kh, kw = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
            else (param.kernel_h, param.kernel_w)
        nchanin = infer_shape(X)[0]
        Wshape = (param.num_output, nchanin, kh, kw)
        Wname = layer.param[0].name or layer.name + ":W"
        Wval = np.empty(Wshape, dtype=cgt.floatX)
        W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
        bshape = (1, param.num_output, 1, 1)
        bname = layer.param[1].name or layer.name + ":b"
        bval = np.empty(bshape, dtype=cgt.floatX)
        b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
        sh, sw = (param.stride, param.stride) if param.HasField("stride")\
            else (param.stride_h, param.stride_w)
        output = [cgt.broadcast("+", nn.conv2d(X, W, subsample=(sh, sw)), b, "xxxx,1x11")]
    elif layer.type == "Pooling":
        param = layer.pooling_param
        X = inputs[0]
        pool_type = {param.MAX: "max", param.AVE: "mean"}[param.pool]
        height_in, width_in = infer_shape(X)[2:4]
        kernel = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
            else (param.kernel_h, param.kernel_w)
        stride = (param.stride, param.stride) if param.HasField("stride")\

# split data
X_train, X_test, Y_train, Y_test = train_test_split(data, targets, test_size=0.2, random_state=0)

# hyperparams
#
# Be careful when setting alpha! If it's too large
# here the cost will blow up.
alpha = 1e-7
epochs = 100

# Linear regression model
np.random.seed(0)
X = cgt.matrix("X", fixed_shape=(None, nfeats))
Y = cgt.vector("Y")
w = cgt.shared(np.random.randn(nfeats) * 0.01)

# prediction
ypred = cgt.dot(X, w)

# cost
cost = cgt.square(Y - ypred).mean()

# derivative with respect to w
dw = cgt.grad(cost=cost, wrt=w)

# updates
updates = [(w, w - dw * alpha)]

# training function
trainf = cgt.function(inputs=[X, Y], outputs=[], updates=updates)
# cost function, no updates
costf = cgt.function(inputs=[X, Y], outputs=cost)

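# With trainf and costf compiled, the training loop is a straightforward sketch
# using the split arrays above (printing is just for monitoring):
for i in xrange(epochs):
    trainf(X_train, Y_train)
    print i, costf(X_train, Y_train), costf(X_test, Y_test)
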