def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")
    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()
    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])
    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)
    self.pc = ParamCollection(params)

def test_devices():
    N = 10
    K = 3
    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")
    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)
    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):
        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")
        print "bval", bval
        ypred = cgt.dot(cgt.square(X_nk), w_k) + b
        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err] + g
        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(results[0],
                           np.sin(np.square(Xval).dot(wval) + bval - yval).sum())

def momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with momentum

    Math:
    * ``velocity := mu * velocity - learning_rate * grad``
    * ``param := param + velocity``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (velocity, new_velocity) and (param, new_param)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param + new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))
    return updates

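# Usage sketch (illustrative, not part of the original sources): the optimizer
# helpers in this file return a list of (shared_variable, new_expression) pairs
# which cgt.function applies via its `updates` argument. The tiny least-squares
# model below is hypothetical and only demonstrates the calling pattern.
def _momentum_usage_example():
    np.random.seed(0)
    Xval = np.random.randn(100, 5).astype(cgt.floatX)
    yval = Xval.dot(np.arange(5)).astype(cgt.floatX)
    X = cgt.matrix("X")
    y = cgt.vector("y")
    w = cgt.shared(np.zeros(5, dtype=cgt.floatX), name="w")
    loss = cgt.sum(cgt.square(cgt.dot(X, w) - y))
    updates = momentum(loss, [w], learning_rate=1e-3, mu=0.9)
    train_step = cgt.function([X, y], loss, updates=updates)
    for _ in xrange(200):
        train_step(Xval, yval)  # each call applies one momentum SGD step
    return w.op.get_value()     # should approach [0, 1, 2, 3, 4]
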
def nesterov_momentum(cost, params, learning_rate, momentum=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``velocity := momentum * velocity - learning_rate * grad``
    * ``param := momentum * velocity + param - learning_rate * grad``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    momentum : float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (velocity, new_velocity) and (param, new_param)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        velocity = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        x = momentum * velocity - learning_rate * grad
        updates.append((velocity, x))
        updates.append((param, momentum * x + param - learning_rate * grad))
    return updates

def test_cudnn():
    with cgt.scoped_update_config(precision="double", backend="native"):
        if not get_compile_info()["CGT_ENABLE_CUDNN"]:
            raise SkipTest("CUDNN not enabled. Skipping this test")
        Xval = nr.randn(2, 3, 19, 18)
        Wval = nr.randn(5, 3, 3, 3)
        bval = nr.randn(1, 5, 1, 1)
        X = cgt.tensor4("X", fixed_shape=Xval.shape)
        W = cgt.tensor4("W", fixed_shape=Wval.shape)
        b = cgt.tensor4("b", fixed_shape=bval.shape)
        Y = cgt.core.Result(cudnn_ops.CudnnConvForward(1, 1, 1, 1), [X, W, b])
        Y2 = nr.randn(*cgt.core.infer_shape(Y))
        fY = cgt.function([X, W, b], Y)
        Yval = fY(Xval, Wval, bval)
        cost = (Y * Y2).sum()
        fcost = cgt.function([X, W, b], cost)
        fgrad = cgt.function([X, W, b], cgt.grad(cost, [X, W, b]))
        angrads = fgrad(Xval, Wval, bval)
        nugrads = numeric_grad_multi(fcost, [Xval, Wval, bval], eps=1e-3)
        for (nugrad, angrad) in zip(nugrads, angrads):
            assert np.allclose(nugrad, angrad)

def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)
    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])

def test_cudnn():
    if not get_compile_info()["CGT_ENABLE_CUDNN"]:
        raise SkipTest("CUDNN not enabled. Skipping this test")
    Xval = nr.randn(2, 3, 19, 18)
    Wval = nr.randn(5, 3, 3, 3)
    bval = nr.randn(1, 5, 1, 1)
    X = cgt.tensor4("X", fixed_shape=Xval.shape)
    W = cgt.tensor4("W", fixed_shape=Wval.shape)
    b = cgt.tensor4("b", fixed_shape=bval.shape)
    Y = cgt.core.Result(cudnn_ops.CudnnConvForward(1, 1, 1, 1), [X, W, b])
    Y2 = nr.randn(*cgt.core.infer_shape(Y))
    fY = cgt.function([X, W, b], Y)
    Yval = fY(Xval, Wval, bval)
    cost = (Y * Y2).sum()
    fcost = cgt.function([X, W, b], cost)
    fgrad = cgt.function([X, W, b], cgt.grad(cost, [X, W, b]))
    angrads = fgrad(Xval, Wval, bval)
    nugrads = numeric_grad_multi(fcost, [Xval, Wval, bval], eps=1e-3)
    for (nugrad, angrad) in zip(nugrads, angrads):
        assert np.allclose(nugrad, angrad)

def gradcheck_model(cost, params, extravars=(), extravals=(), atol=1e-8, eps=1e-9):
    precision = cgt.get_precision()
    if precision == "single":
        cgt.utils.warn("You're doing a gradient check with %s precision. "
                       "Use double or better yet quad for best results" % (precision))
    assert all(param.is_input() for param in params)
    assert len(extravars) == len(extravals)
    # Convert parameters to Argument nodes
    param_args = [cgt.core.Argument(typ=s.typ, name=s.name) if s.is_data() else s
                  for s in params]
    # Get a new cost expressed in terms of those arguments
    cost = cgt.core.clone(cost, replace=dict(zip(params, param_args)))
    grads = cgt.grad(cost, param_args)
    paramvals = [param.op.get_value() for param in params]
    fcost = cgt.function(param_args, cost, givens=zip(extravars, extravals))
    fgrad = cgt.function(param_args, grads, givens=zip(extravars, extravals))
    angrads = fgrad(*paramvals)
    nugrads = numeric_grad_multi(fcost, paramvals, eps=eps)
    for (angrad, nugrad) in zip(angrads, nugrads):
        assert np.allclose(angrad, nugrad, atol=atol)

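# Usage sketch for gradcheck_model (illustrative, not from the original sources):
# the parameters must be cgt shared/data nodes and the cost a scalar expression of
# those parameters. The toy quadratic below is hypothetical.
def _gradcheck_usage_example():
    w = cgt.shared(np.random.randn(4), name="w")
    cost = 0.5 * cgt.sum(cgt.square(w))
    # Compares cgt.grad against a numerical gradient; raises AssertionError on mismatch.
    gradcheck_model(cost, [w], atol=1e-6, eps=1e-7)
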
def test_cudnn():
    compile_info = get_compile_info()
    if not (compile_info["CGT_ENABLE_CUDNN"] and compile_info["CGT_ENABLE_CUDA"]):
        raise SkipTest("CUDNN not enabled. Skipping this test")
    Xval = nr.randn(2, 3, 19, 18)
    Wval = nr.randn(5, 3, 3, 3)
    bval = nr.randn(1, 5, 1, 1)
    X = cgt.tensor4("X", fixed_shape=Xval.shape)
    W = cgt.tensor4("W", fixed_shape=Wval.shape)
    b = cgt.tensor4("b", fixed_shape=bval.shape)
    Y = cgt.core.Result(cudnn_ops.CudnnConvForward(1, 1, 1, 1), [X, W, b])
    Y2 = nr.randn(*cgt.core.infer_shape(Y))
    fY = cgt.function([X, W, b], Y)
    Yval = fY(Xval, Wval, bval)
    cost = (Y * Y2).sum()
    fcost = cgt.function([X, W, b], cost)
    fgrad = cgt.function([X, W, b], cgt.grad(cost, [X, W, b]))
    angrads = fgrad(Xval, Wval, bval)
    nugrads = numeric_grad_multi(fcost, [Xval, Wval, bval], eps=1e-3)
    for (nugrad, angrad) in zip(nugrads, angrads):
        assert np.allclose(nugrad, angrad, rtol=9e-3, atol=1e-7)

def make_updater_fc():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)

def check_scalar_grads(precision, backend):
    cgt.reset_config()
    np.random.seed(0)
    cgt.set_precision(precision)
    cgt.core.update_config(backend=backend)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x, y, z]  # pylint: disable=W0622
    vals = nr.rand(len(vars)) + 1
    PROB2RESULT = {}
    for ((key, _), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(), it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(), it.repeat(core.ElwiseBinary))):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n" % key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True, True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY:
            cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single": 1e-4, "double": 1e-9}[precision]
        nugrad = numeric_grad(lambda li: f(*li), inputvals, eps=eps)  # pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad, cgtgrad,
                                       decimal={"single": 3, "double": 6}[precision])
        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count
    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key, val["grad"]] for (key, val) in PROB2RESULT.iteritems()],
                       headers=["funcname", "gradcount"])

def initialize(self, loss, scale):
    self._iter = 0
    self.pc = Params(self.params)
    cur_val = self.pc.get_value_flat()
    idx = cur_val.nonzero()
    new_val = np.random.uniform(-scale, scale, size=(self.pc.get_total_size(),))
    new_val[idx] = cur_val[idx]
    self.sync(new_val)
    grad = cgt.concatenate([g.flatten() for g in cgt.grad(loss, self.params)])
    return grad

def make_updater_convnet():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_convnet_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)

def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim
    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2 * ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
    std_1a = cgt.exp(logstd_1a)
    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim, weight_init=nn.IIDGaussian(std=0.01))(h2)
    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)
    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2 * self.ctrl_dim]
    logp_n = ((-.5) * cgt.square((a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square((a_na - oldmean_na) / oldstd_na).sum(axis=1)) \
        - cgt.log(oldstd_na).sum(axis=1)
    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()
    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
    # kl = cgt.log(sigafter/)
    params = nn.get_parameters(surr)
    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na)
          + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na)
          - .5).sum(axis=1).mean()
    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)
    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self.pc = ParamCollection(params)

def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates

def test_scalars():
    np.random.seed(0)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x, y, z]  # pylint: disable=W0622
    vals = nr.rand(len(vars)) + 1
    PROB2RESULT = {}
    for ((key, _), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(), it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(), it.repeat(core.ElwiseBinary))):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n" % key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True, True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY:
            cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single": 1e-4, "double": 1e-9}[cgt.get_precision()]
        nugrad = numeric_grad(lambda li: f(*li), inputvals, eps=eps)  # pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad, cgtgrad,
                                       decimal={"single": 3, "double": 6}[cgt.get_precision()])
        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count
    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key, val["grad"]] for (key, val) in PROB2RESULT.iteritems()],
                       headers=["funcname", "gradcount"])

def CGT_dvLJ(x):
    # D (the coordinate dimension) is assumed to be defined at module level.
    N = len(x)
    xt = cgt.vector('xt')
    vLJt = 0
    for j in range(1, N):
        for i in range(j):
            rho = ((xt[i*D:i*D+D] - xt[j*D:j*D+D])**2).sum()
            vLJt += rho**(-6.0) - (rho**(-3.0))
    dvLJc = cgt.grad(4 * vLJt, xt)
    df = cgt.function([xt], dvLJc)
    return df(np.ravel(x))

def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """Adadelta updates

    The learning rate is scaled by the ratio of accumulated gradients
    to the ratio of accumulated step sizes.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, param_new),
    (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))
        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))
    return updates

def check_affine(f, *nu_inputs):
    types = ",".join(["{%s,%s}" % (x.dtype, x.ndim) for x in nu_inputs])
    cgt.utils.colorprint(cgt.utils.Color.YELLOW,
                         "Testing %s(%s)\n" % (f.__name__, types))
    sy_inputs = map(tensor_like, nu_inputs)
    for (i, sy) in enumerate(sy_inputs):
        sy.name = "x%i" % i
    sy_result = f(*sy_inputs)

    def maybeprint(msg):
        if DISPLAY:
            print msg

    maybeprint("Function:")
    if DISPLAY:
        cgt.print_tree([sy_result])
    f_cgt = cgt.function(sy_inputs, sy_result)
    sy_grads = cgt.grad(sy_result, sy_inputs)
    gradf_cgt = cgt.function(sy_inputs, sy_grads)
    sy_result_simple = core.simplify([sy_result])
    sy_grads_simple = core.simplify(sy_grads)
    maybeprint("Gradient:")
    if DISPLAY:
        cgt.print_tree(sy_grads)
    maybeprint("Gradient after simplification:")
    if DISPLAY:
        cgt.print_tree(sy_grads_simple)
    out_true = f(*nu_inputs)
    out_cgt = f_cgt(*nu_inputs)
    grads_true = gradients_affine(f_cgt, nu_inputs,
                                  h=1e-4 if "max" in f.__name__ else 1e-1)
    grads_cgt = gradf_cgt(*nu_inputs)
    rtol = {"single": 1e-3, "double": 1e-5}[cgt.get_precision()]
    np.testing.assert_allclose(out_cgt, out_true, rtol=rtol)
    for (g_cgt, g_true) in zip(grads_cgt, grads_true):
        np.testing.assert_allclose(g_cgt, g_true, rtol=rtol)
    result_count = cgt.count_nodes(sy_result_simple)
    grad_count = cgt.count_nodes(sy_grads_simple)
    maybeprint("Result before: %i. after: %i" % (cgt.count_nodes([sy_result]), result_count))
    maybeprint("Grad before: %i. after: %i" % (cgt.count_nodes(sy_grads), grad_count))
    PROB2RESULT[f.__name__] = {}
    PROB2RESULT[f.__name__]["fn"] = result_count
    PROB2RESULT[f.__name__]["grad"] = grad_count

def make_loss_and_grad(net):
    X_b = inps[0]  # cgt.matrix(dtype=cgt.floatX)
    y_onehot = cgt.matrix(dtype='i4')
    outputs = [logprobs]
    loss = nn.crossent(outputs[0], y_onehot) / b_size
    # gradloss = cgt.grad(loss, params)
    gradloss = cgt.grad(loss, param_list)
    # XXX use flatcat function
    grad = cgt.concatenate([x.flatten() for x in gradloss])
    # grad = gradloss
    return cgt.make_function([X_b, y_onehot], [loss, grad, logprobs])

def __init__(self, xdim, args, dec="bernoulli"):
    self.xdim = xdim
    self.hdim = args.hdim
    self.zdim = args.zdim
    self.lmbda = args.lmbda  # weight decay coefficient * 2
    self.x = cgt.matrix("x", dtype=cgt.floatX)
    self.eps = cgt.matrix("eps", dtype=cgt.floatX)
    self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim,
                               nlayers=args.nlayers, eps=self.eps)
    if dec == "bernoulli":
        # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
        self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim,
                                    self.xdim, nlayers=args.nlayers, y=self.x)
    elif dec == "gaussian":
        self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim,
                                   self.xdim, nlayers=args.nlayers, y=self.x)
    else:
        raise RuntimeError("unrecognized decoder %s" % dec)
    self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var))
                 + self.dec_mlp.cost) / args.batch_size
    self.params = self.enc_mlp.params + self.dec_mlp.params
    # L2 regularization
    self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params]
    self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX))
                    for p in self.params]
    # XXX replace w/ adagrad update from nn
    ADAGRAD_EPS = 1e-10  # for stability
    self.updates = [
        (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS))
        for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
    ]
    self.updates += [
        (gaccum, gaccum + cgt.square(gparam))
        for gaccum, gparam in zip(self.gaccums, self.gparams)
    ]
    self.train = cgt.function([self.x, self.eps], self.cost, updates=self.updates)
    self.test = cgt.function([self.x, self.eps], self.cost, updates=None)
    # can be used for semi-supervised learning for example
    self.encode = cgt.function([self.x, self.eps], self.enc_mlp.out)

def test_im2col():
    for settings in [((4, 4), (0, 0), (1, 1)),
                     ((3, 3), (1, 1), (2, 2)),
                     ((3, 3), (1, 1), (3, 3))]:
        xval = np.arange(2 * 1 * 28 * 28).reshape(2, 1, 28, 28).astype(cgt.floatX)
        x = cgt.tensor4("x", fixed_shape=xval.shape)
        y = im2col(x, *settings)
        h = cgt.constant(np.random.randn(*cgt.infer_shape(y)))
        cost = (y * h).sum()
        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval, eps=1e-5)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)

def make_updater_fc_parallel():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    m = nn.Module([X, y], [loss])
    split_loss = 0
    for start in xrange(0, batch_size, batch_size // 4):
        sli = slice(start, start + batch_size // 4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)

def make_updater_convnet_parallel():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_convnet_return_loss(X, y)
    m = nn.Module([X, y], [loss])
    split_loss = 0
    for start in xrange(0, batch_size, batch_size // 4):
        sli = slice(start, start + batch_size // 4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    params = nn.get_parameters(loss)
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)

def make_loss_and_grad_and_step(arch, size_input, size_output, size_mem, size_batch,
                                n_layers, n_unroll, k_in, k_h):
    # symbolic variables
    x_tnk = cgt.tensor3()
    targ_tnk = cgt.tensor3()
    # make_network = make_deep_lstm if arch == "lstm" else make_deep_gru
    make_network = make_deep_rrnn_rot_relu
    network = make_network(size_input, size_mem, n_layers, size_output, size_batch,
                           k_in, k_h)
    init_hiddens = [cgt.matrix() for _ in xrange(get_num_hiddens(arch, n_layers))]
    # TODO fixed sizes
    cur_hiddens = init_hiddens
    loss = 0
    for t in xrange(n_unroll):
        outputs = network([x_tnk[t]] + cur_hiddens)
        cur_hiddens, prediction_logprobs = outputs[:-1], outputs[-1]
        # loss = loss + nn.categorical_negloglik(prediction_probs, targ_tnk[t]).sum()
        loss = loss - (prediction_logprobs * targ_tnk[t]).sum()
        cur_hiddens = outputs[:-1]
    final_hiddens = cur_hiddens
    loss = loss / (n_unroll * size_batch)
    params = network.get_parameters()
    gradloss = cgt.grad(loss, params)
    flatgrad = flatcat(gradloss)
    with utils.Message("compiling loss+grad"):
        f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens,
                                       [loss, flatgrad] + final_hiddens)
        f_loss = cgt.function([x_tnk, targ_tnk] + init_hiddens, loss)
    assert len(init_hiddens) == len(final_hiddens)
    x_nk = cgt.matrix('x')
    outputs = network([x_nk] + init_hiddens)
    f_step = cgt.function([x_nk] + init_hiddens, outputs)
    # print "node count", cgt.count_nodes(flatgrad)
    return network, f_loss, f_loss_and_grad, f_step

def test_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
    y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
    xval = np.random.randn(2, 3, 5, 7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)
    cost = (y * h).sum()
    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum, gana)

def test_cpu_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
    y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
    xval = np.random.randn(2, 3, 5, 7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)
    cost = (y * h).sum()
    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum, gana)

def adagrad_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        delta_accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - stepsize * update))
        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))
    return updates

def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates

    Divide learning rate by moving average of RMS gradients. See [1].

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form [(param, updates),
    (accumulated_RMS_grads, accumulated_RMS_grads_new)]

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization.
           arXiv:1502.04390, http://arxiv.org/abs/1502.04390
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))
    return updates

def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems'])
    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t)
             for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T
    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        # if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0

    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr

def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple,
                   nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval) ** 2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0))

def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # X: with shape (batchsize, ncols)
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic variable representing the rewards, which are integers
    y = cgt.scalar("y", dtype='float64')
    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden, weight_init=nn.IIDGaussian(std=.1),
                  bias_init=nn.Constant(1))(X))
    # One final fully-connected layer, and then a linear activation output for reward
    output = nn.Affine(num_hidden, 1, weight_init=nn.IIDGaussian(std=.1),
                       bias_init=nn.Constant(1))(hid1)
    abs_deviation = cgt.abs(output - y).mean()
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)

def CGT_vLJ_Optimize(x):
    # D (the coordinate dimension) and scipy.optimize (as `optimize`) are assumed
    # to be available at module level.
    N = len(x)
    # cgt.set_precision('double')
    xt = cgt.vector('xt')
    vLJt = 0
    for j in range(1, N):
        for i in range(j):
            rho = ((xt[i*D:i*D+D] - xt[j*D:j*D+D])**2).sum()
            vLJt += rho**(-6.0) - (rho**(-3.0))
    f = cgt.function([xt], 4 * vLJt)
    dvLJc = cgt.grad(4 * vLJt, xt)
    df = cgt.function([xt], dvLJc)
    CGT_BFGSres = optimize.minimize(f, np.ravel(x),
                                    method='L-BFGS-B',
                                    jac=df,
                                    options={'disp': False})
    return np.reshape(CGT_BFGSres.x, (N, D))

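# Usage sketch (illustrative, not part of the original sources): both Lennard-Jones
# helpers above read the spatial dimension D from module scope, and CGT_vLJ_Optimize
# additionally relies on scipy.optimize being imported as `optimize`; those are
# assumptions about the surrounding module. With D = 3 one might write:
#
#     x0 = np.random.uniform(-1.0, 1.0, size=(8, 3))  # 8 particles in 3-D
#     g0 = CGT_dvLJ(x0)             # flat gradient of 4*V_LJ at x0, shape (8*3,)
#     x_min = CGT_vLJ_Optimize(x0)  # L-BFGS-B minimizer output, shape (8, 3)
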
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates

    Divide learning rate by moving average of RMS gradients. See [1].

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, updates),
    (accumulated_RMS_grads, accumulated_RMS_grads_new)

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization.
           arXiv:1502.04390, http://arxiv.org/abs/1502.04390
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))
    return updates

def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates

    The learning rate will be scaled by dividing it by the square root
    of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoids division close to zero. Small float.

    Returns
    -------
    list of tuples of the form [(param, updates),
    (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic optimization.
           JMLR, 12:2121-2159.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))
    return updates

def sgd(cost, params, learning_rate):
    """Stochastic Gradient Descent (SGD) updates

    Math:
    * ``param := param - learning_rate * gradient``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float

    Returns
    -------
    list of tuples of the form (param, new_param)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        updates.append((param, param - learning_rate * grad))
    return updates

def sgd(cost, params, learning_rate):
    """Stochastic Gradient Descent (SGD) updates

    Math:
    * ``param := param - learning_rate * gradient``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.

    Returns
    -------
    list of tuples of the form (param, updates)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        updates.append((param, param - learning_rate * grad))
    return updates

def make_funcs(opt, ntm, total_time, loss_timesteps):
    x_tbk = cgt.tensor3("x", fixed_shape=(total_time, opt.b, opt.k))
    y_tbp = cgt.tensor3("y", fixed_shape=(total_time, opt.b, opt.p))
    loss_timesteps = set(loss_timesteps)
    initial_states = make_ntm_initial_states(opt)
    params = ntm.get_parameters() + get_parameters(initial_states)
    # params = ntm.get_parameters()
    lossCE = 0
    loss01 = 0
    state_arrs = initial_states
    for t in xrange(total_time):
        tmp = ntm([x_tbk[t]] + state_arrs)
        raw_pred = tmp[0]
        state_arrs = tmp[1:4]
        if t in loss_timesteps:
            p_pred = cgt.sigmoid(raw_pred)
            # cross-entropy of bernoulli distribution
            ce = bernoulli_crossentropy(y_tbp[t], p_pred).sum()
            lossCE = lossCE + ce
            loss01 = loss01 + cgt.cast(cgt.equal(y_tbp[t], round01(p_pred)), cgt.floatX).sum()
    lossCE = lossCE / (len(loss_timesteps) * opt.p * opt.b) / np.log(2)
    loss01 = loss01 / (len(loss_timesteps) * opt.p * opt.b)
    gradloss = cgt.grad(lossCE, params)
    flatgrad = flatcat(gradloss)
    f_loss = cgt.function([x_tbk, y_tbp], lossCE)
    f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])
    print "number of nodes in computation graph:", core.count_nodes([lossCE, loss01, flatgrad])
    return f_loss, f_loss_and_grad, params

def test_linreg():
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple,
                   nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval) ** 2,
                               atol={"single": 1e-3, "double": 1e-6}[cgt.get_precision()])
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval),
                               atol={"single": 1e-3, "double": 1e-6}[cgt.get_precision()])
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0),
                               atol={"single": 1e-3, "double": 1e-6}[cgt.get_precision()])

def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates

    The learning rate will be scaled by dividing it by the square root
    of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoids division close to zero. Small float.

    Returns
    -------
    list of tuples of the form [(param, updates),
    (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic optimization.
           JMLR, 12:2121-2159.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))
    return updates

def test_cpu_pool():
    with cgt.scoped_update_config(precision="quad", backend="native"):
        print cgt.get_precision()
        ci = get_compile_info()
        np.random.seed(0)
        x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
        y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
        xval = np.random.randn(2, 3, 5, 7)
        hval = np.random.randn(*cgt.infer_shape(y))
        h = cgt.constant(hval)
        cost = (y * h).sum()
        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)