def test_devices():
    cgt.set_precision("double")
    cgt.update_config(backend="native")
    N = 10
    K = 3
    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")

    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)

    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):
        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype="gpu"))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")
        print "bval", bval

        ypred = cgt.dot(cgt.square(X_nk), w_k) + b
        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err] + g

        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(results[0],
                           np.sin(np.square(Xval).dot(wval) + bval - yval).sum())
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX
    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")

    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)

    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])
def check_conv(precision):
    cgt.reset_config()
    cgt.set_precision(precision)
    f = cgt.function([], nn.conv2d(cgt.constant(x), cgt.constant(filt),
                                   kernelshape=(filtrows, filtcols),
                                   pad=(filtrows-1, filtcols-1)))
    out1 = f()
    # out1 = cgt.numeric_eval1(nn.conv2d(cgt.constant(x), cgt.constant(f), kersize=(filtrows,filtcols)), {})
    np.testing.assert_allclose(out, out1, atol={"single": 1e-3, "double": 1e-6}[precision])
def runTest(self):
    cgt.set_precision('double')
    x = cgt.vector()
    y = cgt.square(x)
    eg = cgt.execution.compilation_pipeline([x], [y + y], [])
    pprint.pprint(eg.to_json())
    import cycgt
    interp = cycgt.cInterpreter(eg)
    print interp(np.array([3, 4, 5, 6], 'f8'))
def check_scalar_grads(precision, backend):
    cgt.reset_config()
    np.random.seed(0)
    cgt.set_precision(precision)
    cgt.core.update_config(backend=backend)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x, y, z]  # pylint: disable=W0622
    vals = nr.rand(len(vars)) + 1

    PROB2RESULT = {}

    for ((key, _), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(), it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(), it.repeat(core.ElwiseBinary))):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n" % key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True, True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY:
            cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single": 1e-4, "double": 1e-9}[precision]
        nugrad = numeric_grad(lambda li: f(*li), inputvals, eps=eps)  # pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad, cgtgrad,
                                       decimal={"single": 3, "double": 6}[precision])

        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count

    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key, val["grad"]] for (key, val) in PROB2RESULT.iteritems()],
                       headers=["funcname", "gradcount"])
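# Illustrative aside (not part of the original test file): check_scalar_grads compares cgt.grad
# against a finite-difference estimate from numeric_grad. The idea is the usual forward
# difference d f / d x_i ~= (f(x + eps * e_i) - f(x)) / eps; the exact scheme inside cgt's
# numeric_grad may differ. A minimal NumPy-only sketch of such an estimator:
import numpy as np

def _finite_difference_grad(f, x, eps=1e-6):
    """Hypothetical helper: forward-difference gradient of a scalar-valued f at x."""
    x = np.asarray(x, dtype=float)
    g = np.zeros_like(x)
    f0 = f(x)
    for i in range(x.size):
        xp = x.copy()
        xp.flat[i] += eps
        g.flat[i] = (f(xp) - f0) / eps
    return g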
def test_multi_output():
    cgt.reset_config()
    cgt.set_precision("single")
    for x in (cgt.scalar('x'), cgt.vector('x'), cgt.matrix('x')):
        for cls in (SinCos, SinCos2):
            y, z = core.unpack(core.Result(cls(), [x]))
            xnum = np.ones((3,) * x.ndim, cgt.floatX)
            correct = (np.sin(xnum), np.cos(xnum))
            yznum = cgt.numeric_eval([y, z], {x: xnum})
            np.testing.assert_allclose(yznum, correct)
            f = cgt.function([x], [y, z])
            np.testing.assert_allclose(f(xnum), correct)
def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")

    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])

    g_simple, an, _ = cgt.core.simplify_and_analyze(g)

    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)

    print "Gradient simplified"
    cgt.print_tree(g_simple,
                   nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))

    print "-------"

    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval)**2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0))
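# Illustrative aside (not part of the original test file): the closed-form gradients that
# test_linreg asserts against follow from err(w, b) = ||X w + b - y||^2, which gives
# d err / d w = 2 X^T (X w + b - y) and d err / d b = 2 * sum_i (X w + b - y)_i.
# A minimal NumPy-only sketch checking one coordinate by forward differences:
import numpy as np

def _check_linreg_gradient(eps=1e-6):
    """Hypothetical helper: verify the analytic gradient on random data."""
    rng = np.random.RandomState(0)
    X, w, b, y = rng.randn(10, 3), rng.randn(3), rng.randn(), rng.randn(10)
    r = X.dot(w) + b - y                    # residual
    grad_w = 2 * X.T.dot(r)                 # analytic gradient w.r.t. w
    w_perturbed = w.copy()
    w_perturbed[0] += eps
    numeric = (np.sum((X.dot(w_perturbed) + b - y) ** 2) - np.sum(r ** 2)) / eps
    assert abs(numeric - grad_w[0]) < 1e-3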
def test_einsum():
    cgt.reset_config()
    cgt.set_precision("double")
    x = cgt.tensor3()
    y = cgt.tensor3()

    sizes = {'i': 2, 'j': 3, 'k': 5, 'l': 7}
    xaxes = 'ijk'
    yaxes = 'ikl'
    zaxes = 'ijl'
    for i in xrange(10):
        xperm = xaxes
        (yperm, zperm) = permaxes = [[chars[i] for i in np.random.permutation(3)]
                                     for chars in [yaxes, zaxes]]
        desc = "%s,%s->%s" % tuple("".join(chars) for chars in [xperm] + permaxes)
        z = cgt.einsum(desc, x, y)
        xval = nr.randn(*(sizes[c] for c in xperm))
        yval = nr.randn(*(sizes[c] for c in yperm))
        np.testing.assert_allclose(
            cgt.numeric_eval(z, {x: xval, y: yval}),
            np.einsum(desc, xval, yval))
def test_lrn():
    if not get_compile_info()["CGT_ENABLE_CUDA"]:
        raise SkipTest("Skipping because CUDA disabled")

    with cgt.scoped_update_config(precision="double", backend="native"):
        from cgt.tests import gradcheck_model
        cgt.set_precision('double')
        nr.seed(0)
        Xval = nr.randn(4, 8, 16, 16)
        X = cgt.shared(Xval, name="X", fixed_shape_mask="all")
        # X = cgt.tensor4(name='X')
        y = cross_channel_lrn(X, localsize=4, alpha=.1, beta=.5)
        f = cgt.function([], y)
        print f().sum()
        print f().sum()
        print f().sum()
        assert np.isfinite(f().sum())
        # print f(Xval).sum()
        a = nr.rand(*cgt.infer_shape(y))
        loss = (y * a).sum()
        gradcheck_model(loss, [X], eps=1e-5)
def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")

    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)

    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)

    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

    self.pc = ParamCollection(params)
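# Illustrative aside (not part of the original file): in the categorical policy above, `surr` is
# the score-function surrogate mean_n[log pi(a_n | o_n) * q_n], and `kl` is the mean KL
# divergence between the old and current action distributions,
# KL(p_old || p) = sum_a p_old(a) * log(p_old(a) / p(a)). A minimal NumPy sketch of that KL:
import numpy as np

def _categorical_kl(oldprobs_na, probs_na):
    """Hypothetical helper: rows of both arguments are probability vectors over actions."""
    return (oldprobs_na * np.log(oldprobs_na / probs_na)).sum(axis=1).mean()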
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim

    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2 * ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
    std_1a = cgt.exp(logstd_1a)

    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim, weight_init=nn.IIDGaussian(std=0.01))(h2)

    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)

    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2 * self.ctrl_dim]

    logp_n = ((-.5) * cgt.square((a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square((a_na - oldmean_na) / oldstd_na).sum(axis=1)) \
        - cgt.log(oldstd_na).sum(axis=1)

    ratio_n = cgt.exp(logp_n - oldlogp_n)

    surr = (ratio_n * adv_n).mean()

    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
    # kl = cgt.log(sigafter/)

    params = nn.get_parameters(surr)

    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na)
          + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na)
          - .5).sum(axis=1).mean()

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)

    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])

    self.pc = ParamCollection(params)
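# Illustrative aside (not part of the original file): the `kl` expression above is the KL
# divergence between diagonal Gaussians, summed over action dimensions and averaged over the
# batch. Per dimension,
#     KL(N(mu_old, s_old^2) || N(mu, s^2)) = log(s / s_old) + (s_old^2 + (mu_old - mu)^2) / (2 s^2) - 1/2.
# A minimal NumPy sketch of the per-sample sum over dimensions:
import numpy as np

def _diag_gaussian_kl(oldmean_na, oldstd_na, mean_na, std_na):
    """Hypothetical helper mirroring the symbolic KL expression built above."""
    return (np.log(std_na / oldstd_na)
            + (oldstd_na ** 2 + (oldmean_na - mean_na) ** 2) / (2 * std_na ** 2)
            - 0.5).sum(axis=1)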
def __init__(self, model="dense", im_size=[28, 28], dropout=True, devtype="cpu", grad_check=True, reg=0): if grad_check: cgt.set_precision("quad") self.model = model self.reg = reg np.random.seed(0) cgt.update_config(default_device=cgt.core.Device(devtype=devtype), backend="native") print(model) # MLP with 1 hidden layer if model == "dense1": self.Xsize = 2 * im_size[0] * im_size[1] + im_size[0] + im_size[1] self.X = cgt.matrix("X", fixed_shape=(None, self.Xsize)) self.y = cgt.vector("y", dtype='i8') self.p_drop_input, self.p_drop_hidden = (0.2, 0.5) if dropout else (0, 0) self.w_h = init_weights(self.Xsize, 256) self.w_o = init_weights(256, 8) self.pofy_drop = dense_model1(self.X, self.w_h, self.w_o, self.p_drop_input, self.p_drop_hidden) self.pofy_nodrop = dense_model1(self.X, self.w_h, self.w_o, 0., 0.) self.params = [self.w_h, self.w_o] self.l1 = cgt.abs(self.w_h).sum() + cgt.abs(self.w_o).sum() self.cost_drop = -cgt.mean( categorical.loglik(self.y, self.pofy_drop)) + self.reg * self.l1 # MLP with 2 hidden layers elif model == "dense2": self.Xsize = 2 * im_size[0] * im_size[1] + im_size[0] + im_size[1] self.X = cgt.matrix("X", fixed_shape=(None, self.Xsize)) self.y = cgt.vector("y", dtype='i8') self.p_drop_input, self.p_drop_hidden = (0.2, 0.5) if dropout else (0, 0) self.w_h = init_weights(self.Xsize, 256) self.w_h2 = init_weights(256, 256) self.w_o = init_weights(256, 8) self.pofy_drop = dense_model2(self.X, self.w_h, self.w_h2, self.w_o, self.p_drop_input, self.p_drop_hidden) self.pofy_nodrop = dense_model2(self.X, self.w_h, self.w_h2, self.w_o, 0., 0.) self.params = [self.w_h, self.w_h2, self.w_o] self.l1 = cgt.abs(self.w_h).sum() + cgt.abs( self.w_h2).sum() + cgt.abs(self.w_o).sum() self.cost_drop = -cgt.mean( categorical.loglik(self.y, self.pofy_drop)) + self.reg * self.l1 # MLP with 3 hidden layers elif model == "dense3": self.Xsize = 2 * im_size[0] * im_size[1] + im_size[0] + im_size[1] self.X = cgt.matrix("X", fixed_shape=(None, self.Xsize)) self.y = cgt.vector("y", dtype='i8') self.p_drop_input, self.p_drop_hidden = ( 0.0, [0.5, 0.5, 0.5]) if dropout else (0, [0, 0, 0]) self.w_h = init_weights(self.Xsize, 256) self.w_h2 = init_weights(256, 256) self.w_h3 = init_weights(256, 256) self.w_o = init_weights(256, 8) self.pofy_drop = dense_model3(self.X, self.w_h, self.w_h2, self.w_h3, self.w_o, self.p_drop_input, self.p_drop_hidden) self.pofy_nodrop = dense_model3(self.X, self.w_h, self.w_h2, self.w_h3, self.w_o, 0., [0., 0., 0.]) self.params = [self.w_h, self.w_h2, self.w_h3, self.w_o] self.l1 = cgt.abs(self.w_h).sum() + cgt.abs(self.w_h2).sum() + cgt.abs(self.w_h3).sum() + \ cgt.abs(self.w_o).sum() self.cost_drop = -cgt.mean( categorical.loglik(self.y, self.pofy_drop)) + self.reg * self.l1 else: raise RuntimeError("Unknown Model") self.y_nodrop = cgt.argmax(self.pofy_nodrop, axis=1) self.cost_nodrop = -cgt.mean( categorical.loglik(self.y, self.pofy_nodrop)) self.err_nodrop = cgt.cast(cgt.not_equal(self.y_nodrop, self.y), cgt.floatX).mean() self.computeloss = cgt.function( inputs=[self.X, self.y], outputs=[self.err_nodrop, self.cost_nodrop]) self.y_out = cgt.function(inputs=[self.X], outputs=[self.y_nodrop]) self.updates = rmsprop_updates(self.cost_drop, self.params) self.train = cgt.function(inputs=[self.X, self.y], outputs=[], updates=self.updates)
def run_training(self, input, stepsize=0.01, epochs=10, output='None', batch_size=128,
                 grad_check=True, profile=False, step_decrease_rate=0.5, step_decrease_time=1000):
    # run NN training from input matlab data file, and save test data prediction in output file

    # load data from Matlab file, including
    # im_data: flattened images
    # state_data: concatenated one-hot vectors for each state variable
    # label_data: one-hot vector for action (state difference)
    if grad_check:
        cgt.set_precision("quad")
    matlab_data = sio.loadmat(input)
    im_data = matlab_data["im_data"]
    im_data = (im_data - 1) / 255  # obstacles = 1, free zone = 0
    state_data = matlab_data["state_data"]
    value_data = matlab_data["value_data"]
    label_data = matlab_data["label_data"]
    Xdata = (np.concatenate((np.concatenate((im_data, value_data), axis=1), state_data),
                            axis=1)).astype(cgt.floatX)
    ydata = label_data

    training_samples = int(6 / 7.0 * Xdata.shape[0])
    Xtrain = Xdata[0:training_samples]
    ytrain = ydata[0:training_samples]
    Xtest = Xdata[training_samples:]
    ytest = ydata[training_samples:]
    sortinds = np.random.permutation(training_samples)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    self.updates = rmsprop_updates(self.cost_drop, self.params, stepsize=stepsize)
    self.train = cgt.function(inputs=[self.X, self.y], outputs=[], updates=self.updates)

    from cgt.tests import gradcheck_model
    if grad_check:
        cost_nodrop = cgt.core.clone(self.cost_nodrop, {self.X: Xtrain[:1], self.y: ytrain[:1]})
        print "doing gradient check..."
        print "------------------------------------"
        gradcheck_model(cost_nodrop, self.params[0:1])
        print "success!"
        return

    if profile:
        cgt.profiler.start()

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(int(epochs)):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            self.train(Xtrain[start:end], ytrain[start:end])
        elapsed = time.time() - tstart
        trainerr, trainloss = self.computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = self.computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if (i_epoch > 0) & (i_epoch % step_decrease_time == 0):
            stepsize = step_decrease_rate * stepsize
            self.updates = rmsprop_updates(self.cost_drop, self.params, stepsize=stepsize)
            self.train = cgt.function(inputs=[self.X, self.y], outputs=[], updates=self.updates)
            print stepsize
    if profile:
        cgt.execution.profiler.print_stats()

    # save Matlab data
    if output != 'None':
        sio.savemat(file_name=output, mdict={'in': Xtest, 'out': self.y_out(Xtest)})
def main():
    nr.seed(0)

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="lstm")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--temperature", type=float, default=1)
    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")

    assert args.n_unroll > 1

    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (1.0, 0, 0))

    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    def initialize_hiddens(n):
        return [np.zeros((n, args.size_mem), cgt.floatX)
                for _ in xrange(get_num_hiddens(args.arch, args.n_layers))]

    if args.grad_check:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)

        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)

    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest:
                return
        print "%.3f s/batch. avg loss = %.3f" % ((time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98  # pylint: disable=E1101

        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind, n_steps=1000,
               temperature=args.temperature, seed_text="")

    if args.profile: profiler.print_stats()
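# Illustrative aside (not part of the original file): rmsprop_update above applies an
# RMSProp-style step to optim_state.theta. The usual form of the rule is
#     sqgrad <- decay_rate * sqgrad + (1 - decay_rate) * grad**2
#     theta  <- theta - step_size * grad / (sqrt(sqgrad) + small_constant)
# (cgt's implementation may differ in details such as where the small constant is added).
# A minimal NumPy sketch:
import numpy as np

def _rmsprop_step(theta, grad, sqgrad, step_size=0.01, decay_rate=0.95, eps=1e-8):
    """Hypothetical helper: one RMSProp update; returns new parameters and running average."""
    sqgrad = decay_rate * sqgrad + (1 - decay_rate) * grad ** 2
    theta = theta - step_size * grad / (np.sqrt(sqgrad) + eps)
    return theta, sqgrad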
import os.path as osp
import cgt
from cgt import nn
from cgt.core import infer_shape
import numpy as np
# Assumed imports for the Caffe protobuf definitions and protobuf text parsing used below:
from google.protobuf import text_format
from caffe.proto.caffe_pb2 import NetParameter, TRAIN

infile = "/Users/joschu/Src/caffe/examples/mnist/lenet.prototxt"
# infile = "/Users/joschu/Src/caffe/models/bvlc_googlenet/train_val.prototxt"

with open(osp.expanduser(infile), "r") as fh:
    text = fh.read()

net = NetParameter()
text_format.Merge(text, net)

name2node = {}

cgt.set_precision('single')

if net.input:  # pylint: disable=E1101
    assert len(net.input) == 1  # pylint: disable=E1101
    name2node[net.input[0]] = cgt.tensor(ndim=4, dtype=cgt.floatX,
                                         fixed_shape=tuple(net.input_dim))

# XXX super inefficient
for layer in net.layer:  # pylint: disable=E1101
    if layer.phase == TRAIN:
        print "loading layer %s type=%s in=%s out=%s" % (layer.name, layer.type, layer.bottom, layer.top)
        output = None
        inputs = [name2node[name] for name in layer.bottom]
        if layer.type == "Data":
            tp = layer.transform_param
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--dropout", action="store_true")
    parser.add_argument("--stepsize", type=float, default=.001)
    parser.add_argument("--model", choices=["dense", "conv"], default="dense")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--devtype", choices=["cpu", "gpu"], default="cpu")
    args = parser.parse_args()

    if args.grad_check: cgt.set_precision("quad")

    # from mldata.org http://mldata.org/repository/data/viewslug/mnist-original/
    # converted to npz
    mnist = fetch_dataset("http://rll.berkeley.edu/cgt-data/mnist.npz")

    Xdata = (mnist["X"] / 255.).astype(cgt.floatX)
    ydata = mnist["y"]

    np.random.seed(0)

    cgt.update_config(default_device=cgt.core.Device(devtype=args.devtype), backend="native")

    if args.model == "conv":
        Xdata = Xdata.reshape(-1, 1, 28, 28)

    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]

    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]

    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28)) if args.model == "conv" \
        else cgt.matrix("X", fixed_shape=(None, 28*28))
    y = cgt.vector("y", dtype='i8')

    if args.model == "dense":
        p_drop_input, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w_h = init_weights(784, 256)
        w_h2 = init_weights(256, 256)
        w_o = init_weights(256, 10)
        pofy_drop = dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden)
        pofy_nodrop = dense_model(X, w_h, w_h2, w_o, 0., 0.)
        params = [w_h, w_h2, w_o]
    elif args.model == "conv":
        p_drop_conv, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w = init_weights(32, 1, 3, 3)
        w2 = init_weights(64, 32, 3, 3)
        w3 = init_weights(128, 64, 3, 3)
        w4 = init_weights(128 * 2 * 2, 625)
        w_o = init_weights(625, 10)
        pofy_drop = convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden)
        pofy_nodrop = convnet_model(X, w, w2, w3, w4, w_o, 0., 0.)
        params = [w, w2, w3, w4, w_o]
    else:
        raise RuntimeError("Unreachable")

    cost_drop = -cgt.mean(categorical.loglik(y, pofy_drop))
    updates = rmsprop_updates(cost_drop, params, stepsize=args.stepsize)

    y_nodrop = cgt.argmax(pofy_nodrop, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, pofy_nodrop))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128

    from cgt.tests import gradcheck_model
    if args.grad_check:
        cost_nodrop = cgt.core.clone(cost_nodrop, {X: Xtrain[:1], y: ytrain[:1]})
        print "doing gradient check..."
        print "------------------------------------"
        gradcheck_model(cost_nodrop, params[0:1])
        print "success!"
        return

    if args.profile: cgt.profiler.start()

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
            if args.unittest: return
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
    if args.profile: cgt.execution.profiler.print_stats()
def main():
    nr.seed(0)

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--k_in", type=int, default=3)
    parser.add_argument("--k_h", type=int, default=5)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="gru")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")

    assert args.n_unroll > 1

    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (.8, .1, .1))

    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll, args.k_in, args.k_h)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-0.01, 0.01, size=(pc.get_total_size(),)))

    for i, param in enumerate(pc.params):
        if "is_rotation" in param.props:
            shape = pc.get_shapes()[i]
            num_vec = int(shape[0] / 2)
            size_vec = int(shape[1])
            gauss = nr.normal(size=(num_vec * size_vec))
            gauss = np.reshape(gauss, (num_vec, size_vec))
            gauss_mag = norm(gauss, axis=1, keepdims=True)
            gauss_normed = gauss / gauss_mag
            gauss_perturb = nr.normal(scale=0.01, size=(num_vec * size_vec))
            gauss_perturb = np.reshape(gauss_perturb, (num_vec, size_vec))
            second_vec = gauss_normed + gauss_perturb
            second_vec_mag = norm(second_vec, axis=1, keepdims=True)
            second_vec_normed = second_vec / second_vec_mag
            new_param_value = np.zeros(shape)
            for j in xrange(num_vec):
                new_param_value[2 * j, :] = gauss_normed[j, :]
                new_param_value[2 * j + 1, :] = second_vec_normed[j, :]
            param.op.set_value(new_param_value)
            # print new_param_value

    def initialize_hiddens(n):
        return [np.ones((n, args.size_mem), cgt.floatX) / float(args.size_mem)
                for _ in xrange(get_num_hiddens(args.arch, args.n_layers))]

    if args.grad_check:  # if True:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)

        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        print "Beginning grad check"
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        print "Ending grad check"
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]

        diff = g_num - g_anal
        abs_diff = np.abs(diff)
        print np.where(abs_diff > 1e-4)
        print diff[np.where(abs_diff > 1e-4)]
        embed()
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)

    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest:
                return
        print "%.3f s/batch. avg loss = %.3f" % ((time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98  # pylint: disable=E1101

        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind, n_steps=300,
               temp=1.0, seed_text="")

    if args.profile: profiler.print_stats()
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--task", choices=["copy", "reverse_copy", "repeat_copy"], default="copy")
    args = parser.parse_args()
    np.seterr("raise")

    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[]
        )
        seq_length = 2
    else:
        opt = NTMOpts(
            b=64,  # batch size
            h=3,  # number of heads
            n=128,  # number of memory sites
            m=20,  # dimension at each memory site
            k=3,  # dimension of input
            p=1,  # dimension of output
            ff_hid_sizes=[128, 128]
        )
        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)

    if args.task == "copy":
        task = CopyTask(opt.b, seq_length, opt.p)
    elif args.task == "reverse_copy":
        task = ReverseCopyTask(opt.b, seq_length, opt.p)
    elif args.task == "repeat_copy":
        n_copies = 4
        task = RepeatCopyTask(opt.b, seq_length, opt.p, n_copies)

    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(), task.loss_timesteps())
    print "graph construction and compilation took %g seconds" % (time.time() - tstart)

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    if args.grad_check:
        x, y = task.gen_batch()
        th = pc.get_value_flat()  # snapshot of the current parameters, restored inside f below

        def f(thnew):
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print "Gradient check succeeded!"
        print "%i/%i elts of grad are nonzero" % ((g_anal != 0).sum(), g_anal.size)
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"], header=True)

    if args.profile: cgt.profiler.start()

    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print fmt_row(13, [seq_num, l, l01, np.abs(g).max()])
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        if not np.isfinite(l):
            break

    if args.profile: cgt.profiler.print_stats()
def check_affine_funcs(precision, backend):
    cgt.reset_config()
    np.random.seed(0)
    cgt.set_precision(precision)
    cgt.core.update_config(backend=backend)

    sA = np.array(nr.rand())
    sB = np.array(nr.rand())
    sC = np.array(nr.rand())
    mA = nr.randn(2, 3)
    mB = nr.randn(2, 3)
    mC = nr.randn(2, 3)

    for fn in [xplusx, _2x_plus_3x, xm1, onemx]:
        for arg in [sA, mA]:
            check_affine(fn, arg)

    check_affine(elem_mult2, mA, mB, mC)
    check_affine(elem_mult2, sA, sB, sC)
    check_affine(pyramid, sA, sB, sC)
    check_affine(pyramid, mA, mB, mC)
    check_affine(slisum1, mA)
    check_affine(slisum2, mA)
    check_affine(slisum3, mA)
    check_affine(slisum4, mA)
    check_affine(max0, mA)
    check_affine(max1, mA)
    check_affine(max2, mA)
    check_affine(fancysli0, mA)
    check_affine(sum10, mA)
    check_affine(sum01, mA)
    check_affine(repeat0, mA[0:1, :], nr.randn(7, 3))
    check_affine(repeat1, mA[:, 0:1], nr.randn(2, 7))

    M23 = mA
    M35 = nr.randn(3, 5)
    v3 = nr.randn(3)
    v13 = v3.reshape(1, 3)  # XXX
    v5 = nr.randn(5)
    v15 = v5.reshape(1, 5)  # XXX
    v3b = nr.randn(3)

    check_affine(matmat00, M23, M35)
    check_affine(matmat01, M23, M35.T)
    check_affine(matmat10, M23.T, M35)
    check_affine(matmat11, M23.T, M35.T)
    check_affine(matmat00a, M23, M35)
    check_affine(matmat01a, M23, M35.T)
    # check_affine(matmat10a, M23.T, M35)
    check_affine(matmat11a, M23.T, M35.T)
    check_affine(matvec, M23, v3)
    check_affine(vecvec, v3, v3b)
    check_affine(bcadd, M23, v13)
    check_affine(matmatplusvec, M23, M35, v15)
    check_affine(transpose, M23, nr.randn(3, 2))

    T235 = nr.randn(2, 3, 5)
    T235a = nr.randn(2, 3, 5)
    T257 = nr.randn(2, 5, 7)
    T2357 = nr.randn(2, 3, 5, 7)
    T2357a = nr.randn(2, 3, 5, 7)
    check_affine(transpose012, T235, T235a)
    check_affine(transpose021, T235, T235a.transpose(0, 2, 1))
    check_affine(transpose102, T235, T235a.transpose(1, 0, 2))
    check_affine(transpose0312, T2357, T2357a.transpose(0, 3, 1, 2))
    check_affine(transpose0231, T2357, T2357a.transpose(0, 2, 3, 1))
    check_affine(batchedmatmul, T235, T257)
    check_affine(flip0, M23, nr.randn(2, 3))
    check_affine(flip1, M23, nr.randn(2, 3))
    # check_affine(negsli0, M23, nr.randn(2,3))
    # check_affine(negsli1, M23, nr.randn(2,3))
    # check_affine(negsli01, M23, nr.randn(2,3))
    # check_affine(rfft, M35)
    check_affine(convlike, T2357, nr.randn(11, 3*5*7), nr.randn(2, 11))

    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key, val["fn"], val["grad"]] for (key, val) in sorted(PROB2RESULT.items())],
                       headers=["funcname", "fncount", "gradcount"])