def test_get_decoder_state():
    batch_size = 32
    feat_t_steps = 20
    feat_num_features = 42
    num_out_classes = 28
    num_out_classes_true = num_out_classes + 2  # Start, end, are added
    decoder_size = 50
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_num_features),
                      (batch_size, feat_num_features))
    tau3 = np.reshape(np.random.normal(0.1, 0.2, batch_size*num_out_classes_true),
                      (batch_size, num_out_classes_true))
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          decoder_size=decoder_size, feature_size=feat_num_features)
    context_bf = cgt.matrix(fixed_shape=(batch_size, feat_num_features))
    prev_out_bc = cgt.matrix(fixed_shape=(batch_size, num_out_classes_true))
    state_i_bf = nn.parameter(nn.init_array(nn.IIDGaussian(0.1), (batch_size, decoder_size)),
                              name="decoder_init")
    decoder_out = s.get_decoder_state(context_bf, prev_out_bc, state_i_bf)
    decode_fun = cgt.function([feats, context_bf, prev_out_bc], [decoder_out])
    m = decode_fun(tau, tau2, tau3)[0]
    assert m.shape == (batch_size, decoder_size)
    assert np.mean(m) < 1.0
def test_matmuls():
    with cgt.scoped_update_config(parallel=True):
        m = 8
        d = 1000

        # build graph
        X = cgt.matrix("X")
        Y = cgt.matrix("Y")
        loss = 0
        for k in xrange(m):
            # loss = loss+cgt.sin(X*Y+k).sum()
            loss = loss + (X.dot(Y + k)).sum()

        f = cgt.function([X, Y], loss)

        # test things out!
        seed(0)
        X_val = randn(d, d)
        Y_val = randn(d, d)
        vals = [X_val, Y_val]
        tic = time.time()
        out = f(*vals)
        toc = time.time()
        print toc - tic
def test_get_character_distribution():
    batch_size = 32
    feat_t_steps = 20
    feat_num_features = 42
    num_out_classes = 28  # This is the index of the start token.
    num_out_classes_true = num_out_classes + 2  # Add start and end tokens automatically.
    decoder_size = 50
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_num_features),
                      (batch_size, feat_num_features))
    tau3 = np.reshape(np.random.normal(0.1, 0.2, batch_size*decoder_size),
                      (batch_size, decoder_size))
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          decoder_size=decoder_size, feature_size=feat_num_features)
    context_bf = cgt.matrix(fixed_shape=(batch_size, feat_num_features))
    state_bf = cgt.matrix(fixed_shape=(batch_size, decoder_size))
    m_out = s.get_character_distribution(state_bf, context_bf)
    out_fun = cgt.function([feats, context_bf, state_bf], [m_out])
    m = out_fun(tau, tau2, tau3)[0]
    assert m.shape == (batch_size, num_out_classes_true)
def test_matmuls():
    with cgt.scoped_update_config(parallel=True, backend="native"):
        m = 8
        d = 1000

        # build graph
        X = cgt.matrix("X")
        Y = cgt.matrix("Y")
        loss = 0
        for k in xrange(m):
            # loss = loss+cgt.sin(X*Y+k).sum()
            loss = loss + (X.dot(Y + k)).sum()

        f = cgt.function([X, Y], loss)

        # test things out!
        seed(0)
        X_val = randn(d, d)
        Y_val = randn(d, d)
        vals = [X_val, Y_val]
        tic = time.time()
        out = f(*vals)
        toc = time.time()
        print toc - tic
def make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix(fixed_shape=(size_batch, size_input))]
    for _ in xrange(2 * n_layers):
        inputs.append(cgt.matrix(fixed_shape=(size_batch, size_mem)))
    outputs = []
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer * 2]
        prev_c = inputs[i_layer * 2 + 1]
        if i_layer == 0:
            x = inputs[0]
            size_x = size_input
        else:
            x = outputs[(i_layer - 1) * 2]
            size_x = size_mem
        input_sums = nn.Affine(size_x, 4 * size_mem)(x) + nn.Affine(size_x, 4 * size_mem)(prev_h)
        sigmoid_chunk = cgt.sigmoid(input_sums[:, 0:3 * size_mem])
        in_gate = sigmoid_chunk[:, 0:size_mem]
        forget_gate = sigmoid_chunk[:, size_mem:2 * size_mem]
        out_gate = sigmoid_chunk[:, 2 * size_mem:3 * size_mem]
        in_transform = cgt.tanh(input_sums[:, 3 * size_mem:4 * size_mem])
        next_c = forget_gate * prev_c + in_gate * in_transform
        next_h = out_gate * cgt.tanh(next_c)
        outputs.append(next_c)
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output)(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")

    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)

    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

    self.pc = ParamCollection(params)
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim

    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
    std_1a = cgt.exp(logstd_1a)

    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim, weight_init=nn.IIDGaussian(std=0.01))(h2)

    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)

    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]

    logp_n = ((-.5) * cgt.square((a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square((a_na - oldmean_na) / oldstd_na).sum(axis=1)) - cgt.log(oldstd_na).sum(axis=1)

    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()

    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
    # kl = cgt.log(sigafter/)

    params = nn.get_parameters(surr)

    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na) + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) - .5).sum(axis=1).mean()

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)

    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])

    self.pc = ParamCollection(params)
def __init__(self, xdim, args, dec="bernoulli"):
    self.xdim = xdim
    self.hdim = args.hdim
    self.zdim = args.zdim
    self.lmbda = args.lmbda  # weight decay coefficient * 2
    self.x = cgt.matrix("x", dtype=cgt.floatX)
    self.eps = cgt.matrix("eps", dtype=cgt.floatX)

    self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
    if dec == "bernoulli":
        # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
        self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    elif dec == "gaussian":
        self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    else:
        raise RuntimeError("unrecognized decoder %s" % dec)

    self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
    self.params = self.enc_mlp.params + self.dec_mlp.params
    # L2 regularization
    self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params]
    self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params]

    # XXX replace w/ adagrad update from nn
    ADAGRAD_EPS = 1e-10  # for stability
    self.updates = [
        (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS))
        for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
    ]
    self.updates += [
        (gaccum, gaccum + cgt.square(gparam))
        for gaccum, gparam in zip(self.gaccums, self.gparams)
    ]

    self.train = cgt.function(
        [self.x, self.eps],
        self.cost,
        updates=self.updates
    )
    self.test = cgt.function(
        [self.x, self.eps],
        self.cost,
        updates=None
    )
    # can be used for semi-supervised learning for example
    self.encode = cgt.function(
        [self.x, self.eps],
        self.enc_mlp.out
    )
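# Usage sketch for the VAE constructor above. The wrapping class name and the
# exact fields of `args` are illustrative assumptions, not taken from the source.
#
#   model = VAE(xdim=784, args=args)   # args provides hdim, zdim, lmbda, nlayers, lr, batch_size
#   eps = np.random.randn(args.batch_size, args.zdim).astype(cgt.floatX)
#   cost = model.train(x_batch, eps)   # one AdaGrad step on a minibatch
#   z = model.encode(x_batch, eps)     # latent codes from the encoder MLP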
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch
    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs, [loss_vec], params, _dbg_out=dbg_out)
    return params, f_step, None, None, None, f_surr
def make_ff_controller(opt):
    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b, h, m))

    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b, k))

    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])

    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i" % i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i" % i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2:
            last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:, idx:idx+H*m];    idx += H*m;    k_bHm = k_bHm.reshape([b, H, m])
    beta_bH = last_out[:, idx:idx+H];    idx += H
    g_bH = last_out[:, idx:idx+H];       idx += H
    s_bH3 = last_out[:, idx:idx+3*H];    idx += 3*H;    s_bH3 = s_bH3.reshape([b, H, 3])
    gamma_bH = last_out[:, idx:idx+H];   idx += H
    e_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    e_bhm = e_bhm.reshape([b, h, m])
    a_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    a_bhm = a_bhm.reshape([b, h, m])
    y_bp = last_out[:, idx:idx+p];       idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b, H, m)
    assert infer_shape(beta_bH) == (b, H)
    assert infer_shape(g_bH) == (b, H)
    assert infer_shape(s_bH3) == (b, H, 3)
    assert infer_shape(gamma_bH) == (b, H)
    assert infer_shape(e_bhm) == (b, h, m)
    assert infer_shape(a_bhm) == (b, h, m)
    assert infer_shape(y_bp) == (b, p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
def make_deep_gru(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        update_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2u")(x)
            + nn.Affine(size_mem, size_mem, name="h2u")(prev_h))
        reset_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2r")(x)
            + nn.Affine(size_mem, size_mem, name="h2r")(prev_h))
        gated_hidden = reset_gate * prev_h
        p2 = nn.Affine(size_mem, size_mem)(gated_hidden)
        p1 = nn.Affine(size_x, size_mem)(x)
        hidden_target = cgt.tanh(p1 + p2)
        next_h = (1.0 - update_gate) * prev_h + update_gate * hidden_target
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
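# Minimal sketch (sizes are illustrative assumptions) of how the GRU module above is
# driven one step at a time, following the same call pattern used in
# make_loss_and_grad_and_step below: the external input comes first, then one hidden
# matrix per layer; the last output is the logprobs head.
#
#   network = make_deep_gru(size_input=16, size_mem=32, n_layers=2, size_output=10, size_batch=8)
#   x_nk = cgt.matrix('x')
#   init_hiddens = [cgt.matrix() for _ in xrange(2)]
#   outputs = network([x_nk] + init_hiddens)   # [next_h per layer ..., logprobs]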
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out={}):
    assert len(num_units) == len(num_stos)
    net_in = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    dbg_out['NET~in'] = net_in
    curr_layer = 1
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        assert curr_num_units >= curr_num_sto >= 0
        prev_out = combo_layer(prev_out, prev_num_units, curr_num_units,
                               (curr_num_sto,),
                               s_funcs=s_func_ip,
                               o_funcs=(lambda x: cgt.bernoulli(cgt.sigmoid(x)), cgt.nn.rectify),
                               name=str(curr_layer), dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units, size_out,
                        name="InnerProd(%d->%d)" % (prev_num_units, size_out))(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out
def lstm_network_t(size_in, size_out, num_units, num_mems, dbg_out={}):
    def s_func_lstm(_in, _s_in, _s_out, name=''):
        c_prev = cgt.matrix(fixed_shape=(None, _s_out))
        h_prev = cgt.matrix(fixed_shape=(None, _s_out))
        c_cur, h_cur = lstm_block(h_prev, c_prev, _in, _s_in, _s_out, name)
        net_c_prev.append(c_prev)
        net_h_prev.append(h_prev)
        net_c_curr.append(c_cur)
        net_h_curr.append(h_cur)
        return h_cur
    assert len(num_units) == len(num_mems)
    net_c_prev, net_h_prev, net_c_curr, net_h_curr = [], [], [], []
    net_in = cgt.matrix(fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    curr_layer = 1
    for curr_num_units, curr_num_mem in zip(num_units, num_mems):
        assert curr_num_units >= curr_num_mem >= 0
        prev_out = combo_layer(prev_out, prev_num_units, curr_num_units,
                               (curr_num_mem,),
                               s_funcs=(s_func_lstm, s_func_ip),
                               o_funcs=(None, cgt.sigmoid),
                               name=str(curr_layer), dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units, size_out, name="Out")(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out, net_c_prev, net_h_prev, net_c_curr, net_h_curr
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out=[]):
    assert len(num_units) == len(num_stos)
    X = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, X
    dbg_out.append(X)
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        _layer_dbg_out = []
        prev_out = hybrid_layer(prev_out, prev_num_units, curr_num_units,
                                curr_num_sto, dbg_out=_layer_dbg_out)
        prev_num_units = curr_num_units
        dbg_out.extend(_layer_dbg_out)
        dbg_out.append(prev_out)
    # TODO_TZ bigger problem! param cannot deterministically influence cost
    #         otherwise the surrogate cost is not complete log likelihood
    net_out = nn.Affine(prev_num_units, size_out,
                        name="InnerProd(%d->%d)" % (prev_num_units, size_out))(prev_out)
    dbg_out.append(net_out)
    # assert prev_num_units == size_out
    # net_out = prev_out
    return X, net_out
def test_setting_weights():
    X = cgt.matrix("X", fixed_shape=(None, 28*28))
    model = build_model(X, 0.0)
    nnbuilder.set_all_weights(model, 'mnist.p')
    y = cgt.vector("y", dtype='i8')
    cost = -cgt.mean(categorical.loglik(y, model))
    selected_number = cgt.argmax(model, axis=1)
    err_nodrop = cgt.cast(cgt.not_equal(selected_number, y), cgt.floatX).mean()
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost])
    Xdata, ydata = load_data()
    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]
    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]
    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]
    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(3):
        tstart = time.time()
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
def make_loss_and_grad_and_step(arch, size_input, size_output, size_mem, size_batch, n_layers, n_unroll, k_in, k_h):
    # symbolic variables
    x_tnk = cgt.tensor3()
    targ_tnk = cgt.tensor3()
    # make_network = make_deep_lstm if arch=="lstm" else make_deep_gru
    make_network = make_deep_rrnn_rot_relu
    network = make_network(size_input, size_mem, n_layers, size_output, size_batch, k_in, k_h)
    init_hiddens = [cgt.matrix() for _ in xrange(get_num_hiddens(arch, n_layers))]
    # TODO fixed sizes

    cur_hiddens = init_hiddens
    loss = 0
    for t in xrange(n_unroll):
        outputs = network([x_tnk[t]] + cur_hiddens)
        cur_hiddens, prediction_logprobs = outputs[:-1], outputs[-1]
        # loss = loss + nn.categorical_negloglik(prediction_probs, targ_tnk[t]).sum()
        loss = loss - (prediction_logprobs * targ_tnk[t]).sum()
        cur_hiddens = outputs[:-1]

    final_hiddens = cur_hiddens
    loss = loss / (n_unroll * size_batch)

    params = network.get_parameters()
    gradloss = cgt.grad(loss, params)

    flatgrad = flatcat(gradloss)

    with utils.Message("compiling loss+grad"):
        f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens, [loss, flatgrad] + final_hiddens)
        f_loss = cgt.function([x_tnk, targ_tnk] + init_hiddens, loss)

    assert len(init_hiddens) == len(final_hiddens)

    x_nk = cgt.matrix('x')
    outputs = network([x_nk] + init_hiddens)

    f_step = cgt.function([x_nk] + init_hiddens, outputs)

    # print "node count", cgt.count_nodes(flatgrad)
    return network, f_loss, f_loss_and_grad, f_step
def make_updater_fc():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
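# Usage sketch for the updater above. load_data() and the flat 28*28 MNIST layout
# come from elsewhere in this file; the batch size and step size here are
# illustrative assumptions.
#
#   f_update = make_updater_fc()
#   Xdata, ydata = load_data()
#   for start in xrange(0, 60000, 128):
#       loss = f_update(Xdata[start:start+128], ydata[start:start+128], 1e-3)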
def make_deep_rrnn(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers+1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer+1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer-1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True

        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
        r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
        r_norm = cgt.norm(r_non, axis=2, keepdims=True)
        r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
        prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
        inters_in = [prev_h_3]

        colon = slice(None, None, None)

        for i in xrange(2 * k_in):
            inter_in = inters_in[-1]
            r_cur = cgt.subtensor(r, [colon, i, colon])
            r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
            r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
            ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
            inter_out = inter_in - 2 * ref_cur
            inters_in.append(inter_out)

        h_in_rot = cgt.reshape(inters_in[-1], (size_batch, size_mem))
        inters_h = [h_in_rot]

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)

        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    # print 'len outputs:', len(outputs)
    # print 'len inputs:', len(inputs)
    return nn.Module(inputs, outputs)
def test_stack():
    x = cgt.scalar()
    y = cgt.scalar()
    z = cgt.scalar()
    s0 = cgt.stack([x, y, z], axis=0)
    assert cgt.numeric_eval(s0, {x: 1, y: 2, z: 3}).shape == (3,)

    x = cgt.vector()
    y = cgt.vector()
    z = cgt.vector()
    v0 = cgt.stack([x, y, z], axis=0)
    assert cgt.numeric_eval(v0, {x: np.zeros(2), y: np.zeros(2), z: np.zeros(2)}).shape == (3, 2)
    v1 = cgt.stack([x, y, z], axis=1)
    assert cgt.numeric_eval(v1, {x: np.zeros(2), y: np.ones(2), z: np.zeros(2)}).shape == (2, 3)

    x = cgt.matrix()
    y = cgt.matrix()
    z = cgt.matrix()
    m0 = cgt.stack([x, y, z], axis=0)
    assert cgt.numeric_eval(m0, {x: np.zeros((2, 4)), y: np.zeros((2, 4)), z: np.zeros((2, 4))}).shape == (3, 2, 4)
    m1 = cgt.stack([x, y, z], axis=1)
    assert cgt.numeric_eval(m1, {x: np.zeros((2, 4)), y: np.zeros((2, 4)), z: np.zeros((2, 4))}).shape == (2, 3, 4)
    m2 = cgt.stack([x, y, z], axis=2)
    assert cgt.numeric_eval(m2, {x: np.zeros((2, 4)), y: np.zeros((2, 4)), z: np.zeros((2, 4))}).shape == (2, 4, 3)
def test_multi_output():
    for x in (cgt.scalar('x'), cgt.vector('x'), cgt.matrix('x')):
        for cls in (SinCos, SinCos2):
            y, z = core.unpack(core.Result(cls(), [x]))
            xnum = np.ones((3,) * x.ndim, cgt.floatX)
            correct = (np.sin(xnum), np.cos(xnum))
            yznum = cgt.numeric_eval([y, z], {x: xnum})
            np.testing.assert_allclose(yznum, correct)
            f = cgt.function([x], [y, z])
            np.testing.assert_allclose(f(xnum), correct)
def lstm_network(T, size_in, size_out, num_units, num_mems, dbg_out={}):
    assert T > 0
    x, y, c_in, h_in, c_out, h_out = lstm_network_t(size_in, size_out, num_units, num_mems, dbg_out)
    f_lstm_t = nn.Module([x] + c_in + h_in, [y] + c_out + h_out)
    Xs = [cgt.matrix(fixed_shape=x.get_fixed_shape(), name="X%d" % t) for t in range(T)]
    C_0 = [cgt.matrix(fixed_shape=_c.get_fixed_shape()) for _c in c_in]
    H_0 = [cgt.matrix(fixed_shape=_h.get_fixed_shape()) for _h in h_in]
    loss, C_t, H_t, Ys = [], C_0, H_0, []
    for t, x in enumerate(Xs):
        _out = f_lstm_t([x] + C_t + H_t)
        y, C_t, H_t = _out[0], _out[1:len(C_t)+1], _out[1+len(C_t):]
        Ys.append(y)
        if t == 0:
            C_1, H_1 = C_t, H_t
    C_T, H_T = C_t, H_t
    params = f_lstm_t.get_parameters()
    return params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1
def test_take_one_step_lstm():
    nn_input = cgt.matrix(fixed_shape=(20, 64))
    l = nnbuilder.LSTM(num_units=128, input_time_size=None, input_feature_size=64)
    o = l.take_one_step(nn_input)
    out = cgt.function([nn_input], [o])
    tau = np.zeros(shape=(20, 64))
    tau[0, 0:40] = 1
    m = out(tau)[0]
    mm = np.mean(m[0])
    mmm = np.mean(m[1])
    assert mm != mmm
def make_ntm(opt):
    Mprev_bnm = cgt.tensor3("M", fixed_shape=(opt.b, opt.n, opt.m))
    X_bk = cgt.matrix("X", fixed_shape=(opt.b, opt.k))
    wprev_bHn = cgt.tensor3("w", fixed_shape=(opt.b, opt.h*2, opt.n))
    rprev_bhm = cgt.tensor3("r", fixed_shape=(opt.b, opt.h, opt.m))
    controller = make_ff_controller(opt)
    M_bnm, w_bHn, r_bhm, y_bp = ntm_step(opt, Mprev_bnm, X_bk, wprev_bHn, rprev_bhm, controller)
    # in this form it looks like a standard seq-to-seq model
    # external input and output are first elements
    ntm = nn.Module([X_bk, Mprev_bnm, wprev_bHn, rprev_bhm], [y_bp, M_bnm, w_bHn, r_bhm])
    return ntm
def make_loss_and_grad(net):
    # Note: inps, logprobs, param_list, and b_size are defined in the enclosing scope.
    X_b = inps[0]  # cgt.matrix(dtype=cgt.floatX)
    y_onehot = cgt.matrix(dtype='i4')
    outputs = [logprobs]

    loss = nn.crossent(outputs[0], y_onehot) / b_size
    # gradloss = cgt.grad(loss, params)
    gradloss = cgt.grad(loss, param_list)

    # XXX use flatcat function
    grad = cgt.concatenate([x.flatten() for x in gradloss])
    # grad = gradloss

    return cgt.make_function([X_b, y_onehot], [loss, grad, logprobs])
def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True

        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)

        prev_h_scaled = nn.scale_mag(prev_h)

        h_in_added = prev_h_scaled + add_in_relu
        inters_h = [h_in_added]

        colon = slice(None, None, None)

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            # r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)

        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    # print 'len outputs:', len(outputs)
    # print 'len inputs:', len(inputs)
    return nn.Module(inputs, outputs)
def make_updater_fc_parallel():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    m = nn.Module([X, y], [loss])
    split_loss = 0
    for start in xrange(0, batch_size, batch_size // 4):
        sli = slice(start, start + batch_size // 4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)
def test_incsubtensor0():
    # First let's test fancy slice along zeroth dimension
    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)
    inds = cgt.vector(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, inds, inc)}
    f = cgt.function([inds, inc], [], updates=updates)
    f([1, 2, 4], incval)
    assert np.allclose(W.op.get_value(),
                       np.array([[0., 0., 0.],
                                 [0., 1., 2.],
                                 [3., 4., 5.],
                                 [0., 0., 0.],
                                 [6., 7., 8.]]))
def test_noncontiguous_matrix():
    x = np.arange(1, 7).reshape(2, 3).astype(cgt.floatX)
    result = np.log(x.sum(axis=0)).sum()

    xvar = cgt.matrix()
    f = cgt.function([xvar], cgt.log(xvar.sum(axis=0)).sum())

    assert np.allclose(f(np.asarray(x, order='C')), result)
    assert np.allclose(f(np.asarray(x, order='C', dtype='int64')), result)
    assert np.allclose(f(np.asarray(x, order='F')), result)

    X = np.zeros((4, 6))
    X[::2, ::2] = x
    assert np.allclose(f(X[::2, ::2]), result)
def main():
    print("Loading data...")
    X = cgt.matrix("X", fixed_shape=(None, 28*28))
    y = cgt.vector("y", dtype='i8')
    model = build_model(X, 0.0)
    loss = -cgt.mean(categorical.loglik(y, model))
    updates = nn.rmsprop(loss, nn.get_parameters(loss), 0.01)
    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)

    y_nodrop = cgt.argmax(model, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, model))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128
    Xdata, ydata = load_data()
    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]
    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]
    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(3):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
    nnbuilder.save_weights(model, 'mnist')
def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])

    g_simple, an, _ = cgt.core.simplify_and_analyze(g)

    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)

    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))

    print "-------"

    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval)**2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0))
def test_incsubtensor1():
    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)

    start = cgt.scalar(dtype='i8')
    stop = cgt.scalar(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, slice(start, stop), inc)}
    f = cgt.function([start, stop, inc], [], updates=updates)
    f(0, 3, incval)
    assert np.allclose(W.op.get_value(),
                       np.array([[0., 1., 2.],
                                 [3., 4., 5.],
                                 [6., 7., 8.],
                                 [0., 0., 0.],
                                 [0., 0., 0.]]))
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems'])

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t) for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        # if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0
    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # with shape (batchsize, ncols)
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic variable representing the rewards, which are integers
    y = cgt.scalar("y", dtype='float64')

    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden, weight_init=nn.IIDGaussian(std=.1),
                  bias_init=nn.Constant(1))(X)
    )
    # One final fully-connected layer, and then a linear activation output for reward
    output = nn.Affine(num_hidden, 1, weight_init=nn.IIDGaussian(std=.1),
                       bias_init=nn.Constant(1))(hid1)

    abs_deviation = cgt.abs(output - y).mean()
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)

    updates = [(p, p - stepsize*gp) for (p, gp) in zip(params, gparams)]
    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)
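# Usage sketch for the reward model above. The wrapping class name is an assumption
# for illustration; the (1, num_features) input shape follows the fixed_shape above.
#
#   model = RewardPredictor(num_features=4)
#   obs = np.random.randn(1, 4).astype(cgt.floatX)
#   r_hat = model.predictor(obs)       # predicted reward for one observation
#   dev = model.updater(obs, 1.0)      # one gradient step on |prediction - reward|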
def test_get_context():
    batch_size = 32
    feat_t_steps = 3
    feat_num_features = 30
    state_num_features = 20
    num_out_classes = 28
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    prev_out = cgt.matrix(fixed_shape=(batch_size, state_num_features))
    sigmoided = cgt.sigmoid(prev_out)
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          feature_size=feat_num_features, decoder_size=state_num_features)
    mm = cgt.infer_shape(s.features_post_mlp_btf)
    assert mm == (batch_size, feat_t_steps, feat_num_features)
    context_out = s.get_context(sigmoided)
    out_fun = cgt.function([feats, prev_out], [context_out])
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*state_num_features),
                      (batch_size, state_num_features))
    m = out_fun(tau, tau2)[0]
    assert m.shape == (batch_size, feat_num_features)
    assert np.mean(m) < 1
def build_bilinear_net(input_shapes, **kwargs):
    x_shape, u_shape = input_shapes
    X = cgt.tensor4('X', fixed_shape=(None,) + x_shape)
    U = cgt.matrix('U', fixed_shape=(None,) + u_shape)

    X_diff_pred = Bilinear(input_shapes, b=None, name='bilinear')(X, U)
    X_next_pred = X + X_diff_pred
    Y = X.reshape((X.shape[0], cgt.mul_multi(X.shape[1:])))
    Y_diff_pred = X_diff_pred.reshape((X_diff_pred.shape[0], cgt.mul_multi(X_diff_pred.shape[1:])))

    X_diff = cgt.tensor4('X_diff', fixed_shape=(None,) + x_shape)
    X_next = X + X_diff
    loss = ((X_next - X_next_pred) ** 2).mean(axis=0).sum() / 2.

    net_name = 'BilinearNet'
    input_vars = OrderedDict([(var.name, var) for var in [X, U, X_diff]])
    pred_vars = OrderedDict([('Y_diff_pred', Y_diff_pred), ('Y', Y), ('X_next_pred', X_next_pred)])
    return net_name, input_vars, pred_vars, loss
def make_funcs(net_in, net_out, config, dbg_out=None):
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']
    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y) ** 2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    #     out_sigma
    #     cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat ** 2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y], [net_out] + dbg_out, [loss_raw], params)
    return params, f_step, f_loss, f_grad, f_surr
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--dropout", action="store_true")
    parser.add_argument("--stepsize", type=float, default=.001)
    parser.add_argument("--model", choices=["dense", "conv"], default="dense")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--grad_check", action="store_true")
    args = parser.parse_args()

    if args.grad_check: cgt.set_precision("quad")

    # from mldata.org http://mldata.org/repository/data/viewslug/mnist-original/
    # converted to npz
    mnist = fetch_dataset("http://rll.berkeley.edu/cgt-data/mnist.npz")

    Xdata = (mnist["X"] / 255.).astype(cgt.floatX)
    ydata = mnist["y"]

    np.random.seed(0)

    if args.model == "conv":
        Xdata = Xdata.reshape(-1, 1, 28, 28)

    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]

    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]

    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28)) if args.model == "conv" else cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')

    if args.model == "dense":
        p_drop_input, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w_h = init_weights(784, 256)
        w_h2 = init_weights(256, 256)
        w_o = init_weights(256, 10)
        pofy_drop = dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden)
        pofy_nodrop = dense_model(X, w_h, w_h2, w_o, 0., 0.)
        params = [w_h, w_h2, w_o]
    elif args.model == "conv":
        p_drop_conv, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w = init_weights(32, 1, 3, 3)
        w2 = init_weights(64, 32, 3, 3)
        w3 = init_weights(128, 64, 3, 3)
        w4 = init_weights(128 * 2 * 2, 625)
        w_o = init_weights(625, 10)
        pofy_drop = convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden)
        pofy_nodrop = convnet_model(X, w, w2, w3, w4, w_o, 0., 0.)
        params = [w, w2, w3, w4, w_o]
    else:
        raise RuntimeError("Unreachable")

    cost_drop = -cgt.mean(categorical.loglik(y, pofy_drop))
    updates = rmsprop_updates(cost_drop, params, stepsize=args.stepsize)

    y_nodrop = cgt.argmax(pofy_nodrop, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, pofy_nodrop))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128

    from cgt.tests import gradcheck_model
    if args.grad_check:
        cost_nodrop = cgt.core.clone(cost_nodrop, {X: Xtrain[:1], y: ytrain[:1]})
        print "doing gradient check..."
        print "------------------------------------"
        gradcheck_model(cost_nodrop, params[0:1])
        print "success!"
        return

    if args.profile: cgt.profiler.start()

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
            if args.unittest: return
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
    if args.profile: cgt.execution.profiler.print_stats()
import cgt
from cgt import nn, utils
import numpy as np, numpy.random as nr
from numpy.linalg import norm
from param_collection import ParamCollection

k_in = 1
size_x = 3
size_mem = 4
size_batch = 4

x = cgt.matrix(fixed_shape=(size_batch, size_x))
prev_h = cgt.matrix(fixed_shape=(size_batch, size_mem))

r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
r_norm = cgt.norm(r_non, axis=2, keepdims=True)
r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
inters = [prev_h_3]

for i in xrange(k_in * 2):
    inter_in = inters[-1]
    r_cur = r[:, i, :]
    r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
    r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
    ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
    inter_out = inter_in - ref_cur
    inters.append(inter_out)

h = inters[-1]
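# Minimal check (an added sketch, not part of the original script): compile the graph
# above into a callable and confirm the output shape on random data.
f_rot = cgt.function([x, prev_h], cgt.reshape(h, (size_batch, size_mem)))
h_val = f_rot(nr.randn(size_batch, size_x).astype(cgt.floatX),
              nr.randn(size_batch, size_mem).astype(cgt.floatX))
assert h_val.shape == (size_batch, size_mem)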