def get_context(self, prev_state_bf):
    state_step_bf = self.states_mlp_bf(prev_state_bf)
    state_step_b1f = cgt.dimshuffle(state_step_bf, [0, 'x', 1])
    # Compute the inner product <phi(s_i), psi(h_u)> where phi and psi are MLPs.
    # The line below computes the pointwise product of phi(s_i) and psi(h_u) and then sums to get the inner product.
    # scalar_energies_vec_bt = cgt.sqrt(cgt.sum(cgt.broadcast('*', state_step_b1f, self.features_post_mlp_btf, 'x1x,xxx'), axis=2))
    # Compute tau=tanh(h_u*W + s_i*V), broadcasting to do all h_u mults at once.
    scalar_energies_vec_btf = cgt.tanh(cgt.broadcast('+', self.features_post_mlp_btf, state_step_b1f, 'xxx,x1x'))
    # The next two lines compute w^T*(tau) with a pointwise product and then a sum.
    scalar_energies_vec_btf = cgt.broadcast('*', self.mixing_vec_w, scalar_energies_vec_btf, '11x,xxx')
    scalar_energies_vec_bt = cgt.sum(scalar_energies_vec_btf, axis=2)
    # Softmax weights the blended features over their time dimension.
    softmax_weights_bt = nn.softmax(scalar_energies_vec_bt, axis=1)
    # This weight multiplies all features.
    extended_softmax_bt1 = cgt.dimshuffle(softmax_weights_bt, [0, 1, 'x'])
    # Weight the features by their temporally dependent softmax weights.
    pre_blended = cgt.broadcast('*', extended_softmax_bt1, self.features_post_mlp_btf, 'xx1,xxx')
    # Integrate out time.
    blended_features_bf = cgt.sum(pre_blended, axis=1)
    return blended_features_bf
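The function above is the attention step of the decoder: it scores every encoder time step against the current decoder state and returns a softmax-weighted blend of the features. A minimal NumPy sketch of the same computation may help with the shape bookkeeping; the array names here are hypothetical stand-ins for the class attributes.

import numpy as np

batch, time_steps, feat = 2, 5, 4
features_btf = np.random.randn(batch, time_steps, feat)   # psi(h_u) for every time step
state_b1f = np.random.randn(batch, 1, feat)               # phi(s_i), broadcast over time
w_f = np.random.randn(feat)                               # mixing vector w

tau_btf = np.tanh(features_btf + state_b1f)               # tanh(h_u*W + s_i*V)
energies_bt = (tau_btf * w_f).sum(axis=2)                 # w^T tau, one scalar per (b, t)
weights_bt = np.exp(energies_bt)
weights_bt /= weights_bt.sum(axis=1, keepdims=True)       # softmax over the time axis
blended_bf = (weights_bt[:, :, None] * features_btf).sum(axis=1)  # weighted sum over time
assert blended_bf.shape == (batch, feat)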
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch
    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs, [loss_vec], params,
                                _dbg_out=dbg_out)
    return params, f_step, None, None, None, f_surr
def logprob(self, x, mu, sigma):
    """ Calculate logprob for each row of x, mu, sigma """
    assert sigma.ndim == mu.ndim == x.ndim == 2
    k = x.shape[1]
    log_det = cgt.sum(cgt.log(sigma), axis=1, keepdims=True)
    prob_z = -.5 * (k * np.log(2. * np.pi) + log_det)
    prob_e = cgt.sum(-.5 * sigma * ((x - mu) ** 2), axis=1, keepdims=True)
    # output shape: (size_batch, 1)
    return prob_z + prob_e
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)
    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])
def test_devices():
    N = 10
    K = 3
    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")
    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)
    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):
        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")
        print "bval", bval
        ypred = cgt.dot(cgt.square(X_nk), w_k) + b
        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err] + g
        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(results[0], np.sin(np.square(Xval).dot(wval) + bval - yval).sum())
def __init__(self, x, n_in, n_hid, n_out, nlayers=1, y=None, eps=None):
    super(GaussianMLP, self).__init__(x, n_in, n_hid, nlayers=nlayers, prefix="GaussianMLP_hidden")
    self.mu_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=None,
        prefix="GaussianMLP_mu"
    )
    # log(sigma^2)
    self.logvar_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=None,
        prefix="GaussianMLP_logvar"
    )
    self.mu = self.mu_layer.output
    self.var = cgt.exp(self.logvar_layer.output)
    self.sigma = cgt.sqrt(self.var)
    self.params = self.params + self.mu_layer.params + \
        self.logvar_layer.params
    # for use as encoder
    if eps is not None:
        assert y is None
        self.out = self.mu + self.sigma * eps
    # for use as decoder
    if y:
        assert eps is None
        self.out = cgt.sigmoid(self.mu)
        self.cost = -cgt.sum(log_diag_mvn(self.out, self.var)(y))
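The `eps` branch above is the standard reparameterization trick: a sample from N(mu, diag(sigma^2)) is produced deterministically from mu, sigma and externally supplied standard-normal noise. A small NumPy illustration of that expression, not part of the class:

import numpy as np

mu = np.array([1.0, -2.0, 0.5])
sigma = np.array([0.5, 1.0, 2.0])
eps = np.random.randn(100000, 3)     # noise fed in from outside, as self.eps is
z = mu + sigma * eps                 # same expression as self.out above
# Empirical moments should match the intended Gaussian.
assert np.allclose(z.mean(axis=0), mu, atol=0.05)
assert np.allclose(z.std(axis=0), sigma, atol=0.05)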
def get_train_objective(self, max_label_length, ground_labels_basis_btc):
    context_i_bf = parameter(init_array(IIDUniform(-0.1, 0.1), (self.batch_size, self.feature_size)), name=None)
    state_i_bf = parameter(init_array(IIDUniform(-0.1, 0.1), (self.batch_size, self.decoder_size)), name=None)
    prev_out_bc = cgt.zeros((self.batch_size, self.true_number_classes), dtype='i8')  # + self.start_token_index
    log_probs = None
    for iter_step in range(0, max_label_length):
        state_i_bf = self.get_decoder_state(context_i_bf, prev_out_bc, state_i_bf)
        context_i_bf = self.get_context(state_i_bf)
        this_character_dist_bc = self.get_character_distribution(state_i_bf, context_i_bf)
        prev_out_bc = ground_labels_basis_btc[:, iter_step, :]
        log_probs_pre = prev_out_bc * this_character_dist_bc
        log_probs_pre = cgt.log(cgt.sum(log_probs_pre, axis=1))
        if log_probs is None:
            log_probs = cgt.sum(log_probs_pre)
        else:
            log_probs += cgt.sum(log_probs_pre)
    log_probs = -log_probs
    return log_probs
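The two lines inside the loop that build `log_probs_pre` rely on `ground_labels_basis_btc` being one-hot per time step: multiplying by the one-hot row and summing over the class axis simply picks out the predicted probability of the true character. A tiny NumPy illustration of that selection:

import numpy as np

this_character_dist_bc = np.array([[0.1, 0.7, 0.2]])   # predicted class distribution
prev_out_bc = np.array([[0.0, 1.0, 0.0]])              # one-hot ground-truth character
picked = (prev_out_bc * this_character_dist_bc).sum(axis=1)
assert np.allclose(np.log(picked), np.log(0.7))        # log-prob of the true class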
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems']
    )
    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t)
             for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T
    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        # if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)
    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0
    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
def __init__(self, x, n_in, n_hid, n_out, nlayers=1, y=None):
    super(BernoulliMLP, self).__init__(x, n_in, n_hid, nlayers=nlayers, prefix="BernoulliMLP_hidden")
    self.out_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=cgt.sigmoid,
        prefix="BernoulliMLP_y_hat"
    )
    self.params = self.params + self.out_layer.params
    if y is not None:
        self.out = self.out_layer.output
        self.cost = cgt.sum(nn.binary_crossentropy(self.out, y))
def __init__(self, xdim, args, dec="bernoulli"): self.xdim = xdim self.hdim = args.hdim self.zdim = args.zdim self.lmbda = args.lmbda # weight decay coefficient * 2 self.x = cgt.matrix("x", dtype=cgt.floatX) self.eps = cgt.matrix("eps", dtype=cgt.floatX) self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps) if dec == "bernoulli": # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y) self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x) elif dec == "gaussian": self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x) else: raise RuntimeError("unrecognized decoder %" % dec) self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size self.params = self.enc_mlp.params + self.dec_mlp.params # L2 regularization self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params] self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params] # XXX replace w/ adagrad update from nn ADAGRAD_EPS = 1e-10 # for stability self.updates = [ (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS)) for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums) ] self.updates += [ (gaccum, gaccum + cgt.square(gparam)) for gaccum, gparam in zip(self.gaccums, self.gparams) ] self.train = cgt.function( [self.x, self.eps], self.cost, updates=self.updates ) self.test = cgt.function( [self.x, self.eps], self.cost, updates=None ) # can be used for semi-supervised learning for example self.encode = cgt.function( [self.x, self.eps], self.enc_mlp.out )
def make_funcs(net_in, net_out, config, dbg_out=None):
    if dbg_out is None:
        dbg_out = []
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']
    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y) ** 2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    #     out_sigma
    #     # cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat ** 2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y], [net_out] + dbg_out, [loss_raw], params)
    return params, f_step, f_loss, f_grad, f_surr
def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval) ** 2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0))
def get_context_backup(self, prev_state_bf):
    state_step_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))
    product_list = []
    for time_step in range(0, 3):
        inner_product = cgt.sum(state_step_bf * self.features_post_mlp_btf[:, time_step, :], axis=1)
        product_list.append(inner_product)
    st = cgt.stack(product_list)
    st = cgt.dimshuffle(st, [1, 0])
    softmax_weights = softmax(st)
    sum = None
    for time_step in range(0, 3):
        softmax_t_step = cgt.dimshuffle(softmax_weights[:, time_step], [0, 'x'])
        if sum is None:
            sum = cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
        else:
            sum += cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
    return sum
def test_linreg():
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    atol = {"single": 1e-3, "double": 1e-6}[cgt.get_precision()]
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval) ** 2,
                               atol=atol)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval),
                               atol=atol)
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0),
                               atol=atol)
X_tnk = cgt.tensor3("X") cell = gru.GRUCell([dim_x], mem_size) Minit_nk = cgt.zeros((X_tnk.shape[0], X_tnk.shape[1]), cgt.floatX) M = Minit_nk for t in xrange(horizon): M = cell(M, X_tnk[t]) # cgt.print_tree(M) print "simplifying..." M_simp = cgt.simplify([M]) print "done" # cgt.print_tree(M_simp) print "fn before:", cgt.count_nodes(M) print "fn after:", cgt.count_nodes(M_simp) gs = cgt.grad(cgt.sum(M), cell.params()) print "grad before", cgt.count_nodes(gs) g_simp = cgt.simplify(gs) print "grad after", cgt.count_nodes(g_simp) # M = cgt.simplify(M) elapsed.append(time() - tstart) import matplotlib.pyplot as plt plt.plot(horizons, elapsed, 'x-') plt.show()
def normalize(var):
    return cgt.broadcast("/", var, cgt.sum(var, axis=2, keepdims=True), "xxx,xx1")
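The broadcast pattern "xxx,xx1" divides a rank-3 tensor by its sum along the last axis, so every slice along that axis ends up summing to one. In plain NumPy the same normalization is:

import numpy as np

var = np.random.rand(2, 3, 4) + 0.1
normalized = var / var.sum(axis=2, keepdims=True)
assert np.allclose(normalized.sum(axis=2), 1.0)   # each slice along axis 2 now sums to 1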
def loglik(self, x, p):
    """ Log likelihood of params on dataset x """
    return cgt.sum(self.logprob(x, p))
def prod(x, axis=None, keepdims=False):
    """ Like numpy.prod """
    return cgt.exp(cgt.sum(cgt.log(x), axis=axis, keepdims=keepdims))
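The rewrite of prod as exp(sum(log(x))) only holds for strictly positive x (the log produces -inf or NaN otherwise). A quick NumPy check of the identity:

import numpy as np

x = np.random.rand(4, 5) + 0.1                 # strictly positive entries
assert np.allclose(np.exp(np.log(x).sum(axis=1)), np.prod(x, axis=1))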
def f(x):
    # expects batches
    k = mu.shape[1]
    logp = ((-k / 2.0) * np.log(2 * np.pi)
            - 0.5 * cgt.sum(cgt.log(var), axis=1)
            - cgt.sum(0.5 * (1.0 / var) * (x - mu) * (x - mu), axis=1))
    return logp
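This is the log density of a diagonal Gaussian, log N(x; mu, diag(var)) = -k/2*log(2*pi) - 1/2*sum(log var) - 1/2*sum((x - mu)^2 / var). A standalone NumPy version can be checked against scipy (a sanity check, not part of the original code):

import numpy as np
from scipy.stats import multivariate_normal

mu = np.array([0.5, -1.0])
var = np.array([0.3, 2.0])
x = np.random.randn(5, 2)
k = mu.shape[0]
logp = ((-k / 2.0) * np.log(2 * np.pi) - 0.5 * np.sum(np.log(var))
        - np.sum(0.5 * (1.0 / var) * (x - mu) ** 2, axis=1))
assert np.allclose(logp, multivariate_normal(mean=mu, cov=np.diag(var)).logpdf(x))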
def kld_unit_mvn(mu, var):
    # KL divergence from N(0, I)
    return (mu.shape[1] + cgt.sum(cgt.log(var), axis=1)
            - cgt.sum(cgt.square(mu), axis=1) - cgt.sum(var, axis=1)) / 2.0
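Note that this expression equals the negative of the textbook KL divergence KL(N(mu, diag(var)) || N(0, I)) = 1/2 * sum(var + mu^2 - 1 - log var), which is why the VAE cost above negates it. A quick NumPy check of that identity:

import numpy as np

mu = np.random.randn(3, 4)
var = np.random.rand(3, 4) + 0.1
textbook_kl = 0.5 * np.sum(var + mu ** 2 - 1.0 - np.log(var), axis=1)
returned = (mu.shape[1] + np.log(var).sum(axis=1)
            - (mu ** 2).sum(axis=1) - var.sum(axis=1)) / 2.0
assert np.allclose(returned, -textbook_kl)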
X_tnk = cgt.tensor3("X") cell = gru.GRUCell([dim_x], mem_size) Minit_nk = cgt.zeros((X_tnk.shape[0], X_tnk.shape[1]),cgt.floatX) M = Minit_nk for t in xrange(horizon): M = cell(M, X_tnk[t]) # cgt.print_tree(M) print "simplifying..." M_simp = cgt.simplify([M]) print "done" # cgt.print_tree(M_simp) print "fn before:",cgt.count_nodes(M) print "fn after:",cgt.count_nodes(M_simp) gs = cgt.grad(cgt.sum(M), cell.params()) print "grad before", cgt.count_nodes(gs) g_simp = cgt.simplify(gs) print "grad after",cgt.count_nodes(g_simp) # M = cgt.simplify(M) elapsed.append(time()-tstart) import matplotlib.pyplot as plt plt.plot(horizons,elapsed,'x-') plt.show()
def sum(x, axis=None):
    return cgt.sum(x, axis=axis)
def test_the_test_problem():
    # Works
    batch_size = 32  # How many samples you want to batch.
    feat_t_steps = 20  # How many 10ms sound clips.
    feat_num_features = 10  # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # The maximal label length of the transcription. Includes the start character.
    num_out_classes = 27
    num_out_classes_true = num_out_classes + 2
    num_batches = 756
    num_epochs = 30

    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))

    last_time = time.time()
    print 'initializing temporal dense layer'
    d1 = nnbuilder.temporalDenseLayer(feats, num_units=128, activation=cgt.sigmoid)
    #d2 = nnbuilder.temporalDenseLayer(d1, num_units=128, activation=cgt.sigmoid)
    d3 = nnbuilder.temporalDenseLayer(d1, num_units=num_out_classes_true, activation=nnbuilder.linear)
    out = nn.three_d_softmax(d3, axis=2)

    log_probs = None
    for iter_step in range(0, max_label_length):
        this_character_dist_bc = out[:, iter_step, :]
        prev_out_bc = ground_labels_basis[:, iter_step, :]
        log_probs_pre = prev_out_bc * this_character_dist_bc
        log_probs_pre = cgt.log(cgt.sum(log_probs_pre, axis=1))
        if log_probs is None:
            log_probs = cgt.sum(log_probs_pre)
        else:
            log_probs += cgt.sum(log_probs_pre)
    log_probs = -log_probs
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'compiling objective function'
    updates = nn.rmsprop(log_probs, nn.get_parameters(log_probs), learning_rate=0.01)
    pred_train = cgt.function([feats, ground_labels_basis], [], updates=updates)
    pred_fun = cgt.function([feats, ground_labels_basis], [log_probs])
    most_likely_chars = cgt.argmax(out, axis=1)
    actual_predictions = cgt.function([feats, ground_labels_basis], [most_likely_chars])
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)

    print 'now training'
    for one_epoch in range(0, num_epochs):
        trained = 0
        last_time = time.time()
        print 'starting epoch ' + str(one_epoch)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps, data_mean, data_sd,
                                                             test_labels, num_out_classes_true)
            pred_train(batch, labels_basis)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps, data_mean, data_sd,
                                                             test_labels, num_out_classes_true)
            trained += pred_fun(batch, labels_basis)[0]

        trained = trained / batch_iter
        print 'train loss is ' + str(trained)
        print 'that took ' + str(time.time() - last_time) + ' seconds'

        act_pred = actual_predictions(batch, labels_basis)[0]
        print 'an actual prediction is '
        print act_pred
def loglik(self, x, mu, sigma):
    return cgt.sum(self.logprob(x, mu, sigma))