def test_einsum():
    x = cgt.tensor3()
    y = cgt.tensor3()
    sizes = {'i': 2, 'j': 3, 'k': 5, 'l': 7}
    xaxes = 'ijk'
    yaxes = 'ikl'
    zaxes = 'ijl'
    for _ in xrange(10):
        xperm = xaxes
        (yperm, zperm) = permaxes = [[chars[j] for j in np.random.permutation(3)]
                                     for chars in [yaxes, zaxes]]
        desc = "%s,%s->%s" % tuple("".join(chars) for chars in [xperm] + permaxes)
        z = cgt.einsum(desc, x, y)
        xval = nr.randn(*(sizes[c] for c in xperm))
        yval = nr.randn(*(sizes[c] for c in yperm))
        np.testing.assert_allclose(
            cgt.numeric_eval(z, {x: xval, y: yval}),
            np.einsum(desc, xval, yval),
            atol={"single": 1e-3, "double": 1e-6}[cgt.get_precision()])
def make_ntm(opt):
    Mprev_bnm = cgt.tensor3("M", fixed_shape=(opt.b, opt.n, opt.m))
    X_bk = cgt.matrix("X", fixed_shape=(opt.b, opt.k))
    wprev_bHn = cgt.tensor3("w", fixed_shape=(opt.b, opt.h*2, opt.n))
    rprev_bhm = cgt.tensor3("r", fixed_shape=(opt.b, opt.h, opt.m))
    controller = make_ff_controller(opt)
    M_bnm, w_bHn, r_bhm, y_bp = ntm_step(opt, Mprev_bnm, X_bk, wprev_bHn, rprev_bhm, controller)
    # In this form it looks like a standard seq-to-seq model:
    # external input and output are the first elements.
    ntm = nn.Module([X_bk, Mprev_bnm, wprev_bHn, rprev_bhm],
                    [y_bp, M_bnm, w_bHn, r_bhm])
    return ntm
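# A minimal sketch (not from the original source) of unrolling the module
# returned by make_ntm over time. It assumes `opt` and `make_ntm_initial_states`
# from the surrounding code and a hypothetical horizon T; the state ordering
# [M, w, r] matches the module definition above and the make_funcs driver below.
T = 10  # hypothetical unroll horizon
ntm = make_ntm(opt)
x_tbk = cgt.tensor3("x", fixed_shape=(T, opt.b, opt.k))
state_arrs = make_ntm_initial_states(opt)  # assumed to return [M_bnm, w_bHn, r_bhm]
preds = []
for t in xrange(T):
    out = ntm([x_tbk[t]] + state_arrs)
    preds.append(out[0])     # y_bp: external output at step t
    state_arrs = out[1:]     # memory, write weights, reads carried forward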
def test_get_train_objective():
    batch_size = 32
    feat_t_steps = 5
    feat_num_features = 256
    max_label_length = 5
    num_out_classes = 27
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes))
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    train_shape = cgt.infer_shape(train_objective)
    assert train_shape == ()
    nn.get_parameters(train_objective)
def test_get_character_distribution():
    batch_size = 32
    feat_t_steps = 20
    feat_num_features = 42
    num_out_classes = 28  # This is the index of the start token.
    num_out_classes_true = num_out_classes + 2  # Start and end tokens are added automatically.
    decoder_size = 50
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_num_features),
                      (batch_size, feat_num_features))
    tau3 = np.reshape(np.random.normal(0.1, 0.2, batch_size*decoder_size),
                      (batch_size, decoder_size))
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          decoder_size=decoder_size, feature_size=feat_num_features)
    context_bf = cgt.matrix(fixed_shape=(batch_size, feat_num_features))
    state_bf = cgt.matrix(fixed_shape=(batch_size, decoder_size))
    m_out = s.get_character_distribution(state_bf, context_bf)
    out_fun = cgt.function([feats, context_bf, state_bf], [m_out])
    m = out_fun(tau, tau2, tau3)[0]
    assert m.shape == (batch_size, num_out_classes_true)
def make_ff_controller(opt):
    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k
    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b, h, m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b, k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh
    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # Feedforward part. We could simplify a bit by using nn.Affine.
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i" % i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i" % i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply the nonlinearity at the last layer
        if i != len(layer_out_sizes)-2:
            last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:, idx:idx+H*m];   idx += H*m;  k_bHm = k_bHm.reshape([b, H, m])
    beta_bH = last_out[:, idx:idx+H];   idx += H
    g_bH = last_out[:, idx:idx+H];      idx += H
    s_bH3 = last_out[:, idx:idx+3*H];   idx += 3*H;  s_bH3 = s_bH3.reshape([b, H, 3])
    gamma_bH = last_out[:, idx:idx+H];  idx += H
    e_bhm = last_out[:, idx:idx+h*m];   idx += h*m;  e_bhm = e_bhm.reshape([b, h, m])
    a_bhm = last_out[:, idx:idx+h*m];   idx += h*m;  a_bhm = a_bhm.reshape([b, h, m])
    y_bp = last_out[:, idx:idx+p];      idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH) + 1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp is left linear

    assert infer_shape(k_bHm) == (b, H, m)
    assert infer_shape(beta_bH) == (b, H)
    assert infer_shape(g_bH) == (b, H)
    assert infer_shape(s_bH3) == (b, H, 3)
    assert infer_shape(gamma_bH) == (b, H)
    assert infer_shape(e_bhm) == (b, h, m)
    assert infer_shape(a_bhm) == (b, h, m)
    assert infer_shape(y_bp) == (b, p)

    return nn.Module([r_bhm, X_bk],
                     [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
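# Sanity check (added, not from the original source): the eight slice widths
# taken from last_out above exactly tile the controller's output layer.
# Sizes here are hypothetical stand-ins for opt.h, opt.m, opt.p.
h, m, p = 4, 8, 3
H = 2 * h
widths = [H*m, H, H, 3*H, H, h*m, h*m, p]  # k, beta, g, s, gamma, e, a, y
out_size = H*m + H + H + H*3 + H + h*m + h*m + p
assert sum(widths) == out_size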
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']:
        dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch
    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs,
                                [loss_vec], params, _dbg_out=dbg_out)
    return params, f_step, None, None, None, f_surr
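# A hedged usage sketch for the factory above. The config keys are the ones
# make_funcs reads; the values, the list types for num_units/num_sto, and the
# (batch, num_inputs) input layout are assumptions, not from the original source.
config = dict(num_inputs=4, num_outputs=2, num_units=[16], num_sto=[1],
              dbg_out_full=False, no_bias=False, weight_decay=0.)
params, f_step, _, _, _, f_surr = make_funcs(config)
X = np.random.randn(32, config['num_inputs'])  # assumed input layout
net_out_val = f_step(X)[0]                     # one forward pass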
def test_get_decoder_state():
    batch_size = 32
    feat_t_steps = 20
    feat_num_features = 42
    num_out_classes = 28
    num_out_classes_true = num_out_classes + 2  # Start and end tokens are added.
    decoder_size = 50
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_num_features),
                      (batch_size, feat_num_features))
    tau3 = np.reshape(np.random.normal(0.1, 0.2, batch_size*num_out_classes_true),
                      (batch_size, num_out_classes_true))
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          decoder_size=decoder_size, feature_size=feat_num_features)
    context_bf = cgt.matrix(fixed_shape=(batch_size, feat_num_features))
    prev_out_bc = cgt.matrix(fixed_shape=(batch_size, num_out_classes_true))
    state_i_bf = nn.parameter(nn.init_array(nn.IIDGaussian(0.1), (batch_size, decoder_size)),
                              name="decoder_init")
    decoder_out = s.get_decoder_state(context_bf, prev_out_bc, state_i_bf)
    decode_fun = cgt.function([feats, context_bf, prev_out_bc], [decoder_out])
    m = decode_fun(tau, tau2, tau3)[0]
    assert m.shape == (batch_size, decoder_size)
    assert np.mean(m) < 1.0
def make_loss_and_grad_and_step(arch, size_input, size_output, size_mem, size_batch,
                                n_layers, n_unroll, k_in, k_h):
    # symbolic variables
    x_tnk = cgt.tensor3()
    targ_tnk = cgt.tensor3()
    # make_network = make_deep_lstm if arch == "lstm" else make_deep_gru
    make_network = make_deep_rrnn_rot_relu
    network = make_network(size_input, size_mem, n_layers, size_output, size_batch, k_in, k_h)
    init_hiddens = [cgt.matrix() for _ in xrange(get_num_hiddens(arch, n_layers))]
    # TODO fixed sizes
    cur_hiddens = init_hiddens
    loss = 0
    for t in xrange(n_unroll):
        outputs = network([x_tnk[t]] + cur_hiddens)
        cur_hiddens, prediction_logprobs = outputs[:-1], outputs[-1]
        # loss = loss + nn.categorical_negloglik(prediction_probs, targ_tnk[t]).sum()
        loss = loss - (prediction_logprobs * targ_tnk[t]).sum()
    final_hiddens = cur_hiddens
    loss = loss / (n_unroll * size_batch)

    params = network.get_parameters()
    gradloss = cgt.grad(loss, params)
    flatgrad = flatcat(gradloss)

    with utils.Message("compiling loss+grad"):
        f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens,
                                       [loss, flatgrad] + final_hiddens)
        f_loss = cgt.function([x_tnk, targ_tnk] + init_hiddens, loss)
    assert len(init_hiddens) == len(final_hiddens)

    x_nk = cgt.matrix('x')
    outputs = network([x_nk] + init_hiddens)
    f_step = cgt.function([x_nk] + init_hiddens, outputs)
    # print "node count", cgt.count_nodes(flatgrad)
    return network, f_loss, f_loss_and_grad, f_step
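# A sketch (assumptions flagged) of driving the compiled functions. Shapes
# follow the symbolic variables above; get_num_hiddens comes from the
# surrounding code, and all sizes here are hypothetical.
arch, n_layers, n_unroll = "rrnn", 2, 16
size_input = size_output = 64
size_mem, size_batch, k_in, k_h = 128, 32, 2, 2
network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
    arch, size_input, size_output, size_mem, size_batch, n_layers, n_unroll, k_in, k_h)
x = np.random.randn(n_unroll, size_batch, size_input).astype(cgt.floatX)
targ = np.zeros((n_unroll, size_batch, size_output), cgt.floatX)  # one-hot targets in practice
hiddens = [np.zeros((size_batch, size_mem), cgt.floatX)
           for _ in xrange(get_num_hiddens(arch, n_layers))]
out = f_loss_and_grad(x, targ, *hiddens)
loss_val, flatgrad_val, final_hiddens = out[0], out[1], out[2:]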
def test_seq2seq_init():
    batch_size = 32
    feat_t_steps = 5
    feat_num_features = 256
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=28)
    # assert type(s.recurrent_decoder_one) == nnbuilder.GRULayer
    assert s.get_features_fun == s.get_features_bengio
def test_make_prediction():
    batch_size = 32            # How many samples to batch.
    feat_t_steps = 20          # How many 10ms sound clips.
    feat_num_features = 10     # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # Maximal transcription length, including the start character.
    num_out_classes = 27
    num_out_classes_true = 27 + 2
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))

    last_time = time.time()
    print 'initializing seq2seq'
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'making prediction objective'
    pred = seq2seq.make_prediction(ground_labels_basis_btc=ground_labels_basis,
                                   max_label_length=feat_t_steps)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'compiling pred function'
    pred_fun = cgt.function([feats, ground_labels_basis], [pred])
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)  # sample standard deviation for standardization

    print 'now predicting'
    last_time = time.time()
    batch_iter = 0
    batch = test_data[batch_iter, :, 0:feat_t_steps, :]
    batch = (batch - data_mean) / data_sd
    labels = test_labels[batch_iter, :, 0:feat_t_steps]
    labels_basis = ind_to_basis(num_out_classes_true, labels)
    prediction_final = pred_fun(batch, labels_basis)[0]
    one_pred = prediction_final[0]
    print 'that took ' + str(time.time() - last_time) + ' seconds'
def test_einsum():
    cgt.reset_config()
    cgt.set_precision("double")
    x = cgt.tensor3()
    y = cgt.tensor3()
    sizes = {'i': 2, 'j': 3, 'k': 5, 'l': 7}
    xaxes = 'ijk'
    yaxes = 'ikl'
    zaxes = 'ijl'
    for _ in xrange(10):
        xperm = xaxes
        (yperm, zperm) = permaxes = [[chars[j] for j in np.random.permutation(3)]
                                     for chars in [yaxes, zaxes]]
        desc = "%s,%s->%s" % tuple("".join(chars) for chars in [xperm] + permaxes)
        z = cgt.einsum(desc, x, y)
        xval = nr.randn(*(sizes[c] for c in xperm))
        yval = nr.randn(*(sizes[c] for c in yperm))
        np.testing.assert_allclose(cgt.numeric_eval(z, {x: xval, y: yval}),
                                   np.einsum(desc, xval, yval))
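# NumPy reference (added for clarity) for the descriptor these tests build:
# "ijk,ikl->ijl" contracts the shared axis k, i.e. a matrix product batched
# over axis i. Sizes follow the tests' size table (i=2, j=3, k=5, l=7).
import numpy as np
a = np.random.randn(2, 3, 5)   # axes i, j, k
b = np.random.randn(2, 5, 7)   # axes i, k, l
c = np.einsum("ijk,ikl->ijl", a, b)
assert c.shape == (2, 3, 7)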
def make_funcs(opt, ntm, total_time, loss_timesteps):
    x_tbk = cgt.tensor3("x", fixed_shape=(total_time, opt.b, opt.k))
    y_tbp = cgt.tensor3("y", fixed_shape=(total_time, opt.b, opt.p))
    loss_timesteps = set(loss_timesteps)

    initial_states = make_ntm_initial_states(opt)
    params = ntm.get_parameters() + get_parameters(initial_states)
    # params = ntm.get_parameters()

    lossCE = 0
    loss01 = 0
    state_arrs = initial_states
    for t in xrange(total_time):
        tmp = ntm([x_tbk[t]] + state_arrs)
        raw_pred = tmp[0]
        state_arrs = tmp[1:4]
        if t in loss_timesteps:
            p_pred = cgt.sigmoid(raw_pred)
            # cross-entropy of Bernoulli distribution
            ce = bernoulli_crossentropy(y_tbp[t], p_pred).sum()
            lossCE = lossCE + ce
            loss01 = loss01 + cgt.cast(cgt.equal(y_tbp[t], round01(p_pred)), cgt.floatX).sum()

    lossCE = lossCE / (len(loss_timesteps) * opt.p * opt.b) / np.log(2)
    loss01 = loss01 / (len(loss_timesteps) * opt.p * opt.b)
    gradloss = cgt.grad(lossCE, params)
    flatgrad = flatcat(gradloss)

    f_loss = cgt.function([x_tbk, y_tbp], lossCE)
    f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])
    print "number of nodes in computation graph:", core.count_nodes([lossCE, loss01, flatgrad])
    return f_loss, f_loss_and_grad, params
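# A hedged sketch of calling the compiled NTM functions on random copy-task
# style data. `opt` and `ntm` are assumed from the surrounding code; the
# horizon and loss timesteps are hypothetical.
total_time, loss_timesteps = 8, range(4, 8)
f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, total_time, loss_timesteps)
x = np.random.randint(0, 2, (total_time, opt.b, opt.k)).astype(cgt.floatX)
y = np.random.randint(0, 2, (total_time, opt.b, opt.p)).astype(cgt.floatX)
lossCE, loss01, flatgrad = f_loss_and_grad(x, y)  # bits per target bit, 0-1 score, flat gradient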
def test_get_features_default():
    batch_size = 32
    feat_t_steps = 20
    feat_num_features = 42
    num_out_classes = 28
    num_units = 20
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    out = s.get_features_fun(feats, num_units=num_units)
    out_fun = cgt.function([feats], [out])
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    m = out_fun(tau)[0]
    assert m.shape == (batch_size, feat_t_steps, num_units)
    assert np.mean(m) < 1
def main(num_epochs=NUM_EPOCHS):
    # cgt.set_precision('half')
    print("Building network ...")
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    X = cgt.tensor3(name='X', fixed_shape=(N_BATCH, MAX_LENGTH, 2))
    l_forward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN)
    l_backward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN, backwards=True)
    # l_forward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid)
    # l_backward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid, backwards=True)
    # l_forward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify)
    # l_backward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify, backwards=True)

    # Take the last element in the forward slice time dimension ...
    l_forward_slice = l_forward[:, MAX_LENGTH-1, :]
    # ... and the first element in the backward slice time dimension.
    l_backward_slice = l_backward[:, 0, :]
    l_sum = cgt.concatenate([l_forward_slice, l_backward_slice], axis=1)
    l_out = nnbuilder.denseLayer(l_sum, num_units=1, activation=cgt.tanh)

    target_values = cgt.vector('target_output')
    predicted_values = l_out[:, 0]  # For this task we only need the last value
    cost = cgt.mean((predicted_values - target_values) ** 2)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = nn.rmsprop(cost, nn.get_parameters(l_out), LEARNING_RATE)
    # updates = nn.nesterov_momentum(cost, nn.get_parameters(l_out), 0.05)

    # cgt functions for training and computing cost
    print("Compiling functions ...")
    train = cgt.function([X, target_values], cost, updates=updates)
    compute_cost = cgt.function([X, target_values], cost)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()

    print("Training ...")
    time_start = time.time()
    try:
        for epoch in range(num_epochs):
            for _ in range(EPOCH_SIZE):
                # Renamed from X, y to avoid shadowing the symbolic variable X above.
                X_batch, y_batch, _mask = gen_data()
                train(X_batch, y_batch)
            cost_val = compute_cost(X_val, y_val)
            print("Epoch {} validation cost = {}".format(epoch + 1, cost_val))
            print('Epoch took ' + str(time.time() - time_start))
            time_start = time.time()
    except KeyboardInterrupt:
        pass
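# gen_data is not defined in this snippet. A minimal stand-in consistent with
# the shapes the network above expects (hypothetical task; replace with the
# real data generator):
def gen_data():
    X = np.random.rand(N_BATCH, MAX_LENGTH, 2).astype(cgt.floatX)
    mask = np.ones((N_BATCH, MAX_LENGTH), cgt.floatX)
    y = X[:, :, 0].mean(axis=1).astype(cgt.floatX)  # arbitrary target in [0, 1]
    return X, y, mask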
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems'])

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t)
             for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        # if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0

    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
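# A hedged usage sketch: step the LSTM network one tick at a time. The config
# values are hypothetical; the (batch, num_inputs) layout of Xs[0] is assumed
# from the fixed_shape declarations above.
config = dict(rnn_steps=4, num_inputs=3, num_outputs=3,
              num_units=[32], num_mems=[32], weight_decay=0.)
params, f_step, f_loss, f_grad, f_init, f_surr = make_funcs(config)
size_batch = 16
c_0, h_0 = f_init(size_batch)              # zero initial cell/hidden states
x = np.random.randn(size_batch, config['num_inputs'])
out = f_step(x, *(c_0 + h_0))              # returns [Y_0] + C_1 + H_1
y_0, new_states = out[0], out[1:]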
def test_get_context():
    batch_size = 32
    feat_t_steps = 3
    feat_num_features = 30
    state_num_features = 20
    num_out_classes = 28
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    prev_out = cgt.matrix(fixed_shape=(batch_size, state_num_features))
    sigmoided = cgt.sigmoid(prev_out)
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          feature_size=feat_num_features, decoder_size=state_num_features)
    mm = cgt.infer_shape(s.features_post_mlp_btf)
    assert mm == (batch_size, feat_t_steps, feat_num_features)
    context_out = s.get_context(sigmoided)
    out_fun = cgt.function([feats, prev_out], [context_out])
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*state_num_features),
                      (batch_size, state_num_features))
    m = out_fun(tau, tau2)[0]
    assert m.shape == (batch_size, feat_num_features)
    assert np.mean(m) < 1
import cgt  # needed for cgt.tensor3, cgt.function, etc. below
from gru import GRUCell
import time
from cgt.utils import Message
import numpy as np

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--horizon", type=int)
    args = parser.parse_args()
    horizon = args.horizon
    assert horizon is not None
    size = 128
    batchsize = 64
    cell = GRUCell([size], size)
    X = cgt.tensor3()
    init = cgt.matrix()

    prev_h = init
    for i in xrange(horizon):
        prev_h = cell(X[i], prev_h)
    loss = prev_h.sum()

    with Message("compiling"):
        f = cgt.function([X, init], cgt.grad(loss, cell.params()))
    with Message("running"):
        xval = np.zeros((horizon, batchsize, size), cgt.floatX)
        initval = np.zeros((batchsize, size), cgt.floatX)
        for i in xrange(100):
            f(xval, initval)
import gru, cgt, numpy as np
import sys
from time import time

elapsed = []
horizons = 2**np.arange(2, 10)
for horizon in horizons:
    print "HORIZON", horizon
    tstart = time()
    batch_size = 6
    dim_x = 16
    mem_size = 10
    X_tnk = cgt.tensor3("X")

    cell = gru.GRUCell([dim_x], mem_size)

    Minit_nk = cgt.zeros((X_tnk.shape[0], X_tnk.shape[1]), cgt.floatX)
    M = Minit_nk

    for t in xrange(horizon):
        M = cell(M, X_tnk[t])

    # cgt.print_tree(M)
    print "simplifying..."
    M_simp = cgt.simplify([M])
    print "done"
    # cgt.print_tree(M_simp)
    print "fn before:", cgt.count_nodes(M)
def test_seq_2_seq():
    batch_size = 32            # How many samples to batch.
    feat_t_steps = 3           # How many 10ms sound clips.
    feat_num_features = 10     # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # The maximal label length of the transcription.
    num_out_classes = 27       # 26 letters and space.
    num_out_classes_true = 27 + 2    # Start and end tokens are added.
    num_batches = 512          # 1032
    num_epochs = 40

    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))

    last_time = time.time()
    print 'initializing seq2seq'
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'making train objective'
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'making updates'
    updates = nn.rmsprop(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001)
    # updates = nn.nesterov_momentum(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001, mu=0.4)
    # updates = nn.momentum(train_objective, nn.get_parameters(train_objective), learning_rate=0.00001, mu=0.4)
    # updates = nn.adadelta(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001, rho=0.95)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'compiling train function, test function, and prediction output function'
    train_function = cgt.function([feats, ground_labels_basis], [], updates=updates)
    test_function = cgt.function([feats, ground_labels_basis], [train_objective])
    pred = seq2seq.make_prediction(ground_labels_basis_btc=ground_labels_basis,
                                   max_label_length=feat_t_steps)
    pred_fun = cgt.function([feats, ground_labels_basis], [pred])
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)

    print 'now training'
    last_time = time.time()
    for one_epoch in range(0, num_epochs):
        tested = 0
        print 'starting epoch ' + str(one_epoch)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            train_function(batch, labels_basis)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            tested += test_function(batch, labels_basis)[0]
        tested = tested / num_batches  # average loss over all test batches
        print 'mean train loss for epoch ' + str(one_epoch) + ' is ' + str(tested)

        print 'an actual prediction is'
        print pred_fun(batch, labels_basis)[0]
        print 'the truth is'
        print test_labels[batch_iter, :, 0:feat_t_steps]
        print 'that took ' + str(time.time() - last_time) + ' seconds'
        last_time = time.time()

    prediction_final = pred_fun(batch, labels_basis)[0]
    print prediction_final
def test_the_test_problem():  # Works
    batch_size = 32            # How many samples to batch.
    feat_t_steps = 20          # How many 10ms sound clips.
    feat_num_features = 10     # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # Maximal transcription length, including the start character.
    num_out_classes = 27
    num_out_classes_true = num_out_classes + 2
    num_batches = 756
    num_epochs = 30

    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))

    last_time = time.time()
    print 'initializing temporal dense layer'
    d1 = nnbuilder.temporalDenseLayer(feats, num_units=128, activation=cgt.sigmoid)
    # d2 = nnbuilder.temporalDenseLayer(d1, num_units=128, activation=cgt.sigmoid)
    d3 = nnbuilder.temporalDenseLayer(d1, num_units=num_out_classes_true, activation=nnbuilder.linear)
    out = nn.three_d_softmax(d3, axis=2)

    log_probs = None
    for iter_step in range(0, max_label_length):
        this_character_dist_bc = out[:, iter_step, :]
        prev_out_bc = ground_labels_basis[:, iter_step, :]
        log_probs_pre = prev_out_bc * this_character_dist_bc
        log_probs_pre = cgt.log(cgt.sum(log_probs_pre, axis=1))
        if log_probs is None:
            log_probs = cgt.sum(log_probs_pre)
        else:
            log_probs += cgt.sum(log_probs_pre)
    log_probs = -log_probs  # negative log-likelihood to minimize
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'compiling objective function'
    updates = nn.rmsprop(log_probs, nn.get_parameters(log_probs), learning_rate=0.01)
    pred_train = cgt.function([feats, ground_labels_basis], [], updates=updates)
    pred_fun = cgt.function([feats, ground_labels_basis], [log_probs])
    most_likely_chars = cgt.argmax(out, axis=2)  # argmax over the class axis
    actual_predictions = cgt.function([feats, ground_labels_basis], [most_likely_chars])
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)

    print 'now training'
    for one_epoch in range(0, num_epochs):
        trained = 0
        last_time = time.time()
        print 'starting epoch ' + str(one_epoch)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            pred_train(batch, labels_basis)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            trained += pred_fun(batch, labels_basis)[0]
        trained = trained / num_batches  # average loss over all batches
        print 'train loss is ' + str(trained)
        print 'that took ' + str(time.time() - last_time) + ' seconds'

        act_pred = actual_predictions(batch, labels_basis)[0]
        print 'an actual prediction is'
        print act_pred