def test_get_train_objective():
    batch_size = 32
    feat_t_steps = 5
    feat_num_features = 256
    max_label_length = 5
    num_out_classes = 27
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes))
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    train_shape = cgt.infer_shape(train_objective)
    assert train_shape == ()
    nn.get_parameters(train_objective)
def save_weights(network_out_layer, file_name_for_saving):
    all_params = get_parameters(network_out_layer)
    param_values = []
    for param in all_params:
        param_values.append(param.op.get_value())
    pickle.dump(param_values, open(file_name_for_saving + ".p", "wb"))
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch
    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs,
                                [loss_vec], params, _dbg_out=dbg_out)
    return params, f_step, None, None, None, f_surr
def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")
    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()
    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])
    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)
    self.pc = ParamCollection(params)
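# Hedged usage sketch (added, not from the original source): assuming the
# __init__ above belongs to a policy class exposing the compiled functions as
# attributes, this shows the array shapes they expect. All names and data
# below are illustrative.
def _demo_discrete_policy_calls(policy, n_actions=6, batch=16):
    import numpy as np
    obs = np.random.randint(0, 256, size=(batch, 128)).astype('float64')   # RAM-byte observations o_no
    oldpdist = policy.f_probs(obs)                                         # (batch, n_actions)
    actions = np.random.randint(0, n_actions, size=batch).astype('int64')  # a_n, dtype 'i8'
    qvals = np.random.randn(batch)                                         # q_n
    surr, kl = policy.f_surr_kl(oldpdist, obs, actions, qvals)
    flatgrad = policy.f_gradlogp(oldpdist, obs, actions, qvals)            # flattened gradient of surr
    return surr, kl, flatgrad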
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()
    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')
    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5,5), pad=(2,2),
                                  weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3,3), stride=(2,2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5,5), pad=(2,2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3,3), stride=(2,2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5,5), pad=(2,2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3,3), stride=(2,2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()
    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)
    if args.profile: cgt.profiler.start()
    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]
    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time()-tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def make_updater_fc():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
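# Hedged usage sketch (added): a minimal minibatch loop for the updater above.
# `load_mnist_xy` is a hypothetical loader returning X of shape (N, 784) and
# int64 labels y of shape (N,); substitute your own data source.
def _demo_train_fc(num_epochs=2, batch_size=128, stepsize=0.01):
    f_update = make_updater_fc()
    Xdata, ydata = load_mnist_xy()  # hypothetical helper, not part of the source
    for i_epoch in xrange(num_epochs):
        for start in xrange(0, Xdata.shape[0] - batch_size + 1, batch_size):
            loss = f_update(Xdata[start:start+batch_size], ydata[start:start+batch_size], stepsize)
        print "epoch", i_epoch, "last minibatch loss", loss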
def make_updater_convnet():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_convnet_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim
    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
    std_1a = cgt.exp(logstd_1a)
    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim, weight_init=nn.IIDGaussian(std=0.01))(h2)
    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)
    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]
    logp_n = ((-.5) * cgt.square((a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square((a_na - oldmean_na) / oldstd_na).sum(axis=1)) \
                - cgt.log(oldstd_na).sum(axis=1)
    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()
    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
    # kl = cgt.log(sigafter/)
    params = nn.get_parameters(surr)
    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na) +
          (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) - .5).sum(axis=1).mean()
    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)
    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self.pc = ParamCollection(params)
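# Hedged sanity-check sketch (added): if the "old" distribution equals the
# current one, the diagonal-Gaussian KL above reduces per dimension to
# log(1) + (var + 0) / (2 * var) - .5 = 0. Assumes the __init__ above belongs
# to a policy object exposing f_pdist and _compute_surr_kl as compiled there.
def _demo_kl_zero_for_unchanged_pdist(policy, obs_dim, ctrl_dim, batch=8):
    import numpy as np
    obs = np.random.randn(batch, obs_dim)
    acts = np.random.randn(batch, ctrl_dim)
    adv = np.random.randn(batch)
    pdist = policy.f_pdist(obs)  # (batch, 2*ctrl_dim): means, then stds
    surr, kl = policy._compute_surr_kl(pdist, obs, acts, adv)
    assert abs(kl) < 1e-6
    return surr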
def make_updater_convnet_parallel():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_convnet_return_loss(X, y)
    m = nn.Module([X, y], [loss])
    # split the minibatch into four slices and average the per-slice losses;
    # the module-level `batch_size` is assumed defined and divisible by 4
    split_loss = 0
    for start in xrange(0, batch_size, batch_size // 4):
        sli = slice(start, start + batch_size // 4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    params = nn.get_parameters(loss)
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)
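# Hedged usage sketch (added): the parallel updater accumulates the loss over
# four slices of the minibatch through an nn.Module, so all slices share one
# set of parameters. Shapes below are illustrative and rely on the same
# module-level `batch_size` the builder uses.
def _demo_parallel_convnet_step(stepsize=0.01):
    import numpy as np
    f_update = make_updater_convnet_parallel()
    X_batch = np.random.randn(batch_size, 1, 28, 28).astype(cgt.floatX)
    y_batch = np.random.randint(0, 10, size=batch_size).astype('int64')
    return f_update(X_batch, y_batch, stepsize)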
def make_updater_fc_parallel():
    X = cgt.matrix("X", fixed_shape=(None, 28*28))
    y = cgt.vector("y", dtype='i8')
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    m = nn.Module([X, y], [loss])
    split_loss = 0
    for start in xrange(0, batch_size, batch_size//4):
        sli = slice(start, start + batch_size//4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize*gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)
def main(): print("Loading data...") X = cgt.matrix("X", fixed_shape=(None, 28*28)) y = cgt.vector("y", dtype='i8') model = build_model(X, 0.0) loss = -cgt.mean(categorical.loglik(y, model)) updates = nn.rmsprop(loss, nn.get_parameters(loss), 0.01) train = cgt.function(inputs=[X, y], outputs=[], updates=updates) y_nodrop = cgt.argmax(model, axis=1) cost_nodrop = -cgt.mean(categorical.loglik(y, model)) err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean() computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop]) batch_size=128 Xdata, ydata = load_data() Xtrain = Xdata[0:60000] ytrain = ydata[0:60000] Xtest = Xdata[60000:70000] ytest = ydata[60000:70000] sortinds = np.random.permutation(60000) Xtrain = Xtrain[sortinds] ytrain = ytrain[sortinds] print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"]) for i_epoch in xrange(3): tstart = time.time() for start in xrange(0, Xtrain.shape[0], batch_size): end = start+batch_size train(Xtrain[start:end], ytrain[start:end]) elapsed = time.time() - tstart trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)]) testerr, testloss = computeloss(Xtest, ytest) print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed]) nnbuilder.save_weights(model, 'mnist')
def main(num_epochs=NUM_EPOCHS):
    # cgt.set_precision('half')
    print("Building network ...")
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    X = cgt.tensor3(name='X', fixed_shape=(N_BATCH, MAX_LENGTH, 2))
    l_forward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN)
    l_backward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN, backwards=True)
    # l_forward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid)
    # l_backward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid, backwards=True)
    # l_forward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify)
    # l_backward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify, backwards=True)
    l_forward_slice = l_forward[:, MAX_LENGTH-1, :]  # take the last element in the forward time dimension
    l_backward_slice = l_backward[:, 0, :]           # and the first element in the backward time dimension
    l_sum = cgt.concatenate([l_forward_slice, l_backward_slice], axis=1)
    l_out = nnbuilder.denseLayer(l_sum, num_units=1, activation=cgt.tanh)
    target_values = cgt.vector('target_output')
    predicted_values = l_out[:, 0]  # for this task we only need the last value
    cost = cgt.mean((predicted_values - target_values)**2)
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = nn.rmsprop(cost, nn.get_parameters(l_out), LEARNING_RATE)
    # updates = nn.nesterov_momentum(cost, nn.get_parameters(l_out), 0.05)
    # cgt functions for training and computing cost
    print("Compiling functions ...")
    train = cgt.function([X, target_values], cost, updates=updates)
    compute_cost = cgt.function([X, target_values], cost)
    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()
    print("Training ...")
    time_start = time.time()
    try:
        for epoch in range(num_epochs):
            for _ in range(EPOCH_SIZE):
                X, y, m = gen_data()
                train(X, y)
            cost_val = compute_cost(X_val, y_val)
            print("Epoch {} validation cost = {}".format(epoch+1, cost_val))
            print('Epoch took ' + str(time.time() - time_start))
            time_start = time.time()
    except KeyboardInterrupt:
        pass
def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # X: a symbolic matrix with shape (batchsize, ncols); here a single row at a time
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic scalar representing the reward
    y = cgt.scalar("y", dtype='float64')
    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden,
                  weight_init=nn.IIDGaussian(std=.1), bias_init=nn.Constant(1))(X))
    # One final fully-connected layer, with a linear output for the reward
    output = nn.Affine(num_hidden, 1,
                       weight_init=nn.IIDGaussian(std=.1), bias_init=nn.Constant(1))(hid1)
    abs_deviation = cgt.abs(output - y).mean()
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)
    updates = [(p, p - stepsize*gp) for (p, gp) in zip(params, gparams)]
    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)
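# Hedged usage sketch (added): the regressor above takes one feature row at a
# time (fixed_shape=(1, num_features)) and fits a scalar reward by absolute
# deviation. Assumes the __init__ belongs to a class instance, here `reg`.
def _demo_reward_regressor(reg, num_features, reward=1.0):
    import numpy as np
    x = np.random.randn(1, num_features)  # a single observation row
    loss = reg.updater(x, reward)         # one SGD step on |prediction - reward|
    return reg.predictor(x), loss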
def main():
    X = cgt.matrix(name='data', dtype=cgt.floatX, fixed_shape=(None, 2212))
    y = cgt.vector("y", dtype='i8')
    model = build_nn(X)
    loss = -cgt.mean(categorical.loglik(y, model))
    updates = nn.adagrad(loss, nn.get_parameters(loss), 0.01)
    y_nodrop = cgt.argmax(model, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, model))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()
    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])
    batch_size = 20
    Xdata, ydata = load_data()
    Xtrain = Xdata[0:5200]
    ytrain = ydata[0:5200]
    Xtest = Xdata[5200:5573]
    ytest = ydata[5200:5573]
    sortinds = np.random.permutation(5200)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]
    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(20):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
def make_funcs(net_in, net_out, config, dbg_out=None):
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']
    if dbg_out is None: dbg_out = []  # guard: dbg_out is concatenated with [net_out] below
    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y)**2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    #     out_sigma
    #     cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat**2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y], [net_out] + dbg_out, [loss_raw], params)
    return params, f_step, f_loss, f_grad, f_surr
def test_seq_2_seq():
    batch_size = 32  # How many samples do you want to batch.
    feat_t_steps = 3  # How many 10ms sound clips.
    feat_num_features = 10  # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # The maximal label length of the transcription.
    num_out_classes = 27  # 26 letters and space.
    num_out_classes_true = 27 + 2  # Start and end tokens are added.
    num_batches = 512  # 1032
    num_epochs = 40
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))
    last_time = time.time()
    print 'initializing seq2seq'
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    print 'that took ' + str(time.time() - last_time) + ' seconds'
    last_time = time.time()
    print 'making train objective'
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    print 'that took ' + str(time.time() - last_time) + ' seconds'
    last_time = time.time()
    print 'making updates'
    updates = nn.rmsprop(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001)
    # updates = nn.nesterov_momentum(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001, mu=0.4)
    # updates = nn.momentum(train_objective, nn.get_parameters(train_objective), learning_rate=0.00001, mu=0.4)
    # updates = nn.adadelta(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001, rho=0.95)
    print 'that took ' + str(time.time() - last_time) + ' seconds'
    last_time = time.time()
    print 'compiling train function, test function, and prediction output function'
    train_function = cgt.function([feats, ground_labels_basis], [], updates=updates)
    test_function = cgt.function([feats, ground_labels_basis], [train_objective])
    pred = seq2seq.make_prediction(ground_labels_basis_btc=ground_labels_basis, max_label_length=feat_t_steps)
    pred_fun = cgt.function([feats, ground_labels_basis], [pred])
    print 'that took ' + str(time.time() - last_time) + ' seconds'
    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)
    print 'now training'
    last_time = time.time()
    for one_epoch in range(0, num_epochs):
        tested = 0
        print 'starting epoch ' + str(one_epoch)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            train_function(batch, labels_basis)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            tested += test_function(batch, labels_basis)[0]
        tested = tested / num_batches  # was batch_iter, which equals num_batches - 1 after the loop
        print 'mean train loss is ' + str(tested)
        print 'an actual prediction is '
        print pred_fun(batch, labels_basis)[0]
        print 'the truth is'
        print test_labels[batch_iter, :, 0:feat_t_steps]
        print 'that took ' + str(time.time() - last_time) + ' seconds'
        last_time = time.time()
    prediction_final = pred_fun(batch, labels_basis)[0]
    print prediction_final
def test_the_test_problem():
    # Works
    batch_size = 32  # How many samples do you want to batch.
    feat_t_steps = 20  # How many 10ms sound clips.
    feat_num_features = 10  # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # The maximal label length of the transcription. Includes start character.
    num_out_classes = 27
    num_out_classes_true = num_out_classes + 2
    num_batches = 756
    num_epochs = 30
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))
    last_time = time.time()
    print 'initializing temporal dense layer'
    d1 = nnbuilder.temporalDenseLayer(feats, num_units=128, activation=cgt.sigmoid)
    # d2 = nnbuilder.temporalDenseLayer(d1, num_units=128, activation=cgt.sigmoid)
    d3 = nnbuilder.temporalDenseLayer(d1, num_units=num_out_classes_true, activation=nnbuilder.linear)
    out = nn.three_d_softmax(d3, axis=2)
    log_probs = None
    for iter_step in range(0, max_label_length):
        this_character_dist_bc = out[:, iter_step, :]
        prev_out_bc = ground_labels_basis[:, iter_step, :]
        log_probs_pre = prev_out_bc * this_character_dist_bc
        log_probs_pre = cgt.log(cgt.sum(log_probs_pre, axis=1))
        if log_probs is None:
            log_probs = cgt.sum(log_probs_pre)
        else:
            log_probs += cgt.sum(log_probs_pre)
    log_probs = -log_probs
    print 'that took ' + str(time.time() - last_time) + ' seconds'
    last_time = time.time()
    print 'compiling objective function'
    updates = nn.rmsprop(log_probs, nn.get_parameters(log_probs), learning_rate=0.01)
    pred_train = cgt.function([feats, ground_labels_basis], [], updates=updates)
    pred_fun = cgt.function([feats, ground_labels_basis], [log_probs])
    most_likely_chars = cgt.argmax(out, axis=2)  # argmax over the class axis (was axis=1, the time axis)
    actual_predictions = cgt.function([feats, ground_labels_basis], [most_likely_chars])
    print 'that took ' + str(time.time() - last_time) + ' seconds'
    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)  # was np.mean, which would have set the "sd" to the mean
    print 'now training'
    for one_epoch in range(0, num_epochs):
        trained = 0
        last_time = time.time()
        print 'starting epoch ' + str(one_epoch)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            pred_train(batch, labels_basis)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps,
                                                             data_mean, data_sd, test_labels,
                                                             num_out_classes_true)
            trained += pred_fun(batch, labels_basis)[0]
        trained = trained / num_batches  # was batch_iter, which equals num_batches - 1 after the loop
        print 'mean train loss is ' + str(trained)
        print 'that took ' + str(time.time() - last_time) + ' seconds'
        act_pred = actual_predictions(batch, labels_basis)[0]
        print 'an actual prediction is '
        print act_pred
# Fragment: assumes X, y, pool1, Xtrain, Xtrainimg, Xtestimg, ytrain, ytest,
# epochs, and batch_size are defined earlier in the script.
conv2 = nn.rectify(
    nn.SpatialConvolution(32, 32, kernelshape=(3,3), stride=(1,1), pad=(1,1),
                          weight_init=nn.IIDGaussian(std=.1))(pool1))
pool2 = nn.max_pool_2d(conv2, kernelshape=(2,2), stride=(2,2))
d0, d1, d2, d3 = pool2.shape
flat = pool2.reshape([d0, d1*d2*d3])
nfeats = cgt.infer_shape(flat)[1]
probs = nn.softmax(nn.Affine(nfeats, 10)(flat))
cost = -categorical.loglik(y, probs).mean()
y_preds = cgt.argmax(probs, axis=1)
err = cgt.cast(cgt.not_equal(y, y_preds), cgt.floatX).mean()
params = nn.get_parameters(cost)
updates = nn.sgd(cost, params, 1e-3)
# training function
f = cgt.function(inputs=[X, y], outputs=[], updates=updates)
# compute the cost and error
cost_and_err = cgt.function(inputs=[X, y], outputs=[cost, err])
for i in xrange(epochs):
    t0 = time.time()
    for start in xrange(0, Xtrain.shape[0], batch_size):
        end = batch_size + start
        f(Xtrainimg[start:end], ytrain[start:end])
    elapsed = time.time() - t0
    costval, errval = cost_and_err(Xtestimg, ytest)
    print("Epoch {} took {}, test cost = {}, test error = {}".format(i, elapsed, costval, errval))
def set_all_weights_helper(network_out_layer, param_values):
    all_params = get_parameters(network_out_layer)
    for param, param_value in zip(all_params, param_values):
        param.op.set_value(param_value)
def get_all_weights(network_out_layer):
    all_params = get_parameters(network_out_layer)
    param_values = []
    for param in all_params:
        param_values.append(param.op.get_value())
    return param_values
def set_all_weights(network_out_layer, pickled_list_of_weights):
    all_params = get_parameters(network_out_layer)
    param_values = pickle.load(open(pickled_list_of_weights, "rb"))
    for param, param_value in zip(all_params, param_values):
        param.op.set_value(param_value)
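# Hedged round-trip sketch (added): save_weights pickles the parameter values
# to "<name>.p"; set_all_weights reads that pickle back into the network, and
# get_all_weights / set_all_weights_helper do the same in memory.
def _demo_weight_round_trip(network_out_layer):
    save_weights(network_out_layer, "checkpoint")        # writes checkpoint.p
    snapshot = get_all_weights(network_out_layer)        # in-memory copy
    set_all_weights(network_out_layer, "checkpoint.p")   # restore from disk
    set_all_weights_helper(network_out_layer, snapshot)  # restore from memory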