def __call__(self, x):
    # infer the (batch, channels, width, height) shape of the input
    _, _, width, height = cgt.infer_shape(x)
    if self.original_stride != 1:
        unstrided_width = width * self.original_stride[0]
        unstrided_height = height * self.original_stride[1]
        # workaround for this
        # cgt.inc_subtensor(upsampled, (slice(None), slice(None), slice(None, None, self.original_stride[0]), slice(None, None, self.original_stride[1])), x)
        # upsample by writing x into every stride-th column, then every stride-th row
        placeholder = cgt.zeros((x.shape[0], x.shape[1], width, unstrided_height))  # (None, 64, 4, 8)
        placeholder = cgt.inc_subtensor(placeholder, (slice(None), slice(None), slice(None), slice(None, None, self.original_stride[1])), x)
        upsampled = cgt.zeros((x.shape[0], x.shape[1], unstrided_width, unstrided_height))  # (None, 64, 8, 8)
        upsampled = cgt.inc_subtensor(upsampled, (slice(None), slice(None), slice(None, None, self.original_stride[0]), slice(None)), placeholder)
    else:
        upsampled = x
    # then we conv to deconv
    deconv = super(SpatialDeconvolution, self).__call__(upsampled)
    # lastly we cut off original padding
    pad = self.original_pad
    original_width = ((width - 1) * self.original_stride[0]) - 2 * self.original_pad[0] + self.original_kernelshape[0]
    original_height = ((height - 1) * self.original_stride[1]) - 2 * self.original_pad[1] + self.original_kernelshape[1]
    t = deconv[:, :, pad[0]:(pad[0] + original_width), pad[1]:(pad[1] + original_height)]
    return t
def denseLayer(nn_input, num_units, activation=rectify, w_init=XavierNormal(), bias_init=Constant(0)):
    """
    Batch by feature input.
    """
    if len(nn_input.shape) > 2:
        nn_input = cgt.reshape(nn_input, [nn_input.shape[0], reduce(lambda x, y: x*y, nn_input.shape[1:])])
    feature_dims = cgt.infer_shape(nn_input)[1]
    return activation(Affine(feature_dims, num_units, weight_init=w_init, bias_init=bias_init)(nn_input))
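A minimal usage sketch for denseLayer, with hypothetical shapes and assuming cgt and the helpers above are already imported:

nn_input = cgt.matrix(fixed_shape=(32, 100))   # (batch, features) -- hypothetical sizes
hidden = denseLayer(nn_input, num_units=64)    # rectified affine layer
print cgt.infer_shape(hidden)                  # expected: (32, 64)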
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5,5), pad=(2,2), weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3,3), stride=(2,2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5,5), pad=(2,2), weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3,3), stride=(2,2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5,5), pad=(2,2), weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3,3), stride=(2,2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start+batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time()-tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5,5), pad=(2,2), weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3,3), stride=(2,2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5,5), pad=(2,2), weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3,3), stride=(2,2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5,5), pad=(2,2), weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3,3), stride=(2,2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = np.load("/Users/joschu/Data/cifar-10-batches-py/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start+batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time()-tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def GRULayer(nn_input, num_units, activation=cgt.sigmoid, backwards=False, w_init=XavierNormal(),
             hid_out_init=IIDUniform(0, 1)):
    if len(nn_input.shape) > 3:
        nn_input = nn_input.reshape([nn_input.shape[0], nn_input.shape[1],
                                     reduce(lambda x, y: x*y, nn_input.shape[2:])])
    in_shape = cgt.infer_shape(nn_input)
    time_dim = in_shape[1]
    feature_dims = in_shape[2]
    return GRU(input_feature_size=feature_dims, input_time_size=time_dim, num_units=num_units,
               weight_init=w_init, hid_out_init=hid_out_init, activation=activation,
               backwards=backwards)(nn_input)
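A minimal usage sketch for GRULayer on a batch-by-time-by-features input, with hypothetical shapes and assuming cgt and the layer helpers above are imported:

nn_input = cgt.tensor3(fixed_shape=(32, 10, 50))   # (batch, time, features) -- hypothetical sizes
gru_out = GRULayer(nn_input, num_units=40)
print cgt.infer_shape(gru_out)                     # expected: (32, 10, 40)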
def LSTMLayer(nn_input, num_units, activation=rectify, backwards=False, w_init=XavierNormal(),
              hid_out_init=IIDUniform(0, 1), cell_out_init=IIDUniform(0, 1)):
    # flatten any trailing dimensions so the input is (batch, time, features)
    if len(nn_input.shape) > 3:
        nn_input = nn_input.reshape([nn_input.shape[0], nn_input.shape[1],
                                     reduce(lambda x, y: x*y, nn_input.shape[2:])])
    in_shape = cgt.infer_shape(nn_input)
    time_dim = in_shape[1]
    feature_dims = in_shape[2]
    return LSTM(input_feature_size=feature_dims, input_time_size=time_dim, num_units=num_units,
                weight_init=w_init, hid_out_init=hid_out_init, cell_out_init=cell_out_init,
                activation=activation, backwards=backwards)(nn_input)
def recurrentLayer(nn_input, num_units, activation=rectify, w_init=XavierNormal(),
                   hid_out_init=IIDUniform(0, 1), backwards=False, mask=None):
    """
    Batch by time by features
    """
    if len(nn_input.shape) > 3:
        nn_input = nn_input.reshape([nn_input.shape[0], nn_input.shape[1],
                                     reduce(lambda x, y: x*y, nn_input.shape[2:])])
    in_shape = cgt.infer_shape(nn_input)
    time_dim = in_shape[1]
    feature_dims = in_shape[2]
    return Recurrent(input_feature_size=feature_dims, input_time_size=time_dim, num_units=num_units,
                     weight_init=w_init, hid_out_init=hid_out_init, activation=activation,
                     backwards=backwards)(nn_input)
def test_get_train_objective():
    batch_size = 32
    feat_t_steps = 5
    feat_num_features = 256
    max_label_length = 5
    num_out_classes = 27
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes))
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    train_shape = cgt.infer_shape(train_objective)
    assert train_shape == ()
    nn.get_parameters(train_objective)
def pyramidLayer(nn_input, temporal_resolution_decrease=2):
    """
    Batch by time by features. Decreases temporal resolution and increases feature
    dimension by a resolution decrease factor.
    """
    t_steps = cgt.infer_shape(nn_input)[1]
    if t_steps % temporal_resolution_decrease != 0:
        raise ValueError('number of timesteps is not divisible by resolution decrease!')
    out_list = []
    for iter_step in range(0, t_steps, temporal_resolution_decrease):
        # concatenate consecutive timesteps along the feature axis
        concentrate_list = []
        for sub_iter_step in range(0, temporal_resolution_decrease):
            concentrate_list.append(nn_input[:, iter_step + sub_iter_step, :])
        out_list.append(cgt.concatenate(concentrate_list, axis=1))
    # stack over the reduced time axis and move it back to position 1
    return cgt.dimshuffle(cgt.stack(out_list), [1, 0, 2])
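A minimal shape-check sketch for pyramidLayer, with hypothetical shapes: halving the temporal resolution doubles the feature dimension.

nn_input = cgt.tensor3(fixed_shape=(32, 10, 50))                  # (batch, time, features) -- hypothetical sizes
pyramid = pyramidLayer(nn_input, temporal_resolution_decrease=2)
print cgt.infer_shape(pyramid)                                    # expected: (32, 5, 100)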
def test_im2col():
    for settings in [((4,4), (0,0), (1,1)), ((3,3), (1,1), (2,2)), ((3,3), (1,1), (3,3))]:
        xval = np.arange(2*1*28*28).reshape(2, 1, 28, 28).astype(cgt.floatX)
        x = cgt.tensor4("x", fixed_shape=xval.shape)
        y = im2col(x, *settings)
        h = cgt.constant(np.random.randn(*cgt.infer_shape(y)))
        cost = (y*h).sum()
        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval, eps=1e-5)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)
def temporalDenseLayer(nn_input, num_units, activation=rectify, w_init=XavierNormal(), bias_init=Constant(0)):
    """
    Batch by time by features.
    """
    # flatten any trailing dimensions so the input is (batch, time, features)
    if len(nn_input.shape) > 3:
        nn_input = nn_input.reshape([nn_input.shape[0], nn_input.shape[1],
                                     reduce(lambda x, y: x*y, nn_input.shape[2:])])
    dims = cgt.infer_shape(nn_input)
    temporal_dims = dims[1]
    feature_dims = dims[2]
    # one Affine layer shared across all timesteps
    affine_underbelly = Affine(feature_dims, num_units, weight_init=w_init, bias_init=bias_init)
    out_list = []
    for iter_step in range(0, temporal_dims):
        input_slice = nn_input[:, iter_step, :]
        out_list.append(activation(affine_underbelly(input_slice)))
    # stack over time and move the time axis back to position 1
    return cgt.dimshuffle(cgt.stack(out_list), [1, 0, 2])
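A minimal usage sketch for temporalDenseLayer, with hypothetical shapes; the same affine weights are applied independently at every timestep:

nn_input = cgt.tensor3(fixed_shape=(32, 10, 50))        # (batch, time, features) -- hypothetical sizes
dense_out = temporalDenseLayer(nn_input, num_units=64)
print cgt.infer_shape(dense_out)                        # expected: (32, 10, 64)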
def build_convnet_return_loss(X, y):
    np.random.seed(0)
    conv1 = nn.rectify(
        nn.SpatialConvolution(1, 32, kernelshape=(3, 3), pad=(0, 0), weight_init=nn.IIDGaussian(std=0.1))(X)
    )
    pool1 = nn.max_pool_2d(conv1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.rectify(
        nn.SpatialConvolution(32, 32, kernelshape=(3, 3), pad=(0, 0), weight_init=nn.IIDGaussian(std=0.1))(pool1)
    )
    pool2 = nn.max_pool_2d(conv2, kernelshape=(3, 3), stride=(2, 2))
    d0, d1, d2, d3 = pool2.shape
    flatlayer = pool2.reshape([d0, d1 * d2 * d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    logprobs = nn.logsoftmax(nn.Affine(nfeats, 10)(flatlayer))
    loss = -logprobs[cgt.arange(X.shape[0]), y].mean()
    return loss
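A sketch of wiring the returned loss into a training step, assuming MNIST-sized inputs and the nn.sgd / cgt.function calls used elsewhere in these examples:

X = cgt.tensor4('X', fixed_shape=(None, 1, 28, 28))
y = cgt.vector('y', dtype='i8')
loss = build_convnet_return_loss(X, y)
params = nn.get_parameters(loss)
updates = nn.sgd(loss, params, 1e-3)            # plain SGD; any updater with the same signature would do
train_step = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)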
def test_cpu_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
    y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
    xval = np.random.randn(2, 3, 5, 7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)
    cost = (y * h).sum()
    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum, gana)
def test_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
    y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
    xval = np.random.randn(2, 3, 5, 7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)
    cost = (y * h).sum()
    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum, gana)
def test_lrn():
    if not get_compile_info()["CGT_ENABLE_CUDA"]:
        raise SkipTest("Skipping because CUDA disabled")
    nr.seed(0)
    Xval = nr.randn(4, 8, 16, 16)
    X = cgt.shared(Xval, name="X", fixed_shape_mask="all")
    # X = cgt.tensor4(name='X')
    y = cross_channel_lrn(X, localsize=4, alpha=.1, beta=.5)
    f = cgt.function([], y)
    print f().sum()
    print f().sum()
    print f().sum()
    assert np.isfinite(f().sum())
    # print f(Xval).sum()
    a = nr.rand(*cgt.infer_shape(y))
    loss = (y*a).sum()
    gradcheck_model(loss, [X], eps=1e-5)
def test_get_context():
    batch_size = 32
    feat_t_steps = 3
    feat_num_features = 30
    state_num_features = 20
    num_out_classes = 28
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    prev_out = cgt.matrix(fixed_shape=(batch_size, state_num_features))
    sigmoided = cgt.sigmoid(prev_out)
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          feature_size=feat_num_features, decoder_size=state_num_features)
    mm = cgt.infer_shape(s.features_post_mlp_btf)
    assert mm == (batch_size, feat_t_steps, feat_num_features)
    context_out = s.get_context(sigmoided)
    out_fun = cgt.function([feats, prev_out], [context_out])
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*state_num_features),
                      (batch_size, state_num_features))
    m = out_fun(tau, tau2)[0]
    assert m.shape == (batch_size, feat_num_features)
    assert np.mean(m) < 1
def __init__(self, nn_input_btf, num_out_classes, get_features_fun=None, feature_size=40,
             decoder_size=40, w_init=IIDUniform(-0.1, 0.1)):
    self.start_token_index = num_out_classes
    self.end_token_index = self.start_token_index + 1
    self.true_number_classes = num_out_classes + 2  # add dims for start and end token.
    self.batch_size = cgt.infer_shape(nn_input_btf)[0]
    self.w_init = w_init
    self.feature_size = feature_size
    self.decoder_size = decoder_size

    if get_features_fun is not None:
        self.get_features_fun = get_features_fun
    else:
        self.get_features_fun = self.get_features_bengio
    features_btf = self.get_features_fun(nn_input_btf, num_units=self.feature_size)

    # Compute psi<h_u> over all u (timesteps), the features from the ground data.
    # This is for computing the context c_i. The features are put through a dense layer.
    self.features_post_mlp_btf = temporalDenseLayer(features_btf, self.feature_size, w_init=self.w_init,
                                                    activation=linear, bias_init=Constant(0.0))
    self.mixing_vec_w = parameter(init_array(w_init, (1, 1, self.feature_size,)), name=None)

    # These are for the decoder mechanism, which computes s_i.
    rnn_activation = cgt.sigmoid
    recurrence = Recurrent
    self.recurrent_decoder_one = recurrence(num_units=self.decoder_size, input_time_size=None,
                                            input_feature_size=self.feature_size + self.true_number_classes,
                                            weight_init=self.w_init, activation=rnn_activation).take_one_step
    self.recurrent_decoder_two = linear
    #self.recurrent_decoder_two = recurrence(num_units=self.decoder_size, input_time_size=None,
    #                                        input_feature_size=self.decoder_size,
    #                                        weight_init=self.w_init, activation=rnn_activation).take_one_step

    # Multiply s_i by V to make it have same dimension as h_u.
    self.states_mlp_bf = Affine(self.decoder_size, self.feature_size, weight_init=self.w_init,
                                bias_init=Constant(0.0))

    # This is the final dense layer, which computes the class probs at the end of all things.
    self.final_out_dense = Affine(self.decoder_size + self.feature_size, self.true_number_classes,
                                  weight_init=w_init, bias_init=Constant(0.0))
def test_cpu_pool():
    with cgt.scoped_update_config(precision="quad", backend="native"):
        print cgt.get_precision()
        ci = get_compile_info()
        np.random.seed(0)
        x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
        y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
        xval = np.random.randn(2, 3, 5, 7)
        hval = np.random.randn(*cgt.infer_shape(y))
        h = cgt.constant(hval)
        cost = (y*h).sum()
        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)
        Returns next_h. For the GRU the output to the next timestep and next
        layer is one and the same. Copy it first!
        """
        reset_gate = cgt.sigmoid(x.dot(self.W_xr) + prev_h.dot(self.W_hr))
        update_gate = cgt.sigmoid(x.dot(self.W_xz) + prev_h.dot(self.W_hz))
        # the elementwise multiplication here tells what of the previous
        # input we should forget.
        forget_gate = reset_gate * prev_h
        # this part is very similar to vanilla RNN
        h_candidate = cgt.tanh(x.dot(self.W_xc) + forget_gate.dot(self.W_hc))
        # this isn't super clear in the paper, but it's an elementwise mult here
        next_h = (1. - update_gate) * prev_h + update_gate * h_candidate
        # In a standard GRU cell we only have 1 output.
        # However, it should be copied and fed to
        # both the next timestep and the next layer
        return next_h

# Make sure it compiles!
x = cgt.matrix()  # (batch_size, n_features)
h = cgt.matrix()  # this will later be the identity matrix
next_h = GRUCell(5, 10)(x, h)
print("Next Hidden:", next_h, cgt.infer_shape(next_h))
def take_one_step(self, nn_input_bf, hid_out):
    # PROBABLY BUGGED. SHOULD BE REWRITTEN.
    self.num_batches = cgt.infer_shape(nn_input_bf)[0]
    # (n_time_steps, n_batch, n_features)
    #input_bf = cgt.dimshuffle(nn_input_bf, [1, 0, 2])

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=1)

    # At each loop, input_n will be (n_time_steps, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)

        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid

        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return self.nonlinearity_hid(hid)  # adding this non-linearity seems to help stability.
        #return hid

    if hid_out is None:
        if self.hid_out is None:
            self.hid_out = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)
        hid_out = self.hid_out

    # Retrieve the dimensionality of the incoming layer
    hid_out = step(nn_input_bf, hid_out, W_hid_stacked, W_in_stacked, b_stacked)

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    # self.hid_out = cgt.dimshuffle(self.hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    self.hid_out = hid_out
    return hid_out
def __call__(self, input_btf):
    # (n_time_steps, n_batch, n_features)
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    self.num_batches = cgt.infer_shape(input_tbf)[1]

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=1)

    # At each loop, input_n will be (n_time_steps, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)

        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid

        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return hid

    sequences = [input_tbf]
    step_fun = step
    hid_init = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [W_hid_stacked]
    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights and biases to the step function
    non_seqs += [W_in_stacked, b_stacked]

    # theano.scan only allows for positional arguments, so when
    # self.precompute_input is True, we need to supply fake placeholder
    # arguments for the input weights and biases.
    # Retrieve the dimensionality of the incoming layer
    hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out
        Returns (next_c, next_h). next_h should be cloned since it's fed into
        the next layer and the next timestep.
        """
        forget_gate = cgt.sigmoid(x.dot(self.W_xf) + prev_h.dot(self.W_hf))
        input_gate = cgt.sigmoid(x.dot(self.W_xi) + prev_h.dot(self.W_hi))
        output_gate = cgt.sigmoid(x.dot(self.W_xo) + prev_h.dot(self.W_ho))
        candidate_values = cgt.tanh(x.dot(self.W_xc) + prev_h.dot(self.W_hc))

        # new cell state
        next_c = forget_gate * prev_c + input_gate * candidate_values
        # input for next timestep
        next_h = output_gate * cgt.tanh(next_c)

        # NOTE: we feed next_h into the next layer and the next timestep
        # so we should clone the next_h output.
        return next_c, next_h

# Make sure it compiles!
x = cgt.matrix()  # (batch_size, n_features)
h = cgt.matrix()  # this will later be the identity matrix
c = cgt.matrix()  # this will later be the identity matrix
next_c, next_h = LSTMCell(5, 10)(x, c, h)
print("Next Cell State:", next_c, cgt.infer_shape(next_c))
print("Next Hidden:", next_h, cgt.infer_shape(next_h))
import cgt
import numpy as np
from cgt.nn import parameter, init_array, HeUniform, Constant

# ignore bias for the sake of simplicity
class FeedforwardCell(object):
    def __init__(self, input_size, output_size, name="", weight_init=HeUniform(1.0), bias_init=Constant(0)):
        """
        Initialize a Feedforward cell.
        """
        self.W = parameter(init_array(weight_init, (input_size, output_size)), name=name + ".W")
        self.b = parameter(init_array(bias_init, (1, output_size)), name=name + ".b")

    def __call__(self, x):
        """
        x is the input

        Returns the output to feed as the input into the next layer.
        """
        return cgt.broadcast("+", x.dot(self.W), self.b, "xx,1x")

# Make sure it compiles!
# x is a matrix of size (batch_size, features_size)
x = cgt.matrix()
o = FeedforwardCell(5, 10)(x)
print("Output:", o, cgt.infer_shape(o))
        # hidden to hidden
        self.W_hh = parameter(init_array(weight_init, (hidden_size, hidden_size)),
                              name=name+".W_hh")
        # hidden to output
        self.W_ho = parameter(init_array(weight_init, (hidden_size, hidden_size)),
                              name=name+".W_ho")

    def __call__(self, x, prev_h):
        """
        x is the input
        prev_h is the input from the previous timestep

        Returns (out, next_h). Feed out into the next layer and next_h to the
        next timestep.
        """
        next_h = cgt.tanh(prev_h.dot(self.W_hh) + x.dot(self.W_xh))
        out = next_h.dot(self.W_ho)
        return out, next_h

# Make sure it compiles!
x = cgt.matrix()  # (batch_size, n_features)
h = cgt.matrix()  # this will later be the identity matrix
o, next_h = RNNCell(5, 10)(x, h)
print("Output:", o, cgt.infer_shape(o))
print("Next Hidden:", next_h, cgt.infer_shape(next_h))
X = cgt.tensor4('X', fixed_shape=(None, 1, 28, 28))
y = cgt.vector('y', dtype='i8')

conv1 = nn.rectify(
    nn.SpatialConvolution(1, 32, kernelshape=(3,3), stride=(1,1), pad=(1,1), weight_init=nn.IIDGaussian(std=.1))(X)
)
pool1 = nn.max_pool_2d(conv1, kernelshape=(2,2), stride=(2,2))
conv2 = nn.rectify(
    nn.SpatialConvolution(32, 32, kernelshape=(3,3), stride=(1,1), pad=(1,1), weight_init=nn.IIDGaussian(std=.1))(pool1)
)
pool2 = nn.max_pool_2d(conv2, kernelshape=(2,2), stride=(2,2))
d0, d1, d2, d3 = pool2.shape
flat = pool2.reshape([d0, d1*d2*d3])
nfeats = cgt.infer_shape(flat)[1]
probs = nn.softmax(nn.Affine(nfeats, 10)(flat))
cost = -categorical.loglik(y, probs).mean()

y_preds = cgt.argmax(probs, axis=1)
err = cgt.cast(cgt.not_equal(y, y_preds), cgt.floatX).mean()

params = nn.get_parameters(cost)
updates = nn.sgd(cost, params, 1e-3)

# training function
f = cgt.function(inputs=[X, y], outputs=[], updates=updates)
# compute the cost and error
cost_and_err = cgt.function(inputs=[X, y], outputs=[cost, err])

for i in xrange(epochs):
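The snippet above is cut off at the epoch loop. A minimal sketch of how the loop body might look, assuming hypothetical Xtrain/ytrain arrays and a batch_size variable, patterned after the CIFAR training loops earlier in these examples:

for i in xrange(epochs):
    for start in xrange(0, Xtrain.shape[0], batch_size):       # Xtrain, ytrain, batch_size are hypothetical
        end = start + batch_size
        f(Xtrain[start:end], ytrain[start:end])                # one parameter update
    print cost_and_err(Xtrain[:1000], ytrain[:1000])           # monitor cost and error on a held-out slice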