def __call__(self, Y, U):
    if Y.ndim > (self.axis + 1):
        Y = Y.reshape(Y.shape[:self.axis] + [cgt.mul_multi(Y.shape[self.axis:])])
    # outer product of Y and U, broadcast over the leading (batch) axes
    outer_YU = cgt.broadcast(
        '*',
        Y.dimshuffle(range(Y.ndim) + ['x']),
        U.dimshuffle([0] + ['x'] * self.axis + [1]),
        ''.join(['x'] * Y.ndim + ['1', ',', 'x'] + ['1'] * self.axis + ['x']))
    bilinear = cgt.dot(
        outer_YU.reshape((outer_YU.shape[0], cgt.mul_multi(outer_YU.shape[1:]))),
        self.M.reshape((self.y_dim, self.y_dim * self.u_dim)).T)
    if self.axis > 1:
        bilinear = bilinear.reshape((-1, ) + self.y_shape[:self.axis - 1] + (self.y_dim, ))
    linear = cgt.dot(U, self.N.T)
    if self.axis > 1:
        linear = linear.dimshuffle([0] + ['x'] * (self.axis - 1) + [1])
    activation = bilinear + linear
    if self.b is not None:
        # add the bias with broadcasting (using "+=" here would count activation twice)
        activation = cgt.broadcast(
            '+', activation, self.b.dimshuffle(['x'] * self.axis + [0]),
            ''.join(['x'] * activation.ndim + [','] + ['1'] * (activation.ndim - 1) + ['x']))
    activation = activation.reshape((-1, ) + self.y_shape)
    return activation
def make_deep_rrnn(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]
        # normalized reflection vectors for the hidden-to-hidden rotation
        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True
        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")
        # input-dependent reflection vectors
        r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
        r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
        r_norm = cgt.norm(r_non, axis=2, keepdims=True)
        r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
        prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
        inters_in = [prev_h_3]
        colon = slice(None, None, None)
        for i in xrange(2 * k_in):
            # apply the i-th Householder reflection: h <- h - 2 r (r^T h)
            inter_in = inters_in[-1]
            r_cur = cgt.subtensor(r, [colon, i, colon])
            r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
            r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
            ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
            inter_out = inter_in - 2 * ref_cur
            inters_in.append(inter_out)
        h_in_rot = cgt.reshape(inters_in[-1], (size_batch, size_mem))
        inters_h = [h_in_rot]
        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)
        next_h = inters_h[-1]
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)
    return nn.Module(inputs, outputs)
def dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = nn.dropout(X, p_drop_input)
    h = nn.rectify(cgt.dot(X, w_h))
    h = nn.dropout(h, p_drop_hidden)
    h2 = nn.rectify(cgt.dot(h, w_h2))
    h2 = nn.dropout(h2, p_drop_hidden)
    py_x = nn.softmax(cgt.dot(h2, w_o))
    return py_x
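# Usage sketch for dense_model (not from the original source): the layer sizes,
# the Gaussian initialization, and the zero dropout probabilities below are
# assumptions chosen purely for illustration.
nfeats, nhid, nclasses = 784, 256, 10  # hypothetical sizes
X_demo = cgt.matrix("X", fixed_shape=(None, nfeats))
w_h_demo = cgt.shared((np.random.randn(nfeats, nhid) * 0.01).astype(cgt.floatX), name="w_h")
w_h2_demo = cgt.shared((np.random.randn(nhid, nhid) * 0.01).astype(cgt.floatX), name="w_h2")
w_o_demo = cgt.shared((np.random.randn(nhid, nclasses) * 0.01).astype(cgt.floatX), name="w_o")
# dropout probabilities set to 0 so the compiled prediction function is deterministic
py_x_demo = dense_model(X_demo, w_h_demo, w_h2_demo, w_o_demo, 0., 0.)
predict_demo = cgt.function([X_demo], py_x_demo)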
def take_one_step(self, nn_input_bf, hid_out=None):
    # Sometimes you don't want to unroll all t-steps of a recurrence but rather just one forward step.
    num_batch = nn_input_bf.shape[0]

    def slice_w(x, n):
        return x[:, n * self.num_units:(n + 1) * self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")
        # Calculate gates pre-activations and slice
        gates = input_n + cgt.dot(hid_previous, W_hid_stacked)
        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)
        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        outgate = self.nonlinearity_outgate(outgate)
        # Compute new cell value
        cell = forgetgate * cell_previous + ingate * cell_input
        # Compute new hidden unit activation
        hid = outgate * self.nonlinearity(cell)
        return [cell, hid]

    # initialize the stored cell/hidden state on the first call
    if self.cell_prev is None:
        ones = cgt.ones((num_batch, 1))
        self.cell_prev = cgt.dot(ones, self.cell_init)
        self.hid_prev = cgt.dot(ones, self.hid_init)
    if hid_out is None:
        if self.hid_prev is None:
            self.hid_prev = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)
        hid_out = self.hid_prev
    # argument order matches step(input_n, cell_previous, hid_previous, ...)
    one_step_out = step(nn_input_bf, self.cell_prev, hid_out,
                        self.W_hid_stacked, self.W_in_stacked, self.b_stacked)
    self.cell_prev = one_step_out[0]
    self.hid_prev = one_step_out[1]
    return self.hid_prev
def dense_model3(X, w_h, w_h2, w_h3, w_o, p_drop_input, p_drop_hidden):
    X = nn.dropout(X, p_drop_input)
    h = nn.rectify(cgt.dot(X, w_h))
    h = nn.dropout(h, p_drop_hidden[0])
    h2 = nn.rectify(cgt.dot(h, w_h2))
    h2 = nn.dropout(h2, p_drop_hidden[1])
    h3 = nn.rectify(cgt.dot(h2, w_h3))
    h3 = nn.dropout(h3, p_drop_hidden[2])
    py_x = nn.softmax(cgt.dot(h3, w_o))
    return py_x
def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]
        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True
        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")
        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)
        prev_h_scaled = nn.scale_mag(prev_h)
        h_in_added = prev_h_scaled + add_in_relu
        inters_h = [h_in_added]
        colon = slice(None, None, None)
        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            #r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)
        next_h = inters_h[-1]
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)
    return nn.Module(inputs, outputs)
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)
    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])
def test_devices():
    N = 10
    K = 3
    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")
    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)
    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):
        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")
        print "bval", bval
        ypred = cgt.dot(cgt.square(X_nk), w_k) + b
        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err] + g
        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(results[0],
                           np.sin(np.square(Xval).dot(wval) + bval - yval).sum())
def take_one_step(self, input_bf, hid_out=None):
    num_batch = input_bf.shape[0]

    def step(input_bh, hid_previous_bh):
        hid_pre_bh = self.hid_to_hid(hid_previous_bh)
        hid_pre_bh += self.in_to_hid(input_bh)
        return self.activation(hid_pre_bh)

    if self.prev_out is None:
        self.prev_out = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)
    if hid_out is None:
        ones = cgt.ones((num_batch, 1))
        self.prev_out = cgt.dot(ones, self.hid_init)
        hid_out = self.prev_out
    self.prev_out = step(input_bf, hid_out)
    return self.prev_out
def convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden):
    l1a = nn.rectify(nn.conv2d(X, w, kernelshape=(3, 3), pad=(1, 1)))
    l1 = nn.max_pool_2d(l1a, kernelshape=(2, 2), stride=(2, 2))
    l1 = nn.dropout(l1, p_drop_conv)
    l2a = nn.rectify(nn.conv2d(l1, w2, kernelshape=(3, 3), pad=(1, 1)))
    l2 = nn.max_pool_2d(l2a, kernelshape=(2, 2), stride=(2, 2))
    l2 = nn.dropout(l2, p_drop_conv)
    l3a = nn.rectify(nn.conv2d(l2, w3, kernelshape=(3, 3), pad=(1, 1)))
    l3b = nn.max_pool_2d(l3a, kernelshape=(2, 2), stride=(2, 2))
    batchsize, channels, rows, cols = l3b.shape
    l3 = cgt.reshape(l3b, [batchsize, channels * rows * cols])
    l3 = nn.dropout(l3, p_drop_conv)
    l4 = nn.rectify(cgt.dot(l3, w4))
    l4 = nn.dropout(l4, p_drop_hidden)
    pyx = nn.softmax(cgt.dot(l4, w_o))
    return pyx
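# Usage sketch for convnet_model (not from the original source): the filter
# counts, the assumed filter layout (nfilters, nchannels, rows, cols), the
# 28x28 single-channel input, and the Gaussian initialization are illustrative
# assumptions; with these shapes the flattened layer has 128 * 3 * 3 = 1152
# units after three 2x2 poolings (28 -> 14 -> 7 -> 3).
def init_w_demo(shape):
    return cgt.shared((np.random.randn(*shape) * 0.01).astype(cgt.floatX))

X_img = cgt.tensor4("X")  # (batch, channels, rows, cols)
w1_demo = init_w_demo((32, 1, 3, 3))
w2_demo = init_w_demo((64, 32, 3, 3))
w3_demo = init_w_demo((128, 64, 3, 3))
w4_demo = init_w_demo((128 * 3 * 3, 625))
w_out_demo = init_w_demo((625, 10))
# dropout probabilities set to 0 to get a deterministic prediction graph
pyx_demo = convnet_model(X_img, w1_demo, w2_demo, w3_demo, w4_demo, w_out_demo, 0., 0.)
predict_conv_demo = cgt.function([X_img], pyx_demo)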
def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
    input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")
    # Calculate gates pre-activations and slice
    gates = input_n + cgt.dot(hid_previous, W_hid_stacked)
    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)
    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)
    outgate = self.nonlinearity_outgate(outgate)
    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input
    # Compute new hidden unit activation
    hid = outgate * self.nonlinearity(cell)
    return [cell, hid]
def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(
        g_simple,
        nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval)**2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0))
def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
    # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
    hid_input = cgt.dot(hid_previous, W_hid_stacked)
    # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
    input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")
    # Reset and update gates
    resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
    updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)
    # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
    hidden_update_in = slice_w(input_n, 2)
    hidden_update_hid = slice_w(hid_input, 2)
    hidden_update = hidden_update_in + resetgate * hidden_update_hid
    # Compute (1 - u_t)h_{t - 1} + u_t c_t
    hid = (1 - updategate) * hid_previous + updategate * hidden_update
    return self.nonlinearity_hid(hid)  # adding this non-linearity seems to help stability.
def test_linreg():
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    atol = {"single": 1e-3, "double": 1e-6}[cgt.get_precision()]
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval)**2,
                               atol=atol)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval),
                               atol=atol)
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0),
                               atol=atol)
def __call__(self, x):
    input_btf = x
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    seq_len, num_batch = input_tbf.shape[0], input_tbf.shape[1]

    def step(input_bh, hid_previous_bh):
        hid_pre_bh = self.hid_to_hid(hid_previous_bh)
        hid_pre_bh += self.in_to_hid(input_bh)
        return self.activation(hid_pre_bh)

    hid_init_bh = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)
    hid_out_tbf = unroll_recurrence(
        step_function=step,
        input_to_unroll_tbf=input_tbf,
        hid_init=[hid_init_bh],
        go_backwards=self.backwards,
        n_steps=self.timesteps)
    hid_out_btf = cgt.dimshuffle(hid_out_tbf, [1, 0, 2])
    if self.backwards:
        hid_out_btf = cgt.flip(hid_out_btf, [1])
    return hid_out_btf
def __init__(self, input, n_in, n_out, W=None, b=None, activation=cgt.tanh, prefix=""):
    self.n_in = n_in
    self.n_out = n_out
    if W is None:
        # XXX replace with nn init
        W_values = np.asarray(
            rng.uniform(
                low=-np.sqrt(6. / (n_in + n_out)),
                high=np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=cgt.floatX
        )
        if activation == cgt.sigmoid:
            W_values *= 4
        W = cgt.shared(W_values, name=prefix + "_W")
    if b is None:
        b_values = np.zeros((n_out,), dtype=cgt.floatX)
        b = cgt.shared(b_values, name=prefix + "_b")
    self.W = W
    self.b = b
    # XXX broadcast api may change
    lin_output = cgt.broadcast("+", cgt.dot(input, self.W),
                               cgt.dimshuffle(self.b, ["x", 0]), "xx,1x")
    self.output = (
        lin_output if activation is None
        else activation(lin_output)
    )
    # parameters of the model
    self.params = [self.W, self.b]
def dot(x, y):
    return cgt.dot(x, y)
def take_one_step(self, nn_input_bf, hid_out):
    # PROBABLY BUGGED. SHOULD BE REWRITTEN.
    self.num_batches = cgt.infer_shape(nn_input_bf)[0]
    # (n_time_steps, n_batch, n_features)
    #input_bf = cgt.dimshuffle(nn_input_bf, [1, 0, 2])

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    # At each loop, input_n will be (n_time_steps, 3*num_units).
    # We define a slicing function that extract the input to each GRU gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input__n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)
        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")
        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)
        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid
        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return self.nonlinearity_hid(hid)  # adding this non-linearity seems to help stability.
        #return hid

    if hid_out is None:
        if self.hid_out is None:
            self.hid_out = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)
        hid_out = self.hid_out

    # Retrieve the dimensionality of the incoming layer
    hid_out = step(nn_input_bf, hid_out, W_hid_stacked, W_in_stacked, b_stacked)

    # dimshuffle back to (n_batch, n_time_steps, n_features))
    # self.hid_out = cgt.dimshuffle(self.hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])
    self.hid_out = hid_out
    return hid_out
def __call__(self, input_btf):
    # (n_time_steps, n_batch, n_features)
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    self.num_batches = cgt.infer_shape(input_tbf)[1]

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    # At each loop, input_n will be (n_time_steps, 3*num_units).
    # We define a slicing function that extract the input to each GRU gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input__n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)
        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")
        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)
        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid
        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return hid

    sequences = [input_tbf]
    step_fun = step
    hid_init = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [W_hid_stacked]
    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights and biases to the step function
    non_seqs += [W_in_stacked, b_stacked]
    # theano.scan only allows for positional arguments, so when
    # self.precompute_input is True, we need to supply fake placeholder
    # arguments for the input weights and biases.

    # Retrieve the dimensionality of the incoming layer
    hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features))
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out
def __call__(self, nn_input_btf):
    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    nn_input_tbf = cgt.dimshuffle(nn_input_btf, [1, 0, 2])
    seq_len, num_batch = nn_input_tbf.shape[0], nn_input_tbf.shape[1]

    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, cell_previous, hid_previous, W_hid_stacked,
             W_in_stacked, b_stacked):
        input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")
        # Calculate gates pre-activations and slice
        gates = input_n + cgt.dot(hid_previous, W_hid_stacked)
        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)
        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        outgate = self.nonlinearity_outgate(outgate)
        # Compute new cell value
        cell = forgetgate*cell_previous + ingate*cell_input
        # Compute new hidden unit activation
        hid = outgate*self.nonlinearity(cell)
        return [cell, hid]

    sequences = nn_input_tbf
    step_fun = step
    ones = cgt.ones((num_batch, 1))
    cell_init = cgt.dot(ones, self.cell_init)
    hid_init = cgt.dot(ones, self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [self.W_hid_stacked]
    non_seqs += [self.W_in_stacked, self.b_stacked]

    cell_out, hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[cell_init, hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)

    # dimshuffle back to (n_batch, n_time_steps, n_features))
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out
# hyperparams
#
# Be careful when setting alpha! If it's too large
# here the cost will blow up.
alpha = 1e-7
epochs = 100

# Linear regression model
np.random.seed(0)
X = cgt.matrix("X", fixed_shape=(None, nfeats))
Y = cgt.vector("Y")
w = cgt.shared(np.random.randn(nfeats) * 0.01)

# prediction
ypred = cgt.dot(X, w)

# cost
cost = cgt.square(Y - ypred).mean()

# derivative with respect to w
dw = cgt.grad(cost=cost, wrt=w)

updates = [(w, w - dw * alpha)]

# training function
trainf = cgt.function(inputs=[X, Y], outputs=[], updates=updates)
# cost function, no updates
costf = cgt.function(inputs=[X, Y], outputs=cost)

for i in xrange(epochs):
    trainf(X_train, Y_train)
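# Sketch (not from the original source): the compiled costf above can be used to
# report the training error once training finishes, assuming X_train and Y_train
# are defined as in the script.
print "final training cost:", costf(X_train, Y_train)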