class TestBidirectionalStack(unittest.TestCase):
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))),
            dtype=theano.config.floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        self.mask_val[12:24, 3] = 0

    def test_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')

        calc_stack_layers = [
            theano.function([x, mask], self.stack.apply(x, mask=mask)[i])
            for i in range(len(self.layers))]
        stack_layers = [
            f(self.x_val, self.mask_val) for f in calc_stack_layers]

        h_val = self.x_val
        for stack_layer_value, bidir_net in zip(stack_layers, self.layers):
            calc = theano.function([x, mask], bidir_net.apply(x, mask=mask))
            simple_layer_value = calc(h_val, self.mask_val)
            assert_allclose(stack_layer_value, simple_layer_value, rtol=1e-04)
            h_val = simple_layer_value[..., :3]

    def test_dims(self):
        self.assertEqual(self.stack.get_dim("inputs"), 3)
        for i in range(len(self.layers)):
            state_name = self.stack.suffix("states", i)
            self.assertEqual(self.stack.get_dim(state_name), 6)
class TestRecurrentStack(unittest.TestCase):
    def setUp(self):
        depth = 4
        self.depth = depth
        dim = 3  # don't change, hardwired in the code
        transitions = [LSTM(dim=dim) for _ in range(depth)]
        self.stack0 = RecurrentStack(transitions,
                                     weights_init=Constant(2),
                                     biases_init=Constant(0))
        self.stack0.initialize()

        self.stack2 = RecurrentStack(transitions,
                                     weights_init=Constant(2),
                                     biases_init=Constant(0),
                                     skip_connections=True)
        self.stack2.initialize()

    def do_one_step(self, stack, skip_connections=False, low_memory=False):
        depth = self.depth

        # batch=2
        h0_val = 0.1 * numpy.array([[[1, 1, 0], [0, 1, 1]]] * depth,
                                   dtype=theano.config.floatX)
        c0_val = 0.1 * numpy.array([[[1, 1, 0], [0, 1, 1]]] * depth,
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([range(12), range(12, 24)],
                                  dtype=theano.config.floatX)
        # we will use same weights on all layers
        W_state2x_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        kwargs = OrderedDict()
        for d in range(depth):
            if d > 0:
                suffix = RECURRENTSTACK_SEPARATOR + str(d)
            else:
                suffix = ''
            if d == 0 or skip_connections:
                kwargs['inputs' + suffix] = tensor.matrix('inputs' + suffix)
                kwargs['inputs' + suffix].tag.test_value = x_val
            kwargs['states' + suffix] = tensor.matrix('states' + suffix)
            kwargs['states' + suffix].tag.test_value = h0_val[d]
            kwargs['cells' + suffix] = tensor.matrix('cells' + suffix)
            kwargs['cells' + suffix].tag.test_value = c0_val[d]
        results = stack.apply(iterate=False, low_memory=low_memory, **kwargs)
        next_h = theano.function(inputs=list(kwargs.values()),
                                 outputs=results)

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        h1_val = []
        x_v = x_val
        args_val = []
        for d in range(depth):
            if d == 0 or skip_connections:
                args_val.append(x_val)
            h0_v = h0_val[d]
            args_val.append(h0_v)
            c0_v = c0_val[d]
            args_val.append(c0_v)

            # omitting biases because they are zero
            activation = numpy.dot(h0_v, W_state_val) + x_v
            if skip_connections and d > 0:
                activation += x_val

            i_t = sigmoid(activation[:, :3] + c0_v * W_cell_to_in)
            f_t = sigmoid(activation[:, 3:6] + c0_v * W_cell_to_forget)
            next_cells = f_t * c0_v + i_t * numpy.tanh(activation[:, 6:9])
            o_t = sigmoid(activation[:, 9:12] + next_cells * W_cell_to_out)
            h1_v = o_t * numpy.tanh(next_cells)
            # current layer output state transformed to input of next
            x_v = numpy.dot(h1_v, W_state2x_val)
            h1_val.append(h1_v)

        res = next_h(*args_val)
        for d in range(depth):
            assert_allclose(h1_val[d], res[d * 2], rtol=1e-6)

    def test_one_step(self):
        self.do_one_step(self.stack0)
        self.do_one_step(self.stack0, low_memory=True)
        self.do_one_step(self.stack2, skip_connections=True)
        self.do_one_step(self.stack2, skip_connections=True, low_memory=True)

    def do_many_steps(self, stack, skip_connections=False, low_memory=False):
        depth = self.depth

        # 24 steps
        # 4 batch examples
        # 12 dimensions per step
        x_val = (0.1 * numpy.asarray(
            list(itertools.islice(itertools.permutations(range(12)), 0, 24)),
            dtype=theano.config.floatX))
        x_val = numpy.ones((24, 4, 12),
                           dtype=theano.config.floatX) * x_val[:, None, :]
        # mask the last third of steps
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        # unroll all states and cells for all steps and also initial value
        h_val = numpy.zeros((depth, 25, 4, 3), dtype=theano.config.floatX)
        c_val = numpy.zeros((depth, 25, 4, 3), dtype=theano.config.floatX)
        # we will use same weights on all layers
        W_state2x_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        kwargs = OrderedDict()
        for d in range(depth):
            if d > 0:
                suffix = RECURRENTSTACK_SEPARATOR + str(d)
            else:
                suffix = ''
            if d == 0 or skip_connections:
                kwargs['inputs' + suffix] = tensor.tensor3('inputs' + suffix)
                kwargs['inputs' + suffix].tag.test_value = x_val
        kwargs['mask'] = tensor.matrix('mask')
        kwargs['mask'].tag.test_value = mask_val
        results = stack.apply(iterate=True, low_memory=low_memory, **kwargs)
        calc_h = theano.function(inputs=list(kwargs.values()),
                                 outputs=results)

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        for i in range(1, 25):
            x_v = x_val[i - 1]
            h_vs = []
            c_vs = []
            for d in range(depth):
                h_v = h_val[d][i - 1, :, :]
                c_v = c_val[d][i - 1, :, :]
                activation = numpy.dot(h_v, W_state_val) + x_v
                if skip_connections and d > 0:
                    activation += x_val[i - 1]

                i_t = sigmoid(activation[:, :3] + c_v * W_cell_to_in)
                f_t = sigmoid(activation[:, 3:6] + c_v * W_cell_to_forget)
                c_v1 = f_t * c_v + i_t * numpy.tanh(activation[:, 6:9])
                o_t = sigmoid(activation[:, 9:12] + c_v1 * W_cell_to_out)
                h_v1 = o_t * numpy.tanh(c_v1)

                h_v = (mask_val[i - 1, :, None] * h_v1 +
                       (1 - mask_val[i - 1, :, None]) * h_v)
                c_v = (mask_val[i - 1, :, None] * c_v1 +
                       (1 - mask_val[i - 1, :, None]) * c_v)

                # current layer output state transformed to input of next
                x_v = numpy.dot(h_v, W_state2x_val)

                h_vs.append(h_v)
                c_vs.append(c_v)

            for d in range(depth):
                h_val[d][i, :, :] = h_vs[d]
                c_val[d][i, :, :] = c_vs[d]

        args_val = [x_val] * (depth if skip_connections else 1) + [mask_val]
        res = calc_h(*args_val)
        for d in range(depth):
            assert_allclose(h_val[d][1:], res[d * 2], rtol=1e-4)
            assert_allclose(c_val[d][1:], res[d * 2 + 1], rtol=1e-4)

        # Also test that initial state is a parameter
        for h in results:
            initial_states = VariableFilter(roles=[INITIAL_STATE])(
                ComputationGraph(h))
            assert all(is_shared_variable(initial_state)
                       for initial_state in initial_states)

    def test_many_steps(self):
        self.do_many_steps(self.stack0)
        self.do_many_steps(self.stack0, low_memory=True)
        self.do_many_steps(self.stack2, skip_connections=True)
        self.do_many_steps(self.stack2, skip_connections=True,
                           low_memory=True)
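Both test classes above rely on RecurrentStack's layer-naming convention: layer 0 keeps the plain sequence and state names, while deeper layers get a numeric suffix produced by RecurrentStack.suffix using RECURRENTSTACK_SEPARATOR ('#' in Blocks). The following is a minimal illustrative sketch of that convention, assuming Blocks' usual module layout; the printed values are the expected ones and this snippet is not part of the test suite.

from blocks.bricks import Tanh
from blocks.bricks.recurrent import RecurrentStack, SimpleRecurrent
from blocks.initialization import Constant

# Two stacked SimpleRecurrent layers, initialized the same way as in the tests.
stack = RecurrentStack(
    [SimpleRecurrent(dim=3, activation=Tanh()) for _ in range(2)],
    weights_init=Constant(2), biases_init=Constant(0))
stack.initialize()

print(stack.suffix("states", 0))  # expected: 'states'    (layer 0, unsuffixed)
print(stack.suffix("states", 1))  # expected: 'states#1'  (layer 1)
print(stack.get_dim(stack.suffix("states", 1)))  # expected: 3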
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)
    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]
    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    # (initial cells must be updated with the last cells, not the last states)
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
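Each builder returns the cost together with the state-carrying `updates`. Below is a minimal sketch of how these might be wired up, under the assumptions that `vocab_size` and `args` come from the surrounding training script and that the 'features'/'targets' variables, which are created inside the builder, are recovered through the computation graph; it is illustrative only, not the repository's training loop.

import theano
from blocks.graph import ComputationGraph

cost, cross_entropy, updates, gate_values = build_model_lstm(vocab_size, args)

# Recover the 'features' and 'targets' lmatrices created inside the builder.
cg = ComputationGraph(cost)
inputs = sorted(cg.inputs, key=lambda variable: variable.name)

# Passing `updates` to theano.function makes the model stateful: every call
# copies each layer's last hidden and cell states into the shared initial
# states, so the next minibatch continues from where this one stopped.
train_step = theano.function(inputs, cost, updates=updates)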