def build_model_vanilla(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = state if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO correct bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        # TODO correct bug
        # hidden_states.append(h * x_mask)
        hidden_states.append(h)
        hidden_states[0].name = "hidden_state_0"
        # Note: if we have a mask, then updating the initial state
        # with the last state does not make sense anymore.
        last_states[0] = h[-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states
class TestBidirectionalStack(unittest.TestCase):
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))),
            dtype=theano.config.floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        self.mask_val[12:24, 3] = 0

    def test_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')

        calc_stack_layers = [
            theano.function([x, mask], self.stack.apply(x, mask=mask)[i])
            for i in range(len(self.layers))]
        stack_layers = [
            f(self.x_val, self.mask_val) for f in calc_stack_layers]

        h_val = self.x_val
        for stack_layer_value, bidir_net in zip(stack_layers, self.layers):
            calc = theano.function([x, mask], bidir_net.apply(x, mask=mask))
            simple_layer_value = calc(h_val, self.mask_val)
            assert_allclose(stack_layer_value, simple_layer_value, rtol=1e-04)
            h_val = simple_layer_value[..., :3]

    def test_dims(self):
        self.assertEqual(self.stack.get_dim("inputs"), 3)
        for i in range(len(self.layers)):
            state_name = self.stack.suffix("states", i)
            self.assertEqual(self.stack.get_dim(state_name), 6)
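# A minimal, self-contained sketch (not from the original test file) of the
# RecurrentStack pattern the test above exercises: stack two SimpleRecurrent
# layers, initialize them, and run a compiled Theano function. The dimensions
# and layer names here are illustrative assumptions.
import numpy
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import RecurrentStack, SimpleRecurrent
from blocks.initialization import Constant, Orthogonal

layers = [SimpleRecurrent(dim=3, activation=Tanh(), name='layer_%d' % i)
          for i in range(2)]
stack = RecurrentStack(layers,
                       weights_init=Orthogonal(),
                       biases_init=Constant(0))
stack.initialize()

x = tensor.tensor3('x')     # (time, batch, dim)
states = stack.apply(x)     # [states, states_1], one output per layer
f = theano.function([x], states)

x_val = numpy.ones((5, 4, 3), dtype=theano.config.floatX)
h0, h1 = f(x_val)           # each of shape (time, batch, dim)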
class Interpolator(AbstractReadout):
    """Readout char by char."""

    def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth,
                 trg_dgru_depth, emitter, feedback_brick,
                 merge=None, merge_prototype=None, post_merge=None, **kwargs):
        merged_dim = igru_state_dim
        if not merge:
            merge = Merge(input_names=kwargs['source_names'],
                          prototype=merge_prototype)
        if not post_merge:
            post_merge = Bias(dim=merged_dim)

        # For compatibility
        if igru_depth == 1:
            self.igru = IGRU(dim=igru_state_dim)
        else:
            self.igru = RecurrentStack(
                [IGRU(dim=igru_state_dim, name='igru')] +
                [UpperIGRU(dim=igru_state_dim,
                           activation=Tanh(),
                           name='upper_igru' + str(i))
                 for i in range(1, igru_depth)],
                skip_connections=True)
        self.embedding_dim = embedding_dim
        self.emitter = emitter
        self.feedback_brick = feedback_brick
        self.merge = merge
        self.post_merge = post_merge
        self.merged_dim = merged_dim
        self.igru_depth = igru_depth
        self.trg_dgru_depth = trg_dgru_depth
        self.lookup = LookupTable(name='embeddings')
        self.vocab_size = vocab_size
        self.igru_state_dim = igru_state_dim
        self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                     output_dim=vocab_size)
        self.gru_fork = Fork(
            [name for name in self.igru.apply.sequences
             if name != 'mask' and name != 'input_states'],
            prototype=Linear(), name='gru_fork')

        children = [self.emitter, self.feedback_brick, self.merge,
                    self.post_merge, self.igru, self.lookup,
                    self.gru_to_softmax, self.gru_fork]
        kwargs.setdefault('children', []).extend(children)
        super(Interpolator, self).__init__(**kwargs)

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.emitter.readout_dim = self.get_dim('readouts')
        self.merge.input_names = self.source_names
        self.merge.input_dims = self.source_dims
        self.merge.output_dim = self.merged_dim
        self.post_merge.input_dim = self.merged_dim
        self.post_merge.output_dim = self.igru_state_dim
        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.igru.get_dim(name) for name in self.gru_fork.output_names]

    @application
    def initial_igru_outputs(self, batch_size):
        return self.igru.initial_states(batch_size)

    @application
    def emit(self, readouts):
        return self.emitter.emit(readouts)

    @application
    def cost(self, readouts, outputs):
        return self.emitter.cost(readouts, outputs)

    @application
    def initial_outputs(self, batch_size):
        return self.emitter.initial_outputs(batch_size)

    @application(outputs=['feedback'])
    def feedback(self, outputs):
        return self.feedback_brick.feedback(outputs)

    @application(outputs=['feedback'])
    def feedback_apply(self, target_char_seq, target_sample_matrix,
                       target_char_aux):
        return self.feedback_brick.apply(target_char_seq,
                                         target_sample_matrix,
                                         target_char_aux)

    @application
    def single_feedback(self, target_single_char, batch_size, mask=None,
                        states=None):
        return self.feedback_brick.single_emit(target_single_char,
                                               batch_size, mask, states)

    @single_feedback.property('outputs')
    def single_feedback_outputs(self):
        return ['single_feedback' + RECURRENTSTACK_SEPARATOR + str(i)
                for i in range(self.trg_dgru_depth)]

    @application(outputs=['gru_out', 'readout_chars'])
    def single_readout_gru(self, target_prev_char, target_prev_char_aux,
                           input_states, states):
        embeddings = self.lookup.apply(target_prev_char)
        states_dict = {'states': states[0]}
        if self.igru_depth > 1:
            for i in range(1, self.igru_depth):
                states_dict['states' + RECURRENTSTACK_SEPARATOR +
                            str(i)] = states[i]
        gru_out = self.igru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    states_dict,
                    {'mask': target_prev_char_aux,
                     'input_states': input_states,
                     'iterate': False}))
        if self.igru_depth > 1:
            readout_chars = self.gru_to_softmax.apply(gru_out[-1])
        else:
            readout_chars = self.gru_to_softmax.apply(gru_out)
        return gru_out, readout_chars

    @application
    def readout(self, **kwargs):
        merged = self.merge.apply(
            **{name: kwargs[name] for name in self.merge.input_names})
        merged = self.post_merge.apply(merged)
        return merged

    @application(outputs=['readout_chars'])
    def readout_gru(self, target_prev_char_seq, target_prev_char_aux,
                    input_states):
        embeddings = self.lookup.apply(target_prev_char_seq)
        gru_out = self.igru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': target_prev_char_aux,
                     'input_states': input_states}))
        if self.igru_depth > 1:
            gru_out = gru_out[-1]
        readout_chars = self.gru_to_softmax.apply(gru_out)
        return readout_chars

    def get_dim(self, name):
        if name == 'outputs':
            return self.emitter.get_dim(name)
        elif name == 'feedback':
            return self.feedback_brick.get_dim(name)
        elif name == 'readouts':
            return self.readout_dim
        return super(AbstractReadout, self).get_dim(name)
class TargetWordEncoder(Initializable):
    """Word encoder on the target side.

    Uses a single RNN to map a character-level word to a vector.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(TargetWordEncoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)

        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        # Time as first dimension; one step per call
        embeddings = self.lookup.apply(target_single_char)
        if states is None:
            states = self.dgru.initial_states(batch_size)
        states_dict = {'states': states[0]}
        for i in range(1, self.dgru_depth):
            states_dict['states' + RECURRENTSTACK_SEPARATOR +
                        str(i)] = states[i]
        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    states_dict,
                    {'mask': mask, 'iterate': False}))
        return gru_out

    @single_emit.property('outputs')
    def single_emit_outputs(self):
        return ['gru_out' + RECURRENTSTACK_SEPARATOR + str(i)
                for i in range(self.dgru_depth)]

    def get_dim(self, name):
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        return super(TargetWordEncoder, self).get_dim(name)
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
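# Hypothetical usage sketch (not in the original file): compile the graph
# returned by build_model_vanilla into a single SGD training step with plain
# Theano. `vocab_size`, `args`, and `args.learning_rate` are assumed to be
# provided by the surrounding training script.
from blocks.graph import ComputationGraph

cost, cross_entropy, updates = build_model_vanilla(vocab_size, args)

cg = ComputationGraph(cost)
gradients = tensor.grad(cost, cg.parameters)
sgd_updates = [(param, param - args.learning_rate * grad)
               for param, grad in zip(cg.parameters, gradients)]

# cg.inputs recovers the 'features' and 'targets' variables created inside
# build_model_vanilla; the carried-over hidden-state updates are appended to
# the parameter updates so states persist across mini-batches.
train_step = theano.function(cg.inputs, cost,
                             updates=sgd_updates + updates)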
def build_model_cw(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    if args.module_order == "fast_in_slow":
        transitions = [ClockworkBase(dim=args.state_dim, activation=Tanh(),
                                     period=2 ** i)
                       for i in range(args.layers)]
    elif args.module_order == "slow_in_fast":
        transitions = [ClockworkBase(dim=args.state_dim, activation=Tanh(),
                                     period=2 ** (args.layers - i - 1))
                       for i in range(args.layers)]
    else:
        assert False

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # In the Clockwork case:
    # h = [state, time, state_1, time_1 ...]
    h = h[::2]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO correct the bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        h = tensor.concatenate(h, axis=2)
    else:
        h = h[0] * x_mask
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states
def build_model_lstm(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [LSTM(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(mask=x_mask, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    hidden_states = []
    for d in range(args.layers):
        # TODO correct bug
        # h[5 * d] = h[5 * d] * x_mask
        # h[5 * d + 1] = h[5 * d + 1] * x_mask
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

        h[5 * d].name = "hidden_state_" + str(d)
        h[5 * d + 1].name = "hidden_cell_" + str(d)
        hidden_states.extend([h[5 * d], h[5 * d + 1]])

    # The updates of the hidden states
    # Note: if we have a mask, then updating the initial state
    # with the last state does not make sense anymore.
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))
        # Carry the cell over from its own last value
        # (the original code reused last_states here, which looks like a bug)
        updates.append((inits[1][d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if args.layers > 1:
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state_all"

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, gate_values, hidden_states
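# Tiny plain-Python illustration (no Theano) of the slicing used above on the
# flat list a stacked LSTM returns: five entries per layer, in the order
# state, cell, in, forget, out.
h_layout = ['state', 'cell', 'in', 'forget', 'out',
            'state_1', 'cell_1', 'in_1', 'forget_1', 'out_1']

assert h_layout[2::5] == ['in', 'in_1']            # input gates
assert h_layout[3::5] == ['forget', 'forget_1']    # forget gates
assert h_layout[4::5] == ['out', 'out_1']          # output gates
assert h_layout[::5] == ['state', 'state_1']       # hidden states only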
def build_model_soft(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * args.state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(args.state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(args.layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=args.state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    hidden_states = []
    for d in range(args.layers):
        h[d] = h[d] * x_mask
        last_states[d] = h[d][-1, :, :]
        h[d].name = "hidden_state_" + str(d)
        hidden_states.append(h[d])

    # Concatenate all the states
    if args.layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, cross_entropy = get_costs(presoft, args)

    return cost, cross_entropy, updates, gate_values, hidden_states
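# Same kind of plain-Python illustration for the SoftGatedRecurrent stack
# above: the first layer has no gate, so states and gate values interleave
# starting at index 2.
h_layout = ['state', 'state_1', 'gate_value_1',
            'state_2', 'gate_value_2', 'state_3', 'gate_value_3']

assert h_layout[2::2] == ['gate_value_1', 'gate_value_2', 'gate_value_3']
assert [h_layout[0]] + h_layout[1::2] == ['state', 'state_1',
                                          'state_2', 'state_3']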
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        # Carry the cell over from its own last value
        # (the original code reused last_states here, which looks like a bug)
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology. For compatibility with
    the previous version, we call it Decimator.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        # Representation
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)
        # Importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(
                DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
                self.embedding_dim, name='src_word_with_fork'),
            name='bidir_src_word_encoder')

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')
        # Maps the representation to an energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [self.lookup, self.dgru, self.gru_fork,
                         self.bidir_w, self.wl]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)

        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

        if self.dgru_depth > 1:
            gru_out = gru_out[-1]

        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        if name == 'output':
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
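# Shape sketch in plain numpy (illustrative values, not from the original
# code) of the batched_dot used in Decimator.apply: sample_matrix selects,
# per batch element, which character positions feed each word representation.
import numpy

batch, time, words, dim = 2, 7, 3, 4
sample_matrix = numpy.zeros((batch, words, time))   # selector matrix
gru_out = numpy.random.randn(time, batch, dim)      # DGRU states, time-major

# tensor.batched_dot multiplies matching batch slices:
# (batch, words, time) x (batch, time, dim) -> (batch, words, dim)
picked = numpy.einsum('bwt,btd->bwd',
                      sample_matrix, gru_out.transpose(1, 0, 2))
representation = picked.transpose(1, 0, 2)           # (words, batch, dim)
print(representation.shape)                          # (3, 2, 4)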
class SimplePyramidLayer(Initializable):
    """Basic unit for the pyramid model."""

    def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
        super(SimplePyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        activations_x = [Rectifier()] * depth_x
        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta
        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        self.mlp_x = MLP(activations=activations_x,
                         dims=dims_x,
                         name="mlp_x")

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]

        self.transition = RecurrentStack(transition,
                                         name="transition",
                                         skip_connections=True)

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta,
                         dim=target_size,
                         k=k,
                         const=0.00001,
                         name="gmm_wrap")

        self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                      output_size=frame_size,
                                      k=k)

        normal_inputs = [name for name in self.transition.apply.sequences
                         if 'mask' not in name]

        self.fork = Fork(normal_inputs,
                         input_dim=4 * hidden_size_recurrent,
                         output_dims=self.transition.get_dims(normal_inputs))

        self.children = [self.mlp_x, self.transition,
                         self.gmm_emitter, self.fork]

    def monitoring_vars(self, cg):
        mu, sigma, coeff = VariableFilter(
            applications=[self.gmm_emitter.gmmmlp.apply],
            name_regex="output")(cg.variables)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma,
                           min_mu, max_mu,
                           mean_mu, max_sigma]

        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))

        self.final_states = []
        for var in h:
            self.final_states.append(
                var[-1].copy(name=var.name + "_final_value"))

        cost = self.gmm_emitter.cost(h[-1], x)
        return cost.mean()

    @application
    def generate(self, context, **kwargs):
        # Note: the original signature was missing `self` and `**kwargs`.
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))
        return self.gmm_emitter.emit(h[-1])
gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                         output_size=frame_size,
                         k=k)

bricks = [mlp_x, transition, gmm_emitter]

for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

##############
# Test model
##############

x_g = mlp_x.apply(x)
h = transition.apply(x_g)
mu, sigma, coeff = mlp_gmm.apply(h[-2])

# cost = GMM(y, mu, sigma, coeff)
cost = gmm_emitter.cost(h[-2], y)
cost = cost.mean()
cost.name = 'sequence_log_likelihood'

emit = gmm_emitter.emit(h[-2])
emit.name = 'emitter'

cg = ComputationGraph(cost)
model = Model(cost)

#################
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i)
                   for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)
    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)

    return f_pre_rnn, f_h
gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                         output_size=frame_size,
                         k=k)

bricks = [mlp_x, transition, gmm_emitter]

for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

##############
# Test model
##############

x_g = mlp_x.apply(x)
h = transition.apply(x_g)
mu, sigma, coeff = mlp_gmm.apply(h[-2])
cost = gmm_emitter.cost(h[-2], y)
cost = cost.mean()
cost.name = 'nll'

emit = gmm_emitter.emit(h[-2])
emit.name = 'emitter'

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################
class Decimator(Initializable):
    """Char encoder, mapping a char-level word to a vector."""

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_layers,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_layers = dgru_layers
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_layers)],
            skip_connections=True)

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)

        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        if self.dgru_layers > 1:
            gru_out = gru_out[-1]
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'], outputs=['gru_out'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        # Time as first dimension; one step per call
        embeddings = self.lookup.apply(target_single_char)
        if states is None:
            states = self.dgru.initial_states(batch_size)
        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'states': states,
                     'mask': mask,
                     'iterate': False}))
        return gru_out

    def get_dim(self, name):
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
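# Hypothetical instantiation sketch for the char-level encoder above; the
# dimensions are made-up illustrative values.
from blocks.initialization import Constant, IsotropicGaussian

decimator = Decimator(vocab_size=120, embedding_dim=64,
                      dgru_state_dim=256, dgru_layers=2,
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0))
decimator.initialize()

char_seq = tensor.lmatrix('char_seq')            # (time, batch) int indices
sample_matrix = tensor.tensor3('sample_matrix')  # (batch, words, time)
char_aux = tensor.matrix('char_aux')             # character mask, (time, batch)
representation = decimator.apply(char_seq, sample_matrix, char_aux)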