Example #1
def build_model_vanilla(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [
        SimpleRecurrent(dim=args.state_dim, activation=Tanh())
        for _ in range(args.layers)
    ]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)
    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = state if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO correct bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        # TODO correct bug
        # hidden_states.append(h * x_mask)
        hidden_states.append(h)
        hidden_states[0].name = "hidden_state_0"
        # Note: if we have mask, then updating initial state
        # with last state does not make sense anymore.
        last_states[0] = h[-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states
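The builder above leans on project-specific helpers (get_prernn, get_rnn_kwargs, initialize_rnn, get_presoft, get_costs). For reference, here is a minimal, self-contained sketch of the bare RecurrentStack pattern it is built around; the dimensions are arbitrary and the imports assume the usual Blocks/Theano setup used throughout these examples.

import numpy
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import RecurrentStack, SimpleRecurrent
from blocks.initialization import Constant, Orthogonal

# Two stacked SimpleRecurrent layers, as in build_model_vanilla.
transitions = [SimpleRecurrent(dim=4, activation=Tanh()) for _ in range(2)]
stack = RecurrentStack(transitions, skip_connections=False,
                       weights_init=Orthogonal(), biases_init=Constant(0))
stack.initialize()

# Input is (time, batch, dim); without skip connections only the first
# layer reads the external inputs, deeper layers are fed by internal forks.
x = tensor.tensor3('x')
states = stack.apply(inputs=x)  # [states, states_1], one per layer

f = theano.function([x], states)
h0, h1 = f(numpy.zeros((7, 5, 4), dtype=theano.config.floatX))
print(h0.shape, h1.shape)  # expected: (7, 5, 4) (7, 5, 4)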
Example #4
class TestBidirectionalStack(unittest.TestCase):
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))),
            dtype=theano.config.floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        self.mask_val[12:24, 3] = 0

    def test_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')

        calc_stack_layers = [
            theano.function([x, mask], self.stack.apply(x, mask=mask)[i])
            for i in range(len(self.layers))]
        stack_layers = [
            f(self.x_val, self.mask_val) for f in calc_stack_layers]

        h_val = self.x_val
        for stack_layer_value, bidir_net in zip(stack_layers, self.layers):
            calc = theano.function([x, mask], bidir_net.apply(x, mask=mask))
            simple_layer_value = calc(h_val, self.mask_val)
            assert_allclose(stack_layer_value, simple_layer_value, rtol=1e-04)
            h_val = simple_layer_value[..., :3]

    def test_dims(self):
        self.assertEqual(self.stack.get_dim("inputs"), 3)
        for i in range(len(self.layers)):
            state_name = self.stack.suffix("states", i)
            self.assertEqual(self.stack.get_dim(state_name), 6)
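The dims test above returns 6 because Bidirectional doubles the per-layer dimension of 3. It also relies on RecurrentStack's naming convention for per-layer states; a small sketch of that convention (assuming the separator is '#', i.e. RECURRENTSTACK_SEPARATOR in Blocks):

from blocks.bricks import Tanh
from blocks.bricks.recurrent import RecurrentStack, SimpleRecurrent

stack = RecurrentStack([SimpleRecurrent(dim=3, activation=Tanh())
                        for _ in range(3)])

# Layer 0 keeps the plain name; deeper layers get a numeric suffix.
print(stack.apply.states)         # e.g. ['states', 'states#1', 'states#2']
print(stack.suffix('states', 2))  # e.g. 'states#2'
print(stack.get_dim('states'))    # 3 for a unidirectional SimpleRecurrent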
Example #5
class Interpolator(AbstractReadout):
    """Readout char by char."""
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 igru_state_dim,
                 igru_depth,
                 trg_dgru_depth,
                 emitter,
                 feedback_brick,
                 merge=None,
                 merge_prototype=None,
                 post_merge=None,
                 **kwargs):
        merged_dim = igru_state_dim
        if not merge:
            merge = Merge(input_names=kwargs['source_names'],
                          prototype=merge_prototype)
        if not post_merge:
            post_merge = Bias(dim=merged_dim)

        # for compatibility
        if igru_depth == 1:
            self.igru = IGRU(dim=igru_state_dim)
        else:
            self.igru = RecurrentStack(
                [IGRU(dim=igru_state_dim, name='igru')] + [
                    UpperIGRU(dim=igru_state_dim,
                              activation=Tanh(),
                              name='upper_igru' + str(i))
                    for i in range(1, igru_depth)
                ],
                skip_connections=True)
        self.embedding_dim = embedding_dim
        self.emitter = emitter
        self.feedback_brick = feedback_brick
        self.merge = merge
        self.post_merge = post_merge
        self.merged_dim = merged_dim
        self.igru_depth = igru_depth
        self.trg_dgru_depth = trg_dgru_depth
        self.lookup = LookupTable(name='embeddings')
        self.vocab_size = vocab_size
        self.igru_state_dim = igru_state_dim
        self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                     output_dim=vocab_size)
        self.gru_fork = Fork([
            name for name in self.igru.apply.sequences
            if name != 'mask' and name != 'input_states'
        ],
                             prototype=Linear(),
                             name='gru_fork')

        children = [
            self.emitter, self.feedback_brick, self.merge, self.post_merge,
            self.igru, self.lookup, self.gru_to_softmax, self.gru_fork
        ]
        kwargs.setdefault('children', []).extend(children)
        super(Interpolator, self).__init__(**kwargs)

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.emitter.readout_dim = self.get_dim('readouts')
        self.merge.input_names = self.source_names
        self.merge.input_dims = self.source_dims
        self.merge.output_dim = self.merged_dim
        self.post_merge.input_dim = self.merged_dim
        self.post_merge.output_dim = self.igru_state_dim
        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.igru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application
    def initial_igru_outputs(self, batch_size):
        return self.igru.initial_states(batch_size)

    @application
    def emit(self, readouts):
        return self.emitter.emit(readouts)

    @application
    def cost(self, readouts, outputs):
        return self.emitter.cost(readouts, outputs)

    @application
    def initial_outputs(self, batch_size):
        return self.emitter.initial_outputs(batch_size)

    @application(outputs=['feedback'])
    def feedback(self, outputs):
        return self.feedback_brick.feedback(outputs)

    @application(outputs=['feedback'])
    def feedback_apply(self, target_char_seq, target_sample_matrix,
                       target_char_aux):
        return self.feedback_brick.apply(target_char_seq, target_sample_matrix,
                                         target_char_aux)

    @application
    def single_feedback(self,
                        target_single_char,
                        batch_size,
                        mask=None,
                        states=None):
        return self.feedback_brick.single_emit(target_single_char, batch_size,
                                               mask, states)

    @single_feedback.property('outputs')
    def single_feedback_outputs(self):
        return [
            'single_feedback' + RECURRENTSTACK_SEPARATOR + str(i)
            for i in range(self.trg_dgru_depth)
        ]

    @application(outputs=['gru_out', 'readout_chars'])
    def single_readout_gru(self, target_prev_char, target_prev_char_aux,
                           input_states, states):
        embeddings = self.lookup.apply(target_prev_char)
        states_dict = {'states': states[0]}
        if self.igru_depth > 1:
            for i in range(1, self.igru_depth):
                states_dict['states' + RECURRENTSTACK_SEPARATOR +
                            str(i)] = states[i]
        gru_out = self.igru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), states_dict, {
                'mask': target_prev_char_aux,
                'input_states': input_states,
                'iterate': False
            }))
        if self.igru_depth > 1:
            readout_chars = self.gru_to_softmax.apply(gru_out[-1])
        else:
            readout_chars = self.gru_to_softmax.apply(gru_out)
        return gru_out, readout_chars

    @application
    def readout(self, **kwargs):
        merged = self.merge.apply(
            **{name: kwargs[name]
               for name in self.merge.input_names})
        merged = self.post_merge.apply(merged)
        return merged

    @application(outputs=['readout_chars'])
    def readout_gru(self, target_prev_char_seq, target_prev_char_aux,
                    input_states):
        embeddings = self.lookup.apply(target_prev_char_seq)
        gru_out = self.igru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True), {
                'mask': target_prev_char_aux,
                'input_states': input_states
            }))
        if self.igru_depth > 1:
            gru_out = gru_out[-1]
        readout_chars = self.gru_to_softmax.apply(gru_out)
        return readout_chars

    def get_dim(self, name):
        if name == 'outputs':
            return self.emitter.get_dim(name)
        elif name == 'feedback':
            return self.feedback_brick.get_dim(name)
        elif name == 'readouts':
            return self.readout_dim
        return super(AbstractReadout, self).get_dim(name)
Example #6
class TargetWordEncoder(Initializable):
    """Word encoder in target side use a single RNN to map a charater-level word to a vector"""
    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(TargetWordEncoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.embedding_dim = embedding_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(),
            name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), {'mask': char_aux}))
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        # Time as first dimension
        # only one batch
        embeddings = self.lookup.apply(target_single_char)
        if states is None:
            states = self.dgru.initial_states(batch_size)
        states_dict = {'states': states[0]}
        for i in range(1, self.dgru_depth):
            states_dict['states' + RECURRENTSTACK_SEPARATOR +
                        str(i)] = states[i]
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), states_dict, {
                'mask': mask,
                'iterate': False
            }))
        return gru_out

    @single_emit.property('outputs')
    def single_emit_outputs(self):
        return [
            'gru_out' + RECURRENTSTACK_SEPARATOR + str(i)
            for i in range(self.dgru_depth)
        ]

    def get_dim(self, name):
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        return super(TargetWordEncoder, self).get_dim(name)
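The apply method above collapses per-character GRU states into per-word vectors via batched_dot against sample_matrix. A quick NumPy check of that axis bookkeeping (the sizes are made up; only the shapes matter, and einsum stands in for Theano's batched_dot plus dimshuffle):

import numpy

time, batch, n_words, dim = 7, 2, 3, 4
gru_out = numpy.random.randn(time, batch, dim)
sample_matrix = numpy.random.randn(batch, n_words, time)

# dimshuffle([1, 0, 2]) puts batch first; batched_dot then contracts the
# time axis independently for every batch element.
per_word = numpy.einsum('bwt,btd->bwd',
                        sample_matrix,
                        gru_out.transpose(1, 0, 2))
print(per_word.shape)                     # (2, 3, 4) = (batch, n_words, dim)
# the final dimshuffle([1, 0, 2]) restores time-major layout
print(per_word.transpose(1, 0, 2).shape)  # (3, 2, 4) = (n_words, batch, dim)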
Example #7
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers *
        state_dim + (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
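The builder returns the cost graph together with the updates that carry the last hidden state over to the next mini-batch. A hypothetical driver (not part of the original code; vocab_size and args are assumed to be the usual vocabulary size and argparse namespace) might compile it roughly like this:

import theano
from blocks.graph import ComputationGraph

cost, cross_entropy, updates = build_model_vanilla(vocab_size, args)

cg = ComputationGraph(cost)
# cg.inputs recovers the symbolic 'features' and 'targets' matrices;
# passing `updates` makes each call start from the previous last state.
f_cost = theano.function(cg.inputs, [cost, cross_entropy], updates=updates)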
Example #8
def build_model_cw(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    # Note that this order of the periods makes the faster modules flow into
    # the slower ones, which is the opposite of the original paper
    if args.module_order == "fast_in_slow":
        transitions = [ClockworkBase(
            dim=args.state_dim, activation=Tanh(),
            period=2 ** i) for i in range(args.layers)]
    elif args.module_order == "slow_in_fast":
        transitions = [ClockworkBase(
            dim=args.state_dim,
            activation=Tanh(),
            period=2 ** (args.layers - i - 1)) for i in range(args.layers)]
    else:
        assert False

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)
    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # In the Clockwork case:
    # h = [state, time, state_1, time_1 ...]
    h = h[::2]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO correct the bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        h = tensor.concatenate(h, axis=2)
    else:
        h = h[0] * x_mask
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states
Example #9
def build_model_lstm(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [LSTM(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)
    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(mask=x_mask, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    last_states = {}
    last_cells = {}
    hidden_states = []
    for d in range(args.layers):
        # TODO correct bug
        # h[5 * d] = h[5 * d] * x_mask
        # h[5 * d + 1] = h[5 * d + 1] * x_mask

        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

        h[5 * d].name = "hidden_state_" + str(d)
        h[5 * d + 1].name = "hidden_cell_" + str(d)
        hidden_states.extend([h[5 * d], h[5 * d + 1]])

    # The updates of the hidden states
    # Note: if we have mask, then updating initial state
    # with last state does not make sense anymore.
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))
        updates.append((inits[1][d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if args.layers > 1:
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state_all"

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, gate_values, hidden_states
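The strided slices above are easy to misread. A plain-Python illustration of how the flat output list of a two-layer LSTM stack is carved up (names only; in the real graph each entry is a (time, batch, dim) tensor):

h = ['state', 'cell', 'in', 'forget', 'out',
     'state_1', 'cell_1', 'in_1', 'forget_1', 'out_1']

assert h[2::5] == ['in', 'in_1']          # input gates
assert h[3::5] == ['forget', 'forget_1']  # forget gates
assert h[4::5] == ['out', 'out_1']        # output gates
assert h[::5] == ['state', 'state_1']     # hidden states only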
Example #10
def build_model_soft(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * args.state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(args.state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(args.layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=args.state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)
    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    hidden_states = []
    for d in range(args.layers):
        h[d] = h[d] * x_mask
        last_states[d] = h[d][-1, :, :]
        h[d].name = "hidden_state_" + str(d)
        hidden_states.append(h[d])

    # Concatenate all the states
    if args.layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, cross_entropy = get_costs(presoft, args)

    return cost, cross_entropy, updates, gate_values, hidden_states
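The interleaved output of the soft-gated stack splits the same way; with three layers (names only, plain Python):

h = ['state', 'state_1', 'gate_value_1', 'state_2', 'gate_value_2']

gate_values = h[2::2]
new_h = [h[0]] + h[1::2]

assert gate_values == ['gate_value_1', 'gate_value_2']
assert new_h == ['state', 'state_1', 'state_2']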
Example #11
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
Example #12
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
Example #13
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [
        LSTM(dim=state_dim, activation=Tanh()) for _ in range(layers)
    ]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(input_dim=skip_connections * layers * state_dim +
                          (1 - skip_connections) * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        init_cells[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                      name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {
        "in_gates": in_gates,
        "forget_gates": forget_gates,
        "out_gates": out_gates
    }

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
Example #15
class Decimator(Initializable):
    """Source word encoder, mapping a charater-level word to a vector.
        This encoder is able to learn the morphology.
        For compatibility with previous version, we call it Decimator.
    """
    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.embedding_dim = embedding_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        # representation
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)
        # importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(
                DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
                self.embedding_dim,
                name='src_word_with_fork'),
            name='bidir_src_word_encoder')

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(),
            name='gru_fork')
        # map to an energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [
            self.lookup, self.dgru, self.gru_fork, self.bidir_w, self.wl
        ]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

        if self.dgru_depth > 1:
            gru_out = gru_out[-1]

        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        if name == 'output':
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
Example #16
class SimplePyramidLayer(Initializable):
    """Basic unit for the pyramid model."""
    def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
        super(SimplePyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        activations_x = [Rectifier()] * depth_x

        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta

        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        self.mlp_x = MLP(activations=activations_x,
                         dims=dims_x,
                         name="mlp_x")

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]

        self.transition = RecurrentStack(transition,
                                         name="transition",
                                         skip_connections=True)

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta,
                         dim=target_size,
                         k=k,
                         const=0.00001,
                         name="gmm_wrap")

        self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                      output_size=frame_size, k=k)

        normal_inputs = [name for name in self.transition.apply.sequences
                         if 'mask' not in name]

        self.fork = Fork(normal_inputs,
                         input_dim=4 * hidden_size_recurrent,
                         output_dims=self.transition.get_dims(normal_inputs))

        self.children = [self.mlp_x, self.transition,
                         self.gmm_emitter, self.fork]

    def monitoring_vars(self, cg):
        mu, sigma, coeff = VariableFilter(
            applications=[self.gmm_emitter.gmmmlp.apply],
            name_regex="output")(cg.variables)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma,
                           min_mu, max_mu, mean_mu, max_sigma]

        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))

        self.final_states = []
        for var in h:
            self.final_states.append(
                var[-1].copy(name=var.name + "_final_value"))

        cost = self.gmm_emitter.cost(h[-1], x)
        return cost.mean()

    @application
    def generate(self, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))
        return self.gmm_emitter.emit(h[-1])
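A hypothetical sketch of how the cost and monitoring_vars applications above are typically wired together (layer, x and context are assumed to be an existing SimplePyramidLayer instance and its usual input tensors; none of this is in the original snippet):

from blocks.graph import ComputationGraph

cost = layer.cost(x, context)
cg = ComputationGraph(cost)
monitors = layer.monitoring_vars(cg)  # sigma_mean, sigma_min, mu_min, ...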
Example #17
gmm_emitter = GMMEmitter(gmmmlp = mlp_gmm, output_size = frame_size, k = k)

bricks = [mlp_x, transition, gmm_emitter]

for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

##############
# Test model
##############

x_g = mlp_x.apply(x)
h = transition.apply(x_g)
mu, sigma, coeff = mlp_gmm.apply(h[-2])

#cost = GMM(y, mu, sigma, coeff)

cost = gmm_emitter.cost(h[-2], y)
cost = cost.mean()
cost.name = 'sequence_log_likelihood'

emit = gmm_emitter.emit(h[-2])
emit.name = 'emitter'

cg = ComputationGraph(cost)
model = Model(cost)

#################
Example #18
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    # Note that this order of the periods makes the faster modules flow into
    # the slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)
    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
Example #19
gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k)

bricks = [mlp_x, transition, gmm_emitter]

for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

##############
# Test model
##############

x_g = mlp_x.apply(x)
h = transition.apply(x_g)
mu, sigma, coeff = mlp_gmm.apply(h[-2])

cost = gmm_emitter.cost(h[-2], y)
cost = cost.mean()
cost.name = 'nll'

emit = gmm_emitter.emit(h[-2])
emit.name = 'emitter'

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################
Example #20
class Decimator(Initializable):
    """Char encoder, mapping a char-level word to a vector"""
    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_layers,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.embedding_dim = embedding_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_layers = dgru_layers
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_layers)],
            skip_connections=True)

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(),
            name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), {'mask': char_aux}))
        if self.dgru_layers > 1:
            gru_out = gru_out[-1]
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'], outputs=['gru_out'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        # Time as first dimension
        # only one batch
        embeddings = self.lookup.apply(target_single_char)
        if states is None:
            states = self.dgru.initial_states(batch_size)
        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True), {
                'states': states,
                'mask': mask,
                'iterate': False
            }))
        return gru_out

    def get_dim(self, name):
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)