class TestBidirectionalStack(unittest.TestCase):
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)
        ]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))), dtype=theano.config.floatX)
        self.x_val = (numpy.ones(
            (24, 4, 3), dtype=theano.config.floatX) * self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
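        # mask out the second half of the steps for the last batch example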
        self.mask_val[12:24, 3] = 0

    def test_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')

        calc_stack_layers = [
            theano.function([x, mask],
                            self.stack.apply(x, mask=mask)[i])
            for i in range(len(self.layers))
        ]
        stack_layers = [
            f(self.x_val, self.mask_val) for f in calc_stack_layers
        ]

        h_val = self.x_val
        for stack_layer_value, bidir_net in zip(stack_layers, self.layers):
            calc = theano.function([x, mask], bidir_net.apply(x, mask=mask))
            simple_layer_value = calc(h_val, self.mask_val)
            assert_allclose(stack_layer_value, simple_layer_value, rtol=1e-04)
            h_val = simple_layer_value[..., :3]

    def test_dims(self):
        self.assertEqual(self.stack.get_dim("inputs"), 3)
        for i in range(len(self.layers)):
            state_name = self.stack.suffix("states", i)
            self.assertEqual(self.stack.get_dim(state_name), 6)
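
A minimal sketch (not part of the test above) of the naming and sizing conventions that test_dims relies on; it assumes a configured stack is enough for suffix and get_dim, without calling initialize:

prototype = SimpleRecurrent(dim=3, activation=Tanh())
stack = RecurrentStack([Bidirectional(weights_init=Orthogonal(),
                                      prototype=prototype)
                        for _ in range(3)])
for level in range(3):
    # Layer 0 keeps the plain name; deeper layers get a separator-suffixed
    # name (e.g. "states#1" with blocks' default separator).
    name = stack.suffix("states", level)
    # Bidirectional concatenates forward and backward states, so each
    # layer's state dimension is 2 * 3 = 6.
    print(name, stack.get_dim(name))
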
Example #3
class TestRecurrentStack(unittest.TestCase):
    def setUp(self):
        depth = 4
        self.depth = depth
        dim = 3  # don't change, hardwired in the code
        transitions = [LSTM(dim=dim) for _ in range(depth)]
        self.stack0 = RecurrentStack(transitions,
                                     weights_init=Constant(2),
                                     biases_init=Constant(0))
        self.stack0.initialize()

        self.stack2 = RecurrentStack(transitions,
                                     weights_init=Constant(2),
                                     biases_init=Constant(0),
                                     skip_connections=True)
        self.stack2.initialize()

    def do_one_step(self, stack, skip_connections=False, low_memory=False):
        depth = self.depth

        # batch=2
        h0_val = 0.1 * numpy.array([[[1, 1, 0], [0, 1, 1]]] * depth,
                                   dtype=theano.config.floatX)
        c0_val = 0.1 * numpy.array([[[1, 1, 0], [0, 1, 1]]] * depth,
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([range(12), range(12, 24)],
                                  dtype=theano.config.floatX)
        # we will use the same weights on all layers
        W_state2x_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        kwargs = OrderedDict()
        for d in range(depth):
            if d > 0:
                suffix = RECURRENTSTACK_SEPARATOR + str(d)
            else:
                suffix = ''
            if d == 0 or skip_connections:
                kwargs['inputs' + suffix] = tensor.matrix('inputs' + suffix)
                kwargs['inputs' + suffix].tag.test_value = x_val
            kwargs['states' + suffix] = tensor.matrix('states' + suffix)
            kwargs['states' + suffix].tag.test_value = h0_val[d]
            kwargs['cells' + suffix] = tensor.matrix('cells' + suffix)
            kwargs['cells' + suffix].tag.test_value = c0_val[d]
        results = stack.apply(iterate=False, low_memory=low_memory, **kwargs)
        next_h = theano.function(inputs=list(kwargs.values()),
                                 outputs=results)

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        h1_val = []
        x_v = x_val
        args_val = []
        for d in range(depth):
            if d == 0 or skip_connections:
                args_val.append(x_val)
            h0_v = h0_val[d]
            args_val.append(h0_v)
            c0_v = c0_val[d]
            args_val.append(c0_v)

            # omitting biases because they are zero
            activation = numpy.dot(h0_v, W_state_val) + x_v
            if skip_connections and d > 0:
                activation += x_val

            i_t = sigmoid(activation[:, :3] + c0_v * W_cell_to_in)
            f_t = sigmoid(activation[:, 3:6] + c0_v * W_cell_to_forget)
            next_cells = f_t * c0_v + i_t * numpy.tanh(activation[:, 6:9])
            o_t = sigmoid(activation[:, 9:12] +
                          next_cells * W_cell_to_out)
            h1_v = o_t * numpy.tanh(next_cells)
            # current layer output state transformed to input of next
            x_v = numpy.dot(h1_v, W_state2x_val)

            h1_val.append(h1_v)

        res = next_h(*args_val)
        for d in range(depth):
            assert_allclose(h1_val[d], res[d * 2], rtol=1e-6)

    def test_one_step(self):
        self.do_one_step(self.stack0)
        self.do_one_step(self.stack0, low_memory=True)
        self.do_one_step(self.stack2, skip_connections=True)
        self.do_one_step(self.stack2, skip_connections=True, low_memory=True)

    def do_many_steps(self, stack, skip_connections=False, low_memory=False):
        depth = self.depth

        # 24 steps
        #  4 batch examples
        # 12 dimensions per step
        x_val = (0.1 * numpy.asarray(
            list(itertools.islice(itertools.permutations(range(12)), 0, 24)),
            dtype=theano.config.floatX))
        x_val = numpy.ones((24, 4, 12),
                           dtype=theano.config.floatX) * x_val[:, None, :]
        # mask the second half of the steps for the last batch example
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        # unroll all states and cells for all steps and also initial value
        h_val = numpy.zeros((depth, 25, 4, 3), dtype=theano.config.floatX)
        c_val = numpy.zeros((depth, 25, 4, 3), dtype=theano.config.floatX)
        # we will use the same weights on all layers
        W_state2x_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        kwargs = OrderedDict()

        for d in range(depth):
            if d > 0:
                suffix = RECURRENTSTACK_SEPARATOR + str(d)
            else:
                suffix = ''
            if d == 0 or skip_connections:
                kwargs['inputs' + suffix] = tensor.tensor3('inputs' + suffix)
                kwargs['inputs' + suffix].tag.test_value = x_val

        kwargs['mask'] = tensor.matrix('mask')
        kwargs['mask'].tag.test_value = mask_val
        results = stack.apply(iterate=True, low_memory=low_memory, **kwargs)
        calc_h = theano.function(inputs=list(kwargs.values()),
                                 outputs=results)

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        for i in range(1, 25):
            x_v = x_val[i - 1]
            h_vs = []
            c_vs = []
            for d in range(depth):
                h_v = h_val[d][i - 1, :, :]
                c_v = c_val[d][i - 1, :, :]
                activation = numpy.dot(h_v, W_state_val) + x_v
                if skip_connections and d > 0:
                    activation += x_val[i - 1]

                i_t = sigmoid(activation[:, :3] + c_v * W_cell_to_in)
                f_t = sigmoid(activation[:, 3:6] + c_v * W_cell_to_forget)
                c_v1 = f_t * c_v + i_t * numpy.tanh(activation[:, 6:9])
                o_t = sigmoid(activation[:, 9:12] +
                              c_v1 * W_cell_to_out)
                h_v1 = o_t * numpy.tanh(c_v1)
                h_v = (mask_val[i - 1, :, None] * h_v1 +
                       (1 - mask_val[i - 1, :, None]) * h_v)
                c_v = (mask_val[i - 1, :, None] * c_v1 +
                       (1 - mask_val[i - 1, :, None]) * c_v)
                # current layer output state transformed to input of next
                x_v = numpy.dot(h_v, W_state2x_val)

                h_vs.append(h_v)
                c_vs.append(c_v)

            for d in range(depth):
                h_val[d][i, :, :] = h_vs[d]
                c_val[d][i, :, :] = c_vs[d]

        args_val = [x_val]*(depth if skip_connections else 1) + [mask_val]
        res = calc_h(*args_val)
        for d in range(depth):
            assert_allclose(h_val[d][1:], res[d * 2], rtol=1e-4)
            assert_allclose(c_val[d][1:], res[d * 2 + 1], rtol=1e-4)

        # Also test that initial state is a parameter
        for h in results:
            initial_states = VariableFilter(roles=[INITIAL_STATE])(
                ComputationGraph(h))
            assert all(is_shared_variable(initial_state)
                       for initial_state in initial_states)

    def test_many_steps(self):
        self.do_many_steps(self.stack0)
        self.do_many_steps(self.stack0, low_memory=True)
        self.do_many_steps(self.stack2, skip_connections=True)
        self.do_many_steps(self.stack2, skip_connections=True, low_memory=True)
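
The assertions above read layer d's states from res[d * 2] and its cells from res[d * 2 + 1]; a minimal sketch (separate from the test) of where that interleaved layout comes from:

x = tensor.tensor3('x')  # time x batch x 12, since an LSTM of dim 3 expects 4 * 3 input features
stack = RecurrentStack([LSTM(dim=3) for _ in range(4)],
                       weights_init=Constant(2), biases_init=Constant(0))
stack.initialize()
results = stack.apply(inputs=x, iterate=True)
# Each LSTM layer contributes (states, cells), so the stack interleaves its
# outputs per layer: layer d's states sit at results[2 * d] and its cells at
# results[2 * d + 1], which is exactly how do_one_step and do_many_steps
# index them.
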
Example #4
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, state_2, ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers *
        state_dim + (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
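
These build_model_* functions return the cost together with the state-carrying updates. A hypothetical sketch of wiring them into a Blocks training algorithm (the step rule, learning rate, and surrounding names are illustrative, assuming vocab_size and args are defined as in the rest of this file):

from blocks.algorithms import GradientDescent, Scale
from blocks.graph import ComputationGraph

cost, cross_entropy, updates = build_model_vanilla(vocab_size, args)
cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))
# Feed the last hidden states of this batch back in as the next batch's
# initial states (truncated backpropagation through time across batches).
algorithm.add_updates(updates)
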
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)
    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
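
build_model_soft additionally returns the per-layer gate values. A small, hypothetical sketch of turning them into monitorable scalars (the mean aggregation and the monitoring extension shown here are illustrative, not taken from the original code):

from blocks.extensions.monitoring import TrainingDataMonitoring

cost, cross_entropy, updates, gate_values = build_model_soft(vocab_size, args)
monitored = []
for i, gate_value in enumerate(gate_values):
    mean_gate = gate_value.mean()
    mean_gate.name = "mean_" + gate_value.name  # e.g. "mean_gate_value_0"
    monitored.append(mean_gate)
monitor = TrainingDataMonitoring(monitored + [cost], after_batch=True)
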
Example #8
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [
        LSTM(dim=state_dim, activation=Tanh()) for _ in range(layers)
    ]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(input_dim=skip_connections * layers * state_dim +
                          (1 - skip_connections) * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        init_cells[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                      name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {
        "in_gates": in_gates,
        "forget_gates": forget_gates,
        "out_gates": out_gates
    }

    h = h[::5]

    # Now we have:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values