Example #1
0
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim, output_dims=[dim, dim*2],name='fork',output_names=["linear","gates"], weights_init=initialization.Identity(),biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(),biases_init=Constant(0))

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) 

    doubler = Linear(
                 input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2),
                 biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin,gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) 
Example #2
0
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim,
                output_dims=[dim, dim * 2],
                name='fork',
                output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim,
                         weights_init=initialization.Identity(),
                         biases_init=Constant(0))

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=dim,
                     output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
Example #3
0
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n),
                input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    if first:
        gruApply = gru.apply(linear, gates, mask=x_mask, **kwargs)
    else:
        gruApply = gru.apply(linear, gates, **kwargs)
    return gruApply
Example #4
0
def gru_layer(dim, h, n):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim, output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
Example #5
0
def gru_layer(dim, h, n):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim, output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
Example #6
0
def gru_layer(dim, h, n):
    fork = Fork(
        output_names=["linear" + str(n), "gates" + str(n)],
        name="fork" + str(n),
        input_dim=dim,
        output_dims=[dim, dim * 2],
    )
    gru = GatedRecurrent(dim=dim, name="gru" + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
Example #7
0
class Encoder(Initializable):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        self.fork = Fork([
            name for name in self.transition.apply.sequences if name != 'mask'
        ],
                         prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [
            self.state_dim for _ in self.fork.output_names
        ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]

        embeddings = self.lookup.apply(source_sentence)
        representation = self.transition.apply(
            **merge(self.fork.apply(embeddings, as_dict=True),
                    {'mask': source_sentence_mask}))
        return representation[-1]
Example #8
0
class InnerRecurrent(BaseRecurrent, Initializable):
    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')

        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')

        super(InnerRecurrent, self).__init__(**kwargs)

        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]

    def _push_allocation_config(self):
        self.inner_input_fork.output_dims = self.inner_gru.get_dims(
            self.inner_input_fork.output_names)
        self.outer_input_fork.output_dims = self.inner_gru.get_dims(
            self.outer_input_fork.output_names)

    @recurrent(sequences=['inner_inputs'], states=['states'],
               contexts=['outer_inputs'], outputs=['states'])
    def apply(self, inner_inputs, states, outer_inputs):
        forked_inputs = self.inner_input_fork.apply(inner_inputs, as_dict=True)
        forked_states = self.outer_input_fork.apply(outer_inputs, as_dict=True)

        gru_inputs = {key: forked_inputs[key] + forked_states[key]
                      for key in forked_inputs.keys()}

        new_states = self.inner_gru.apply(
            iterate=False,
            **dict_union(gru_inputs, {'states': states}))
        return new_states  # mean according to the time axis

    def get_dim(self, name):
        if name == 'states':
            return self.inner_gru.get_dim(name)
        else:
            return AttributeError
Example #9
0
class GatedRecurrentWithContext(Initializable):
    def __init__(self, *args, **kwargs):
        self.gated_recurrent = GatedRecurrent(*args, **kwargs)
        self.children = [self.gated_recurrent]

    @application(states=['states'],
                 outputs=['states'],
                 contexts=[
                     'readout_context', 'transition_context', 'update_context',
                     'reset_context'
                 ])
    def apply(self, transition_context, update_context, reset_context, *args,
              **kwargs):
        kwargs['inputs'] += transition_context
        kwargs['update_inputs'] += update_context
        kwargs['reset_inputs'] += reset_context
        # readout_context was only added for the Readout brick, discard it
        kwargs.pop('readout_context')
        return self.gated_recurrent.apply(*args, **kwargs)

    def get_dim(self, name):
        if name in [
                'readout_context', 'transition_context', 'update_context',
                'reset_context'
        ]:
            return self.dim
        return self.gated_recurrent.get_dim(name)

    def __getattr__(self, name):
        if name == 'gated_recurrent':
            raise AttributeError
        return getattr(self.gated_recurrent, name)

    @apply.property('sequences')
    def apply_inputs(self):
        sequences = ['mask', 'inputs']
        if self.use_update_gate:
            sequences.append('update_inputs')
        if self.use_reset_gate:
            sequences.append('reset_inputs')
        return sequences
Example #10
0
class Encoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        self.fork = Fork([name for name in self.transition.apply.sequences
                          if name != 'mask'], prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]

        embeddings = self.lookup.apply(source_sentence)
        representation = self.transition.apply(**merge(
            self.fork.apply(embeddings, as_dict=True),
            {'mask': source_sentence_mask}
        ))
        return representation[-1]
Example #11
0
class Encoder(Initializable):
    """Encoder of RNNsearch model."""

    def __init__(self, blockid, vocab_size, embedding_dim, state_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.blockid = blockid

        self.lookup = LookupTable(name='embeddings' + '_' + self.blockid)
        self.gru = GatedRecurrent(activation=Tanh(), dim=state_dim, name = "GatedRNN" + self.blockid)
        self.fwd_fork = Fork(
            [name for name in self.gru.apply.sequences
             if name != 'mask'], prototype=Linear(), name='fwd_fork' + '_' + self.blockid)

        self.children = [self.lookup, self.gru, self.fwd_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [self.gru.get_dim(name)
                                     for name in self.fwd_fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        embeddings = self.lookup.apply(source_sentence)
        grupara =  merge( self.fwd_fork.apply(embeddings, as_dict=True) , {'mask': source_sentence_mask})
        representation = self.gru.apply(**grupara)
        return representation
Example #12
0
class Encoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.lookup = LookupTable(name='embeddings')
        self.GRU = GatedRecurrent(activation=Tanh(), dim=state_dim)
        self.children = [self.lookup, self.GRU]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        embeddings = self.lookup.apply(source_sentence)
        representation = self.GRU.apply(embeddings, embeddings)
        return representation
Example #13
0
class GatedRecurrentWithContext(Initializable):
    def __init__(self, *args, **kwargs):
        self.gated_recurrent = GatedRecurrent(*args, **kwargs)
        self.children = [self.gated_recurrent]

    @application(states=['states'], outputs=['states'],
                 contexts=['readout_context', 'transition_context',
                           'update_context', 'reset_context'])
    def apply(self, transition_context, update_context, reset_context,
              *args, **kwargs):
        kwargs['inputs'] += transition_context
        kwargs['update_inputs'] += update_context
        kwargs['reset_inputs'] += reset_context
        # readout_context was only added for the Readout brick, discard it
        kwargs.pop('readout_context')
        return self.gated_recurrent.apply(*args, **kwargs)

    def get_dim(self, name):
        if name in ['readout_context', 'transition_context',
                    'update_context', 'reset_context']:
            return self.dim
        return self.gated_recurrent.get_dim(name)

    def __getattr__(self, name):
        if name == 'gated_recurrent':
            raise AttributeError
        return getattr(self.gated_recurrent, name)

    @apply.property('sequences')
    def apply_inputs(self):
        sequences = ['mask', 'inputs']
        if self.use_update_gate:
            sequences.append('update_inputs')
        if self.use_reset_gate:
            sequences.append('reset_inputs')
        return sequences
Example #14
0
iteration = 300 # number of epochs of gradient descent

print "Building Model"
# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim = n_u, output_dim = n_h, name="first_layer")
rnn = GatedRecurrent(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim = n_h, output_dim = n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = rnn.apply(x_transform)
predict = sigm.apply(linear2.apply(h))


# only for generation B x h_dim
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = rnn.apply(x_transform, h_initial, iterate=False)
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'


# Cost function
cost = SquaredError().apply(predict,target)

# Initialization
Example #15
0
class Parrot(Initializable, Random):
    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of vocoder fram
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feedback to the top rnn layer
            full_feedback=False,  # Feedback to all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # How many speakers there are?
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stabilities
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at initialization
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            **kwargs):

        super(Parrot, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim

        self.encoded_input_dim = input_dim

        if self.encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim

        if self.feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

        self.h1_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')

        self.h2_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h2_to_readout')

        self.h3_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h3_to_readout')

        self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h2')

        self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h3')

        self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h2_to_h3')

        if which_cost == 'MSE':
            self.readout_to_output = Linear(input_dim=readouts_dim,
                                            output_dim=output_dim,
                                            name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            self.readout_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=readouts_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='readout_to_output')

        self.encoder = Encoder(encoder_type,
                               num_characters,
                               input_dim,
                               encoder_dim,
                               name='encoder')

        self.children = [
            self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout,
            self.h2_to_readout, self.h3_to_readout, self.h1_to_h2,
            self.h1_to_h3, self.h2_to_h3, self.readout_to_output
        ]

        self.inp_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h1')

        self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h2')

        self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h3')

        self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3]

        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rnn_h_dim,
                              output_dims=[attention_size] * 3,
                              name='h1_to_att')

        self.att_to_readout = Linear(input_dim=self.encoded_input_dim,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')

        self.children += [self.h1_to_att, self.att_to_readout]

        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            self.speaker_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h1')

            self.speaker_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h2')

            self.speaker_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h3')

            self.speaker_to_readout = Linear(input_dim=speaker_dim,
                                             output_dim=readouts_dim,
                                             name='speaker_to_readout')

            if which_cost == 'MSE':
                self.speaker_to_output = Linear(input_dim=speaker_dim,
                                                output_dim=output_dim,
                                                name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = Fork(
                    output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                    input_dim=speaker_dim,
                    output_dims=[
                        output_dim * k_gmm, output_dim * k_gmm, k_gmm
                    ],
                    name='speaker_to_output')

            self.children += [
                self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2,
                self.speaker_to_h3, self.speaker_to_readout,
                self.speaker_to_output
            ]

        if full_feedback:
            self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h2')

            self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h3')
            self.children += [self.out_to_h2, self.out_to_h3]
            weak_feedback = True

        self.weak_feedback = weak_feedback

        if weak_feedback:
            self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h1')
            self.children += [self.out_to_h1]

    def _allocate(self):
        self.initial_w = shared_floatx_zeros((self.encoded_input_dim, ),
                                             name="initial_w")

        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        features = tensor.tensor3('features')
        features_mask = tensor.matrix('features_mask')
        labels = tensor.imatrix('labels')
        labels_mask = tensor.matrix('labels_mask')

        start_flag = tensor.scalar('start_flag')

        if self.use_speaker:
            speaker = tensor.imatrix('speaker_index')
        else:
            speaker = None

        return features, features_mask, labels, labels_mask, \
            speaker, start_flag

    def initial_states(self, batch_size):
        initial_h1 = self.rnn1.initial_states(batch_size)
        initial_h2 = self.rnn2.initial_states(batch_size)
        initial_h3 = self.rnn3.initial_states(batch_size)

        last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h3 = shared_floatx_zeros((batch_size, self.rnn_h_dim))

        # Defining for all
        initial_k = tensor.zeros((batch_size, self.attention_size),
                                 dtype=floatX)
        last_k = shared_floatx_zeros((batch_size, self.attention_size))

        # Trainable initial state for w. Why not for k?
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)

        last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim))

        return initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k

    @application
    def compute_cost(self, features, features_mask, labels, labels_mask,
                     speaker, start_flag, batch_size):

        if speaker is None:
            assert not self.use_speaker

        target_features = features[1:]
        mask = features_mask[1:]

        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            input_features = features[:-1]

            if self.feedback_noise_level:
                noise = self.theano_rng.normal(size=input_features.shape,
                                               avg=0.,
                                               std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [out_cell_h1, out_gat_h1]
            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]
            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3,
                spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(start_flag, initial_h3, last_h3)
        input_w = tensor.switch(start_flag, initial_w, last_w)
        input_k = tensor.switch(start_flag, initial_k, last_k)

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX),
                                 2)

        def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t,
                 h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh):

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1, input_h2, input_h3, input_k, input_w, None, None
            ])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [h1_out, h2_out, h3_out]
        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            cost = tensor.sum((predicted - target_features)**2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            coeff = tensor.nnet.softmax(coeff.reshape(
                (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars

    @application
    def sample_model_fun(self, labels, labels_mask, speaker, num_samples,
                         seq_size):

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(num_samples)

        initial_x = numpy.zeros((num_samples, self.output_dim), dtype=floatX)

        cell_shape = (seq_size, num_samples, self.rnn_h_dim)
        gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)

            # Applied before the broadcast.
            spk_readout = self.speaker_to_readout.apply(emb_speaker)
            spk_output = self.speaker_to_output.apply(emb_speaker)

            # Add dimension to repeat with time.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3,
                spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += spk_cell_h1
            cell_h2 += spk_cell_h2
            cell_h3 += spk_cell_h3
            gat_h1 += spk_gat_h1
            gat_h2 += spk_gat_h2
            gat_h3 += spk_gat_h3

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX),
                                 2)

        def sample_step(inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t,
                        inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1,
                        h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1):

            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [out_cell_h1_t, out_gat_h1_t]
                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t
                ]
                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(cell_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(cell_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(cell_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [h1_out_t, h2_out_t, h3_out_t]
            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape(
                            coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(mu_t, sigma_t, coeff_t,
                                           self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[],
            outputs_info=[
                initial_x, initial_h1, initial_h2, initial_h3, initial_k,
                initial_w, None, None, None
            ])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(self, labels_tr, labels_mask_tr, features_mask_tr,
                     speaker_tr, num_samples, num_steps):

        features, features_mask, labels, labels_mask, speaker, start_flag = \
            self.symbolic_input_variables()

        sample_x, k, w, pi, phi, pi_att, updates = \
            self.sample_model_fun(
                labels, labels_mask, speaker,
                num_samples, num_steps)

        theano_inputs = [labels, labels_mask]
        numpy_inputs = (labels_tr, labels_mask_tr)

        if self.use_speaker:
            theano_inputs += [speaker]
            numpy_inputs += (speaker_tr, )

        return function(theano_inputs, [sample_x, k, w, pi, phi, pi_att],
                        updates=updates)(*numpy_inputs)

    def sample_using_input(self, data_tr, num_samples):
        # Used to predict the values using the dataset

        features, features_mask, labels, labels_mask, speaker, start_flag = \
            self.symbolic_input_variables()

        cost, updates, attention_vars = self.compute_cost(
            features, features_mask, labels, labels_mask, speaker, start_flag,
            num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        theano_vars = [
            features, features_mask, labels, labels_mask, speaker, start_flag
        ]
        theano_vars = [x for x in theano_vars if x is not None]
        theano_vars = list(set(theano_vars))
        theano_vars = {x.name: x for x in theano_vars}

        theano_inputs = []
        numpy_inputs = []

        for key in data_tr.keys():
            theano_inputs.append(theano_vars[key])
            numpy_inputs.append(data_tr[key])

        return function(theano_inputs, [sample_x, k, w, pi, phi, pi_att],
                        updates=updates)(*numpy_inputs)
Example #16
0
class TestGatedRecurrent(unittest.TestCase):
    def setUp(self):
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2),
            activation=Tanh(), gate_activation=Tanh())
        self.gated.initialize()
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(),
            activation=Tanh(), gate_activation=Tanh(),
            use_update_gate=False, seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        z = tensor.matrix('z')
        r = tensor.matrix('r')
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)

        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val) +
                  (1 - z_val) * h0_val)
        assert_allclose(h1_val, next_h(h0_val, x_val, zi_val, ri_val)[0],
                        rtol=1e-6)

    def test_reset_only_many_steps(self):
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=floatX)
        x_val = numpy.ones((24, 4, 3), dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        mask_val = numpy.ones((24, 4), dtype=floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=floatX)
        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        for i in range(1, 25):
            r_val = numpy.tanh(h_val[i - 1].dot(U) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W) +
                                  x_val[i - 1])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val,  mask_val)[0], 1e-03)
Example #17
0
class GatedRecurrentFull(Initializable):
    """A wrapper around the GatedRecurrent brick that improves usability.
    It contains:
        * A fork to map to initialize the reset and the update units.
        * Better initialization to initialize the different pieces
    While this works, there is probably a better more elegant way to do this.

    Parameters
    ----------
    hidden_dim : int
        dimension of the hidden state
    activation : :class:`.Brick`
    gate_activation: :class:`.Brick`

    state_to_state_init: object
        Weight Initialization
    state_to_reset_init: object
        Weight Initialization
    state_to_update_init: obje64
        Weight Initialization

    input_to_state_transform: :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_reset_transform: :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_update_transform: :class:`.Brick`
        [CvMG14] uses Linear transform

    References
    ---------
        self.rnn = GatedRecurrent(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=self.activation,
                gate_activation=self.gate_activation)
    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=['hidden_dim', 'state_to_state_init', 'state_to_update_init', 'state_to_reset_init'],
            initialization=['input_to_state_transform', 'input_to_update_transform', 'input_to_reset_transform'])
    def __init__(self, hidden_dim, activation=None, gate_activation=None,
        state_to_state_init=None, state_to_update_init=None, state_to_reset_init=None,
        input_to_state_transform=None, input_to_update_transform=None, input_to_reset_transform=None,
        **kwargs):

        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        self.use_mine = True
        if self.use_mine:
            self.rnn = GatedRecurrentFast(
                    weights_init=Constant(np.nan),
                    dim=self.hidden_dim,
                    activation=activation,
                    gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(
                    weights_init=Constant(np.nan),
                    dim=self.hidden_dim,
                    activation=activation,
                    gate_activation=gate_activation)

        self.children = [self.rnn,
                self.input_to_state_transform, self.input_to_update_transform, self.input_to_reset_transform]
        self.children.extend(self.rnn.children)

    def initialize(self):
        super(GatedRecurrentFull, self).initialize()

        self.input_to_state_transform.initialize()
        self.input_to_update_transform.initialize()
        self.input_to_reset_transform.initialize()

        self.rnn.initialize()

        weight_shape = (self.hidden_dim, self.hidden_dim)
        state_to_state = self.state_to_state_init.generate(rng=self.rng, shape=weight_shape)
        state_to_update= self.state_to_update_init.generate(rng=self.rng, shape=weight_shape)
        state_to_reset = self.state_to_reset_init.generate(rng=self.rng, shape=weight_shape)

        self.rnn.state_to_state.set_value(state_to_state)

        if self.use_mine:
            self.rnn.state_to_update.set_value(state_to_update)
            self.rnn.state_to_reset.set_value(state_to_reset)
        else:
            self.rnn.state_to_gates.set_value(np.hstack((state_to_update, state_to_reset)))

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, mask=None):
        """

        Parameters
        ----------
        inputs_ : :class:`~tensor.TensorVariable`
            sequence to feed into GRU. Axes are mb, sequence, features

        mask : :class:`~tensor.TensorVariable`
            A 1D binary array with 1 or 0 to represent data given available.

        Returns
        -------
        output: :class:`theano.tensor.TensorVariable`
            sequence to feed out. Axes are batch, sequence, features
        """
        states_from_in = self.input_to_state_transform.apply(input_)
        update_from_in = self.input_to_update_transform.apply(input_)
        reset_from_in = self.input_to_reset_transform.apply(input_)

        gate_inputs = tensor.concatenate([update_from_in, reset_from_in], axis=2)

        if self.use_mine:
            output = self.rnn.apply(inputs=states_from_in, update_inputs=update_from_in, reset_inputs=reset_from_in, mask=mask)
        else:
            output = self.rnn.apply(inputs=states_from_in, gate_inputs=gate_inputs)

        return output
Example #18
0
class Parrot(Initializable, Random):
    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of vocoder fram
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feedback to the top rnn layer
            full_feedback=False,  # Feedback to all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # How many speakers there are?
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stabilities
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at initialization
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            raw_output=False,
            **kwargs):

        super(Parrot, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim

        self.encoded_input_dim = input_dim

        self.raw_output = raw_output

        if self.encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim

        if self.feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

        self.h1_to_readout = Linear(
            input_dim=rnn_h_dim,
            output_dim=readouts_dim,
            name='h1_to_readout')

        self.h2_to_readout = Linear(
            input_dim=rnn_h_dim,
            output_dim=readouts_dim,
            name='h2_to_readout')

        self.h3_to_readout = Linear(
            input_dim=rnn_h_dim,
            output_dim=readouts_dim,
            name='h3_to_readout')

        self.h1_to_h2 = Fork(
            output_names=['rnn2_inputs', 'rnn2_gates'],
            input_dim=rnn_h_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='h1_to_h2')

        self.h1_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=rnn_h_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='h1_to_h3')

        self.h2_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=rnn_h_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='h2_to_h3')

        if which_cost == 'MSE':
            self.readout_to_output = Linear(
                input_dim=readouts_dim,
                output_dim=output_dim,
                name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            self.readout_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=readouts_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='readout_to_output')

        self.encoder = Encoder(
            encoder_type,
            num_characters,
            input_dim,
            encoder_dim,
            name='encoder')

        self.children = [
            self.encoder,
            self.rnn1,
            self.rnn2,
            self.rnn3,
            self.h1_to_readout,
            self.h2_to_readout,
            self.h3_to_readout,
            self.h1_to_h2,
            self.h1_to_h3,
            self.h2_to_h3,
            self.readout_to_output]

        self.inp_to_h1 = Fork(
            output_names=['rnn1_inputs', 'rnn1_gates'],
            input_dim=self.encoded_input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='inp_to_h1')

        self.inp_to_h2 = Fork(
            output_names=['rnn2_inputs', 'rnn2_gates'],
            input_dim=self.encoded_input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='inp_to_h2')

        self.inp_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=self.encoded_input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='inp_to_h3')

        self.children += [
            self.inp_to_h1,
            self.inp_to_h2,
            self.inp_to_h3]

        self.h1_to_att = Fork(
            output_names=['alpha', 'beta', 'kappa'],
            input_dim=rnn_h_dim,
            output_dims=[attention_size] * 3,
            name='h1_to_att')

        self.att_to_readout = Linear(
            input_dim=self.encoded_input_dim,
            output_dim=readouts_dim,
            name='att_to_readout')

        self.children += [
            self.h1_to_att,
            self.att_to_readout]

        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            self.speaker_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h1')

            self.speaker_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h2')

            self.speaker_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h3')

            self.speaker_to_readout = Linear(
                input_dim=speaker_dim,
                output_dim=readouts_dim,
                name='speaker_to_readout')

            if which_cost == 'MSE':
                self.speaker_to_output = Linear(
                    input_dim=speaker_dim,
                    output_dim=output_dim,
                    name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = Fork(
                    output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                    input_dim=speaker_dim,
                    output_dims=[
                        output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                    name='speaker_to_output')

            self.children += [
                self.embed_speaker,
                self.speaker_to_h1,
                self.speaker_to_h2,
                self.speaker_to_h3,
                self.speaker_to_readout,
                self.speaker_to_output]

        if full_feedback:
            self.out_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=output_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='out_to_h2')

            self.out_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=output_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='out_to_h3')
            self.children += [
                self.out_to_h2,
                self.out_to_h3]
            weak_feedback = True

        self.weak_feedback = weak_feedback

        if weak_feedback:
            self.out_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=output_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='out_to_h1')
            self.children += [
                self.out_to_h1]

        if self.raw_output:
            self.sampleRnn = SampleRnn()
            self.children += [self.sampleRnn]

    def _allocate(self):
        self.initial_w = shared_floatx_zeros(
            (self.encoded_input_dim,), name="initial_w")

        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        features = tensor.tensor3('features')
        features_mask = tensor.matrix('features_mask')
        labels = tensor.imatrix('labels')
        labels_mask = tensor.matrix('labels_mask')

        start_flag = tensor.scalar('start_flag')

        if self.use_speaker:
            speaker = tensor.imatrix('speaker_index')
        else:
            speaker = None

        if self.raw_output:
            raw_sequence = tensor.itensor3('raw_audio')
        else:
            raw_sequence = None

        return features, features_mask, labels, labels_mask, \
            speaker, start_flag, raw_sequence

    def initial_states(self, batch_size):
        initial_h1 = self.rnn1.initial_states(batch_size)
        initial_h2 = self.rnn2.initial_states(batch_size)
        initial_h3 = self.rnn3.initial_states(batch_size)

        last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h3 = shared_floatx_zeros((batch_size, self.rnn_h_dim))

        # Defining for all
        initial_k = tensor.zeros(
            (batch_size, self.attention_size), dtype=floatX)
        last_k = shared_floatx_zeros((batch_size, self.attention_size))

        # Trainable initial state for w. Why not for k?
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)

        last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim))

        return initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k

    @application
    def compute_cost(
            self, features, features_mask, labels, labels_mask,
            speaker, start_flag, batch_size, raw_audio=None):

        if speaker is None:
            assert not self.use_speaker

        target_features = features[1:]
        mask = features_mask[1:]

        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            input_features = features[:-1]

            if self.feedback_noise_level:
                noise = self.theano_rng.normal(
                    size=input_features.shape,
                    avg=0., std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [
                out_cell_h1, out_gat_h1]
            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [
                out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]
            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2,
                spk_cell_h3, spk_gat_h3]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(
            start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(
            start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(
            start_flag, initial_h3, last_h3)
        input_w = tensor.switch(
            start_flag, initial_w, last_w)
        input_k = tensor.switch(
            start_flag, initial_k, last_k)

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def step(
                inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t,
                h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh):

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(
                inp_h1_t,
                gat_h1_t,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [
                h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                inp_h2_t + h1inp_h2,
                gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [
                h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                inp_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1,
                input_h2,
                input_h3,
                input_k,
                input_w,
                None,
                None])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [
            h1_out, h2_out, h3_out]
        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            cost = tensor.sum((predicted - target_features) ** 2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            coeff = tensor.nnet.softmax(
                coeff.reshape(
                    (-1, self.k_gmm))).reshape(
                        coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        cost_raw = None
        if self.raw_output:
            raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0)
            raw_mask = raw_mask.dimshuffle(1, 0)

            # breakpointOp = PdbBreakpoint("Raw mask breakpoint")
            # condition = tensor.gt(raw_mask.shape[0], 0)
            # raw_mask = breakpointOp(condition, raw_mask)

            predicted_transposed = predicted.dimshuffle(1, 0, 2)

            last_h0, last_big_h0 = self.sampleRnn.initial_states(batch_size)
            raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2)
            raw_audio_reshaped = raw_audio_reshaped.reshape((raw_audio_reshaped.shape[0], -1))
            cost_raw, ip_cost, all_params, ip_params, other_params, new_h0, new_big_h0 =\
                self.sampleRnn.apply(raw_audio_reshaped, predicted_transposed, last_h0, last_big_h0, start_flag, raw_mask)

            if self.sampleRnn.N_RNN == 1:
                new_h0 = tensor.unbroadcast(new_h0, 1)
                new_big_h0 = tensor.unbroadcast(new_big_h0, 1)


            updates.append((last_h0, new_h0))
            updates.append((last_big_h0, new_big_h0))
            # cost = cost + 80.*cost_raw
            alpha_ = numpy.float32(0.)
            beta_ = numpy.float32(1.)
            cost = alpha_*cost + beta_*cost_raw

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars, cost_raw

    @application
    def sample_model_fun(
            self, labels, labels_mask, speaker, num_samples, seq_size):

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(num_samples)

        initial_x = numpy.zeros(
            (num_samples, self.output_dim), dtype=floatX)

        cell_shape = (seq_size, num_samples, self.rnn_h_dim)
        gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)

            # Applied before the broadcast.
            spk_readout = self.speaker_to_readout.apply(emb_speaker)
            spk_output = self.speaker_to_output.apply(emb_speaker)

            # Add dimension to repeat with time.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2,
                spk_cell_h3, spk_gat_h3]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += spk_cell_h1
            cell_h2 += spk_cell_h2
            cell_h3 += spk_cell_h3
            gat_h1 += spk_gat_h1
            gat_h2 += spk_gat_h2
            gat_h3 += spk_gat_h3

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def sample_step(
                inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t,
                inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1,
                k_tm1, w_tm1):

            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [
                    out_cell_h1_t, out_gat_h1_t]
                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t,
                    out_cell_h3_t, out_gat_h3_t]
                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(
                cell_h1_t,
                gat_h1_t,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [
                h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                cell_h2_t + h1inp_h2,
                gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [
                h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                cell_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [
                h1_out_t, h2_out_t, h3_out_t]
            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape(
                            coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(
                    mu_t, sigma_t, coeff_t, self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            sequences=[
                cell_h1,
                gat_h1,
                cell_h2,
                gat_h2,
                cell_h3,
                gat_h3],
            non_sequences=[],
            outputs_info=[
                initial_x,
                initial_h1,
                initial_h2,
                initial_h3,
                initial_k,
                initial_w,
                None,
                None,
                None])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(
            self, labels_tr, labels_mask_tr, features_mask_tr,
            speaker_tr, num_samples, num_steps):

        features, features_mask, labels, labels_mask, speaker, start_flag, raw_sequence = \
            self.symbolic_input_variables()

        sample_x, k, w, pi, phi, pi_att, updates = \
            self.sample_model_fun(
                labels, labels_mask, speaker,
                num_samples, num_steps)

        theano_inputs = [labels, labels_mask]
        numpy_inputs = (labels_tr, labels_mask_tr)

        if self.use_speaker:
            theano_inputs += [speaker]
            numpy_inputs += (speaker_tr,)

        return function(
            theano_inputs,
            [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)

    def sample_using_input(self, data_tr, num_samples):
        # Used to predict the values using the dataset

        features, features_mask, labels, labels_mask, speaker, start_flag, raw_sequence = \
            self.symbolic_input_variables()

        cost, updates, attention_vars = self.compute_cost(
            features, features_mask, labels, labels_mask,
            speaker, start_flag, num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        theano_vars = [
            features, features_mask, labels, labels_mask, speaker, start_flag]
        theano_vars = [x for x in theano_vars if x is not None]
        theano_vars = list(set(theano_vars))
        theano_vars = {x.name: x for x in theano_vars}

        theano_inputs = []
        numpy_inputs = []

        for key in data_tr.keys():
            theano_inputs.append(theano_vars[key])
            numpy_inputs.append(data_tr[key])

        return function(
            theano_inputs, [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)
Example #19
0
class TestGatedRecurrent(unittest.TestCase):
    def setUp(self):
        self.gated = GatedRecurrent(
            dim=3, activation=Tanh(),
            gate_activation=Tanh(), weights_init=Constant(2))
        self.gated.initialize()
        self.reset_only = GatedRecurrent(
            dim=3, activation=Tanh(),
            gate_activation=Tanh(),
            weights_init=IsotropicGaussian(), seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        gi = tensor.matrix('gi')
        h1 = self.gated.apply(x, gi, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, gi], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=theano.config.floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        W_val = 2 * numpy.ones((3, 3), dtype=theano.config.floatX)

        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val) +
                  (1 - z_val) * h0_val)
        assert_allclose(
            h1_val, next_h(h0_val, x_val, numpy.hstack([zi_val, ri_val]))[0],
            rtol=1e-6)

    def test_many_steps(self):
        x = tensor.tensor3('x')
        gi = tensor.tensor3('gi')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, gi, mask=mask)
        calc_h = theano.function(inputs=[x, gi, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones((24, 4, 3),
                           dtype=theano.config.floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        zi_val = 2 * ri_val
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        W = self.reset_only.state_to_state.get_value()
        Wz = self.reset_only.state_to_gates.get_value()[:, :3]
        Wr = self.reset_only.state_to_gates.get_value()[:, 3:]

        for i in range(1, 25):
            z_val = numpy.tanh(h_val[i - 1].dot(Wz) + zi_val[i - 1])
            r_val = numpy.tanh(h_val[i - 1].dot(Wr) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W) +
                                  x_val[i - 1])
            h_val[i] = z_val * h_val[i] + (1 - z_val) * h_val[i - 1]
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(
            h_val,
            calc_h(x_val, numpy.concatenate(
                [zi_val, ri_val], axis=2), mask_val)[0],
            1e-04)

        # Also test that initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
Example #20
0
class TestGatedRecurrent(unittest.TestCase):
    def setUp(self):
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2),
            activation=Tanh(), gate_activation=Tanh())
        self.gated.initialize()
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(),
            activation=Tanh(), gate_activation=Tanh(),
            use_update_gate=False, rng=numpy.random.RandomState(1))
        self.reset_only.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        z = tensor.matrix('z')
        r = tensor.matrix('r')
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)

        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
                  + (1 - z_val) * h0_val)
        assert_allclose(h1_val, next_h(h0_val, x_val, zi_val, ri_val)[0],
                        rtol=1e-6)

    def test_reset_only_many_steps(self):
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=floatX)
        x_val = numpy.ones((24, 4, 3), dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        mask_val = numpy.ones((24, 4), dtype=floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=floatX)
        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        for i in range(1, 25):
            r_val = numpy.tanh(h_val[i - 1].dot(U) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W)
                                  + x_val[i - 1])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val,  mask_val)[0], 1e-03)
Example #21
0
class Scribe(Initializable):
    def __init__(self,
                 k=20,
                 rec_h_dim=400,
                 att_size=10,
                 num_letters=68,
                 sampling_bias=0.,
                 attention_type="graves",
                 epsilon=1e-6,
                 attention_alignment=1.,
                 **kwargs):
        super(Scribe, self).__init__(**kwargs)

        # For now only softmax and graves are supported.
        assert attention_type in ["graves", "softmax"]

        readouts_dim = 1 + 6 * k

        self.k = k
        self.rec_h_dim = rec_h_dim
        self.att_size = att_size
        self.num_letters = num_letters
        self.sampling_bias = sampling_bias
        self.attention_type = attention_type
        self.epsilon = epsilon
        self.attention_alignment = attention_alignment

        self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')

        self.inp_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                              input_dim=3,
                              output_dims=[rec_h_dim, 2 * rec_h_dim],
                              name='inp_to_h1')

        self.h1_to_readout = Linear(input_dim=rec_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')

        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rec_h_dim,
                              output_dims=[att_size] * 3,
                              name='h1_to_att')

        self.att_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                              input_dim=num_letters,
                              output_dims=[rec_h_dim, 2 * rec_h_dim],
                              name='att_to_h1')

        self.att_to_readout = Linear(input_dim=num_letters,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')

        self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

        self.children = [
            self.cell1, self.inp_to_h1, self.h1_to_readout, self.h1_to_att,
            self.att_to_h1, self.att_to_readout, self.emitter
        ]

    def _allocate(self):
        self.initial_w = shared_floatx_zeros((self.num_letters, ),
                                             name="initial_w")

        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        data = tensor.tensor3('features')
        data_mask = tensor.matrix('features_mask')
        context = tensor.imatrix('transcripts')
        context_mask = tensor.matrix('transcripts_mask')
        start_flag = tensor.scalar('start_flag')

        return data, data_mask, context, context_mask, start_flag

    def initial_states(self, batch_size):
        initial_h1 = self.cell1.initial_states(batch_size)
        initial_kappa = shared_floatx_zeros((batch_size, self.att_size))
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)
        last_h1 = shared_floatx_zeros((batch_size, self.rec_h_dim))
        last_w = shared_floatx_zeros((batch_size, self.num_letters))
        use_last_states = shared(numpy.asarray(0., dtype=floatX))

        return initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states

    @application
    def compute_cost(self, data, data_mask, context, context_mask, start_flag,
                     batch_size):
        x = data[:-1]
        target = data[1:]
        mask = data_mask[1:]
        xinp_h1, xgat_h1 = self.inp_to_h1.apply(x)
        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        input_h1 = tensor.switch(use_last_states, last_h1, initial_h1)
        input_w = tensor.switch(use_last_states, last_w, initial_w)

        u = tensor.shape_padleft(tensor.arange(context.shape[1], dtype=floatX),
                                 2)

        def step(xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1, ctx):

            attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

            h1_t = self.cell1.apply(xinp_h1_t + attinp_h1,
                                    xgat_h1_t + attgat_h1,
                                    h1_tm1,
                                    iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t)
            else:
                a_t = tensor.exp(a_t)

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

            return h1_t, k_t, w_t

        (h1, kappa, w), scan_updates = theano.scan(
            fn=step,
            sequences=[xinp_h1, xgat_h1],
            non_sequences=[context_oh],
            outputs_info=[input_h1, initial_kappa, input_w])

        readouts = self.h1_to_readout.apply(h1) + \
            self.att_to_readout.apply(w)

        cost = self.emitter.cost(readouts, target)
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((initial_kappa,
                        tensor.switch(start_flag, 0. * initial_kappa,
                                      kappa[-1])))
        updates.append((last_w, w[-1]))
        updates.append((use_last_states, 1. - start_flag))

        return cost, scan_updates + updates

    @application
    def sample_model(self, context, context_mask, n_steps, batch_size):

        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        initial_x = self.emitter.initial_outputs(batch_size)

        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        u = tensor.shape_padleft(tensor.arange(context.shape[1], dtype=floatX),
                                 2)

        def sample_step(x_tm1, h1_tm1, k_tm1, w_tm1, ctx):
            xinp_h1_t, xgat_h1_t = self.inp_to_h1.apply(x_tm1)

            attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

            h1_t = self.cell1.apply(xinp_h1_t + attinp_h1,
                                    xgat_h1_t + attgat_h1,
                                    h1_tm1,
                                    iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t)
            else:
                a_t = tensor.exp(a_t)

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

            readout_t = self.h1_to_readout.apply(h1_t) + \
                self.att_to_readout.apply(w_t)

            x_t = self.emitter.emit(readout_t)

            mu_t, sigma_t, corr_t, pi_t, penup_t = \
                self.emitter.components(readout_t)

            return x_t, h1_t, k_t, w_t, pi_t, phi_t, a_t

        (sample_x, h1, k, w, pi, phi,
         pi_att), updates = theano.scan(fn=sample_step,
                                        n_steps=n_steps,
                                        sequences=[],
                                        non_sequences=[context_oh],
                                        outputs_info=[
                                            initial_x.eval(), initial_h1,
                                            initial_kappa, initial_w, None,
                                            None, None
                                        ])

        return sample_x, pi, phi, pi_att, updates
Example #22
0
    def __init__(self, config):
        inp = tensor.imatrix('bytes')

        embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX),
                              name='embedding_matrix')
        in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim))
        in_repr.name = 'in_repr'

        bricks = []
        states = []

        # Construct predictive GRU hierarchy
        hidden = []
        costs = []
        next_target = in_repr.dimshuffle(1, 0, 2)
        for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                                   config.cost_factors,
                                                   config.hidden_q)):
            init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
                                       name='st0_%d'%i)

            linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                            name="lstm_in_%d"%i)
            lstm = GatedRecurrent(dim=hdim, activation=config.activation_function,
                        name="lstm_rec_%d"%i)
            linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i)
            tanh = Tanh('lstm_out_tanh_%d'%i)
            bricks += [linear, lstm, linear2, tanh]
            if i > 0:
                linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim,
                                 name='lstm_in2_%d'%i)
                bricks += [linear1]

            next_target = tensor.cast(next_target, dtype=theano.config.floatX)
            inter = linear.apply(theano.gradient.disconnected_grad(next_target))
            if i > 0:
                inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:]))
            new_hidden = lstm.apply(inputs=inter[:,:,:hdim],
                                    gate_inputs=inter[:,:,hdim:],
                                    states=init_state)
            states.append((init_state, new_hidden[-1, :, :]))

            hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)]
            pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:]))
            costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
            costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
            diff = next_target - pred
            next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)


        # Construct output from hidden states
        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]

        out_parts = []
        out_dims = config.out_hidden + [config.io_dim]
        for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
            pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                                name='pred_linear_%d'%i)
            bricks.append(pred_linear)
            lin = theano.gradient.disconnected_grad(state)
            out_parts.append(pred_linear.apply(lin))

        # Do prediction and calculate cost
        out = sum(out_parts)

        if len(out_dims) > 1:
            out = config.out_hidden_act[0](name='out_act0').apply(out)
            mlp = MLP(dims=out_dims,
                      activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])]
                                 +[Identity()],
                      name='out_mlp')
            bricks.append(mlp)
            out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1))
                           ).reshape((inp.shape[0],inp.shape[1]+1,-1))

        pred = out.argmax(axis=2)

        cost = Softmax().categorical_cross_entropy(inp.flatten(),
                                                   out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1],
                                                                config.io_dim))).mean()
        error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean()

        sgd_cost = cost + sum(costs)
            
        # Initialize all bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

        # apply noise
        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
        if config.weight_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.weight_noise)
        sgd_cost = cg.outputs[0]
        cost = cg.outputs[1]
        error_rate = cg.outputs[2]
        costs = cg.outputs[3:]


        # put stuff into self that is usefull for training or extensions
        self.sgd_cost = sgd_cost

        sgd_cost.name = 'sgd_cost'
        for i in range(len(costs)):
            costs[i].name = 'pred_cost_%d'%i
        cost.name = 'cost'
        error_rate.name = 'error_rate'
        self.monitor_vars = [costs, [cost],
                             [error_rate]]

        self.out = out[:,1:,:]
        self.pred = pred[:,1:]

        self.states = states
Example #23
0
class GatedRecurrentFull(Initializable):
    """A wrapper around the GatedRecurrent brick that improves usability.
    It contains:
        * A fork to map to initialize the reset and the update units.
        * Better initialization to initialize the different pieces
    While this works, there is probably a better more elegant way to do this.

    Parameters
    ----------
    hidden_dim : int
        dimension of the hidden state
    activation : :class:`.Brick`
    gate_activation: :class:`.Brick`

    state_to_state_init: object
        Weight Initialization
    state_to_reset_init: object
        Weight Initialization
    state_to_update_init: obje64
        Weight Initialization

    input_to_state_transform: :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_reset_transform: :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_update_transform: :class:`.Brick`
        [CvMG14] uses Linear transform

    References
    ---------
        self.rnn = GatedRecurrent(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=self.activation,
                gate_activation=self.gate_activation)
    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=[
        'hidden_dim', 'state_to_state_init', 'state_to_update_init',
        'state_to_reset_init'
    ],
          initialization=[
              'input_to_state_transform', 'input_to_update_transform',
              'input_to_reset_transform'
          ])
    def __init__(self,
                 hidden_dim,
                 activation=None,
                 gate_activation=None,
                 state_to_state_init=None,
                 state_to_update_init=None,
                 state_to_reset_init=None,
                 input_to_state_transform=None,
                 input_to_update_transform=None,
                 input_to_reset_transform=None,
                 **kwargs):

        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        self.use_mine = True
        if self.use_mine:
            self.rnn = GatedRecurrentFast(weights_init=Constant(np.nan),
                                          dim=self.hidden_dim,
                                          activation=activation,
                                          gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(weights_init=Constant(np.nan),
                                      dim=self.hidden_dim,
                                      activation=activation,
                                      gate_activation=gate_activation)

        self.children = [
            self.rnn, self.input_to_state_transform,
            self.input_to_update_transform, self.input_to_reset_transform
        ]
        self.children.extend(self.rnn.children)

    def initialize(self):
        super(GatedRecurrentFull, self).initialize()

        self.input_to_state_transform.initialize()
        self.input_to_update_transform.initialize()
        self.input_to_reset_transform.initialize()

        self.rnn.initialize()

        weight_shape = (self.hidden_dim, self.hidden_dim)
        state_to_state = self.state_to_state_init.generate(rng=self.rng,
                                                           shape=weight_shape)
        state_to_update = self.state_to_update_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_reset = self.state_to_reset_init.generate(rng=self.rng,
                                                           shape=weight_shape)

        self.rnn.state_to_state.set_value(state_to_state)

        if self.use_mine:
            self.rnn.state_to_update.set_value(state_to_update)
            self.rnn.state_to_reset.set_value(state_to_reset)
        else:
            self.rnn.state_to_gates.set_value(
                np.hstack((state_to_update, state_to_reset)))

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, mask=None):
        """

        Parameters
        ----------
        inputs_ : :class:`~tensor.TensorVariable`
            sequence to feed into GRU. Axes are mb, sequence, features

        mask : :class:`~tensor.TensorVariable`
            A 1D binary array with 1 or 0 to represent data given available.

        Returns
        -------
        output: :class:`theano.tensor.TensorVariable`
            sequence to feed out. Axes are batch, sequence, features
        """
        states_from_in = self.input_to_state_transform.apply(input_)
        update_from_in = self.input_to_update_transform.apply(input_)
        reset_from_in = self.input_to_reset_transform.apply(input_)

        gate_inputs = tensor.concatenate([update_from_in, reset_from_in],
                                         axis=2)

        if self.use_mine:
            output = self.rnn.apply(inputs=states_from_in,
                                    update_inputs=update_from_in,
                                    reset_inputs=reset_from_in,
                                    mask=mask)
        else:
            output = self.rnn.apply(inputs=states_from_in,
                                    gate_inputs=gate_inputs)

        return output