def test_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=dim,
        weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=output_dim, source_names=["states"],
                      emitter=TestEmitter(name="emitter"), name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size, output_dim), dtype=floatX),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = [variable.eval() for variable in
                              generator.generate(
                                  iterate=True, batch_size=batch_size,
                                  n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
Beispiel #2
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(attended_dim=state_dim,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim,
                          merge_prototype=Linear(use_bias=True))

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Beispiel #3
0
	def __init__(self, dimen, vocab_size): #{
		# No idea what this is doing, but otherwise "allocated" is not set
		super(MorphGen, self).__init__(self)

		# The encoder 
		encoder = Bidirectional(SimpleRecurrent(dim=dimen, activation=Tanh()))

		# What is this doing ? 
		fork = Fork([name for name in encoder.prototype.apply.sequences if name != 'mask'])
		fork.input_dim = dimen
		fork.output_dims = [encoder.prototype.get_dim(name) for name in fork.input_names]

		lookup = LookupTable(vocab_size, dimen)

		transition = SimpleRecurrent(dim=dimen, activation=Tanh(), name="transition")

		atten = SequenceContentAttention(state_names=transition.apply.states,attended_dim=2*dimen, match_dim=dimen, name="attention")

		readout = Readout(
			readout_dim=vocab_size,
			source_names=[transition.apply.states[0],
			atten.take_glimpses.outputs[0]],
			emitter=SoftmaxEmitter(name="emitter"),
			feedback_brick=LookupFeedback(vocab_size, dimen),
			name="readout");

		generator = SequenceGenerator(readout=readout, transition=transition, attention=atten,name="generator")
	
		self.lookup = lookup
		self.fork = fork
		self.encoder = encoder
		self.generator = generator
		self.children = [lookup, fork, encoder, generator]
Beispiel #4
0
    def __init__(self, dimension, alphabet_size, **kwargs):
        super(WordReverser, self).__init__(**kwargs)
        encoder = Bidirectional(
            SimpleRecurrent(dim=dimension, activation=Tanh()))
        fork = Fork([name for name in encoder.prototype.apply.sequences
                    if name != 'mask'])
        fork.input_dim = dimension
        fork.output_dims = [dimension for name in fork.input_names]
        lookup = LookupTable(alphabet_size, dimension)
        transition = SimpleRecurrent(
            activation=Tanh(),
            dim=dimension, name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dimension, match_dim=dimension, name="attention")
        readout = Readout(
            readout_dim=alphabet_size,
            source_names=[transition.apply.states[0],
                          attention.take_glimpses.outputs[0]],
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(alphabet_size, dimension),
            name="readout")
        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            name="generator")

        self.lookup = lookup
        self.fork = fork
        self.encoder = encoder
        self.generator = generator
        self.children = [lookup, fork, encoder, generator]
Beispiel #5
0
    def __init__(self,hidden_size_recurrent, k, **kwargs):
        super(Scribe, self).__init__(**kwargs)

        readout_size =6*k+1
        transition = [GatedRecurrent(dim=hidden_size_recurrent, 
                      name = "gru_{}".format(i) ) for i in range(3)]

        transition = RecurrentStack( transition,
                name="transition", skip_connections = True)

        emitter = BivariateGMMEmitter(k = k)

        source_names = [name for name in transition.apply.states 
                                      if 'states' in name]

        readout = Readout(
            readout_dim = readout_size,
            source_names =source_names,
            emitter=emitter,
            name="readout")

        self.generator = SequenceGenerator(readout=readout, 
                                  transition=transition,
                                  name = "generator")

        self.children = [self.generator]
Beispiel #6
0
    def __init__(self, dimension, alphabet_size, **kwargs):
        super(SimpleGenerator, self).__init__(**kwargs)
        lookup = LookupTable(alphabet_size, dimension)
        transition = SimpleRecurrent(activation=Tanh(),
                                     dim=dimension,
                                     name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dimension,
            match_dim=dimension,
            name="attention")
        readout = Readout(readout_dim=alphabet_size,
                          source_names=[
                              transition.apply.states[0],
                              attention.take_glimpses.outputs[0]
                          ],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedback_brick=LookupFeedback(
                              alphabet_size, dimension),
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        self.lookup = lookup
        self.generator = generator
        self.children = [lookup, generator]
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=['states', 'feedback',
                          self.attention.take_glimpses.outputs[0]],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'], prototype=Linear())
        )

        self.children = [self.sequence_generator]
Beispiel #8
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=1000).apply,
                Maxout(num_pieces=2).apply,
                Linear(input_dim=state_dim / 2, output_dim=100,
                       use_bias=False).apply,
                Linear(input_dim=100).apply
            ]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(),
                                                    dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([
            name for name in self.transition.apply.contexts +
            self.transition.apply.states if name != 'readout_context'
        ],
                         prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]
Beispiel #9
0
def getRnnGenerator(vocab_size,hidden_dim,input_dim=512):
    """
    "Apply" the RNN to the input x
    For initializing the network, the vocab size needs to be known
    Default of the hidden layer is set tot 512 like Karpathy
    """
    generator = SequenceGenerator(
        Readout(readout_dim = vocab_size,
                source_names = ["states"], # transition.apply.states ???
                emitter = SoftmaxEmitter(name="emitter"),
                feedback_brick = LookupFeedback(
                    vocab_size,
                    input_dim,
                    name = 'feedback'
                ),
                name = "readout"
        ),
        MySimpleRecurrent(
            name = "transition",
            activation = Tanh(),
            dim = hidden_dim
        ),
        weights_init = IsotropicGaussian(0.01),
        biases_init  = Constant(0),
        name = "generator"
    )
    generator.push_initialization_config()
    generator.transition.weights_init = IsotropicGaussian(0.01)
    generator.initialize()
    
    return generator
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=dim,
        weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=readout_dim, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(readout_dim, feedback_dim),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size), dtype='int64'),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
def test_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=dim,
        weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=output_dim, source_names=["states"],
                      emitter=TestEmitter(name="emitter"), name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size, output_dim), dtype=floatX),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = [variable.eval() for variable in
                              generator.generate(
                                  iterate=True, batch_size=batch_size,
                                  n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
Beispiel #12
0
def test_recurrentstack_sequence_generator():
    """Test RecurrentStack behaviour inside a SequenceGenerator.

    """
    floatX = theano.config.floatX
    rng = numpy.random.RandomState(1234)

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    depth=2
    transitions = [LSTM(dim=dim) for _ in range(depth)]
    transition = RecurrentStack(transitions,fast=True,
                                weights_init=Constant(2),
                                biases_init=Constant(0))
    generator = SequenceGenerator(
        Readout(readout_dim=output_dim, source_names=["states_%d"%(depth-1)],
                emitter=TestEmitter()),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0),
        seed=1234)
    generator.initialize()

    y = tensor.tensor3('y')

    cost = generator.cost(y)

    # Check that all states can be accessed and not just the state connected
    # to readout.
    cg = ComputationGraph(cost)
    from blocks.roles import INPUT, OUTPUT
    dropout_target = VariableFilter(roles=[INNER_OUTPUT],
                                    # bricks=transitions,
                                    # name_regex='*'
                                    )(cg.variables)
    assert_equal(len(dropout_target), depth)
Beispiel #13
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=['states', 'feedback',
                          self.attention.take_glimpses.outputs[0]],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'], prototype=Linear())
        )

        self.children = [self.sequence_generator]
Beispiel #14
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        readout = Readout(
            source_names=['states', 'feedback', self.attention.take_glimpses.outputs[0]],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim,
            merge_prototype=Linear(use_bias=True))

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'], prototype=Linear())
        )

        self.children = [self.sequence_generator]
    def __init__(self, vocab_size, embedding_dim, 
            state_dim, theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.theano_seed = theano_seed

        self.transition = GatedRecurrent(dim=state_dim, 
                activation=Tanh(), name='decoder')

        readout = Readout(
                source_names=['states'],
                readout_dim=self.vocab_size,
                merged_dim=state_dim)

        self.sequence_generator = SequenceGenerator(
                readout=readout,
                transition=self.transition,
                fork=Fork([name for name in self.transition.apply.sequences
                    if name != 'mask'], prototype=Linear()))

        self.children = [self.sequence_generator]
Beispiel #16
0
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition",
                                activation=Tanh(),
                                dim=dim,
                                weights_init=Orthogonal())
    generator = SequenceGenerator(LinearReadout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(readout_dim, feedback_dim),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask],
                                [costs])(numpy.zeros((n_steps, batch_size),
                                                     dtype='int64'),
                                         numpy.ones((n_steps, batch_size),
                                                    dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(iterate=True,
                                                batch_size=batch_size,
                                                n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
Beispiel #17
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]
Beispiel #18
0
def build_model(alphabet_size, config):
    layers = config['lstm_layers']
    dimensions = [config['lstm_dim_' + str(i)] for i in range(layers)]
    uniform_width = config['lstm_init_width']
    stack = []
    for dim in dimensions:
        stack.append(LSTM(dim=dim, use_bias=True, 
                          weights_init = Uniform(width=uniform_width),
                          forget_init=Constant(1.)))
    recurrent_stack = RecurrentStack(stack, name='transition')

    readout = Readout(readout_dim=alphabet_size,
                      source_names=['states#' + str(layers - 1)],
                      emitter=SoftmaxEmitter(name='emitter'),
                      feedback_brick=LookupFeedback(alphabet_size,
                                                    feedback_dim=alphabet_size,
                                                    name='feedback'),
                      name='readout')

    generator = SequenceGenerator(readout=readout,
                                  transition=recurrent_stack,
                                  weights_init=Uniform(width=uniform_width),
                                  biases_init=Constant(0),
                                  name='generator')
    generator.push_initialization_config()
    generator.initialize()

    x = tensor.lmatrix('features')
    mask = tensor.fmatrix('features_mask')
    cost_matrix = generator.cost_matrix(x, mask=mask)

    log2e = math.log(math.e, 2)
    if 'batch_length' in config:
        length = config['batch_length'] - config['batch_overlap']

        cost = log2e * aggregation.mean(cost_matrix[:,-length:].sum(), 
                                    mask[:,-length:].sum())
    else:
        cost = log2e * aggregation.mean(cost_matrix[:,:].sum(), 
                                    mask[:,:].sum())
        
    cost.name = 'bits_per_character'

    return generator, cost
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, 
            state_dim, theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.theano_seed = theano_seed

        self.transition = GatedRecurrent(dim=state_dim, 
                activation=Tanh(), name='decoder')

        readout = Readout(
                source_names=['states'],
                readout_dim=self.vocab_size,
                merged_dim=state_dim)

        self.sequence_generator = SequenceGenerator(
                readout=readout,
                transition=self.transition,
                fork=Fork([name for name in self.transition.apply.sequences
                    if name != 'mask'], prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=['representation', 'source_sentence_mask',
                         'target_sentence_mask', 'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask,
            target_sentence, target_sentence_mask):
        source_sentence_mask = source_sentence_mask.T
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        cost = self.sequence_generator.cost_matrix(**{
            'mask': target_sentence_mask,
            'outputs': target_sentence})
Beispiel #20
0
class Decoder(Initializable):
    """Decoder of RNNsearch model which uses a full softmax output
    layer and embedding matrices for feedback.
    """
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 att_dim,
                 maxout_dim,
                 representation_dim,
                 attention_strategy='content',
                 attention_sources='s',
                 readout_sources='sfa',
                 memory='none',
                 memory_size=500,
                 seq_len=50,
                 init_strategy='last',
                 make_prunable=False,
                 theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick.
        
        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            att_dim (int): Size of attention match vector
            maxout_dim (int): Size of maxout layer
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         cf.  ``_initialize_attention``
            attention_sources (string): Defines the sources used by the 
                                        attention model 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the 
                                      readout network. 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf.  ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf.  ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=representation_dim / 2,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        att_dim = att_dim if att_dim > 0 else state_dim
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition, representation_dim,
            att_dim, attention_sources, readout_sources, memory, memory_size)

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
        post_layers = [
            Bias(dim=maxout_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            Linear(input_dim=maxout_dim / 2,
                   output_dim=embedding_dim,
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply
        ]
        if make_prunable:
            post_merge = PrunableInitializableFeedforwardSequence(post_layers)
        else:
            post_merge = InitializableFeedforwardSequence(post_layers)
        readout = Readout(source_names=src_names,
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1,
                                                 theano_seed=theano_seed),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=post_merge,
                          merged_dim=maxout_dim)

        # Build sequence generator accordingly
        if make_prunable:
            self.sequence_generator = PrunableSequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))
        else:
            self.sequence_generator = SequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=[
        'representation', 'source_sentence_mask', 'target_sentence_mask',
        'target_sentence'
    ],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask, target_sentence,
             target_sentence_mask):

        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(
            **{
                'mask': target_sentence_mask,
                'outputs': target_sentence,
                'attended': representation,
                'attended_mask': source_sentence_mask
            })

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]

    @application
    def generate(self, source_shape, representation, **kwargs):
        return self.sequence_generator.generate(
            n_steps=2 * source_shape[1],
            batch_size=source_shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_shape).T,
            **kwargs)
class Decoder(Initializable):
    """Decoder of RNNsearch model."""
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 representation_dim,
                 theano_seed=None,
                 **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1,
                                                 theano_seed=theano_seed),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=[
        'representation', 'source_sentence_mask', 'target_sentence_mask',
        'target_sentence'
    ],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask, target_sentence,
             target_sentence_mask):

        source_sentence_mask = source_sentence_mask.T
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(
            **{
                'mask': target_sentence_mask,
                'outputs': target_sentence,
                'attended': representation,
                'attended_mask': source_sentence_mask
            })

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]

    @application
    def generate(self, source_sentence, representation, **kwargs):
        return self.sequence_generator.generate(
            n_steps=2 * source_sentence.shape[1],
            batch_size=source_sentence.shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_sentence.shape).T,
            **kwargs)
Beispiel #22
0
                   name = "gru_{}".format(i) ) for i in range(3)]

transition = RecurrentStack( transition,
            name="transition", skip_connections = True)

emitter = BivariateGMMEmitter(k = k)

source_names = [name for name in transition.apply.states if 'states' in name]
readout = Readout(
    readout_dim = readout_size,
    source_names =source_names,
    emitter=emitter,
    name="readout")

generator = SequenceGenerator(readout=readout, 
                              transition=transition,
                              name = "generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

#generator.transition.weights_init = initialization.Identity(0.98)
#generator.transition.biases_init = IsotropicGaussian(0.01,0.9)
generator.transition.push_initialization_config()
generator.initialize()

cost_matrix = generator.cost_matrix(x, x_mask)
cost = cost_matrix.sum(axis = 0).mean()
cost.name = "nll"
Beispiel #23
0
    def __init__(self, config, vocab_size):
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        context_bag = to_bag(context, vocab_size)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt')
        #embed.weights_init = Constant(embeddings_initial_value)

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Build the encoder bricks
        transition = GatedRecurrent(activation=Tanh(),
                                    dim=config.generator_lstm_size,
                                    name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=cenc_dim,
            match_dim=config.generator_lstm_size,
            name="attention")
        readout = Readout(readout_dim=vocab_size,
                          source_names=[
                              transition.apply.states[0],
                              attention.take_glimpses.outputs[0]
                          ],
                          emitter=MaskedSoftmaxEmitter(context_bag=context_bag,
                                                       name='emitter'),
                          feedback_brick=LookupFeedback(
                              vocab_size, config.feedback_size),
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        cost = generator.cost(answer,
                              answer_mask.astype(theano.config.floatX),
                              attended=cenc,
                              attended_mask=context_mask.astype(
                                  theano.config.floatX),
                              name="cost")
        self.predictions = generator.generate(
            n_steps=7,
            batch_size=config.batch_size,
            attended=cenc,
            attended_mask=context_mask.astype(theano.config.floatX),
            iterate=True)[1]

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # initialize new stuff manually (change!)
        generator.weights_init = IsotropicGaussian(0.01)
        generator.biases_init = Constant(0)
        generator.push_allocation_config()
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
def test_sequence_generator():
    """Test a sequence generator with no contexts and continuous outputs.

    Such sequence generators can be used to model e.g. dynamical systems.

    """
    rng = numpy.random.RandomState(1234)

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = SimpleRecurrent(activation=Tanh(), dim=dim,
                                 weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=output_dim, source_names=["states"],
                emitter=TestEmitter()),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    y_test = rng.uniform(size=(n_steps, batch_size, output_dim)).astype(floatX)
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = theano.function([y, mask], [costs])(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 115.593, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 3.8531, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5)

    # Test 'generate' method
    states, outputs, costs = [variable.eval() for variable in
                              generator.generate(
                                  states=rng.uniform(
                                      size=(batch_size, dim)).astype(floatX),
                                  iterate=True, batch_size=batch_size,
                                  n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
    assert_allclose(outputs.sum(), -0.33683, rtol=1e-5)
    assert_allclose(states.sum(), 15.7909, rtol=1e-5)
    # There is no generation cost in this case, since generation is
    # deterministic
    assert_allclose(costs.sum(), 0.0)
Beispiel #25
0
class NoLookupDecoder(Initializable):
    """This is the decoder implementation without embedding layer or
    softmax. The target sentence is represented as a sequence of #
    vectors as defined by the sparse feature map.
    """
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 representation_dim,
                 attention_strategy='content',
                 attention_sources='s',
                 readout_sources='sfa',
                 memory='none',
                 memory_size=500,
                 seq_len=50,
                 init_strategy='last',
                 theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick without embedding.
        
        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         cf.  ``_initialize_attention``
            attention_sources (string): Defines the sources used by the 
                                        attention model 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the 
                                      readout network. 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf.  ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf.  ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(NoLookupDecoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition, representation_dim,
            state_dim, attention_sources, readout_sources, memory, memory_size)

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=src_names,
            readout_dim=embedding_dim,
            emitter=NoLookupEmitter(initial_output=-1,
                                    readout_dim=embedding_dim,
                                    cost_brick=SquaredError()),
            #                        cost_brick=CategoricalCrossEntropy()),
            feedback_brick=TrivialFeedback(output_dim=embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=state_dim, name='maxout_bias').apply,
                Maxout(num_pieces=2, name='maxout').apply,
                Linear(input_dim=state_dim / 2,
                       output_dim=embedding_dim,
                       use_bias=False,
                       name='softmax0').apply,
                Logistic(name='softmax1').apply
            ]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]

    @application(inputs=[
        'representation', 'representation_mask', 'target_sentence_mask',
        'target_sentence'
    ],
                 outputs=['cost'])
    def cost(self, representation, representation_mask, target_sentence,
             target_sentence_mask):

        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(
            **{
                'mask': target_sentence_mask,
                'outputs': target_sentence,
                'attended': representation,
                'attended_mask': representation_mask
            })

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]

    @application
    def generate(self, source_shape, representation, **kwargs):
        return self.sequence_generator.generate(
            n_steps=2 * source_shape[1],
            batch_size=source_shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_shape).T,
            **kwargs)
Beispiel #26
0
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_params().items()],
                        width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost_matrix(x[:, :]).sum(),
                                x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost, params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[FinishAfter(after_n_batches=num_batches),
                        TrainingDataMonitoring([cost], prefix="this_step",
                                               after_batch=True),
                        TrainingDataMonitoring([cost], prefix="average",
                                               every_n_batches=100),
                        Checkpoint(save_path, every_n_batches=500),
                        Printing(every_n_batches=100)])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(theano.config.floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states),
                                  dtype=theano.config.floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
Beispiel #27
0
            weights_init=Orthogonal())

rnn = RecurrentStack([lstm1, lstm2, lstm3],
                     name="transition")

readout = Readout(readout_dim = alphabet_size,
                  source_names=["states#2"],
                  emitter=SoftmaxEmitter(name="emitter"),
                  feedback_brick=LookupFeedback(alphabet_size,
                                                feedback_dim=alphabet_size,
                                                name="feedback"),
                  name="readout")

seq_gen = SequenceGenerator(readout=readout,
                            transition=rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0),
                            name="generator")

seq_gen.push_initialization_config()
rnn.weights_init = Orthogonal()
seq_gen.initialize()

# z markov_tutorial
x = tensor.lvector('features')
x = x.reshape( (x.shape[0], 1) )
cost = aggregation.mean(seq_gen.cost_matrix(x[:,:]).sum(), x.shape[1])
cost.name = "sequence_log_likelihood"
cost_cg = ComputationGraph(cost)

# theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png", var_with_name_simple=True)
Beispiel #28
0
                                       repeat=20)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh())
    generator = SequenceGenerator(
        Readout(
            readout_dim=vocab_size,
            source_names=["states"],  # transition.apply.states ???
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(vocab_size,
                                          feedback_dim,
                                          name='feedback'),
            name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Build the cost computation graph.
    x = tensor.lmatrix('inchar')

    cost = generator.cost(outputs=x)
    cost.name = "sequence_cost"
Beispiel #29
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode",
        choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
        " and `sample` to sample a sequence generated by an"
        " existing one.")
    parser.add_argument("prefix",
                        default="sine",
                        help="The prefix for model, timing and state files")
    parser.add_argument("--steps",
                        type=int,
                        default=100,
                        help="Number of steps to plot")
    args = parser.parse_args()

    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8

    transition = GatedRecurrent(name="transition", activation=Tanh(), dim=dim)
    generator = SequenceGenerator(LinearReadout(
        readout_dim=num_states,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(num_states, feedback_dim, name='feedback'),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape)
         for key, value in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()
        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state, None)
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        sample = ComputationGraph(
            generator.generate(n_steps=args.steps, batch_size=1,
                               iterate=True)).function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        assert False
Beispiel #30
0
    def __init__(
        self,
        recordings_source,
        labels_source,
        eos_label,
        num_features,
        num_phonemes,
        dim_dec,
        dims_bidir,
        dims_bottom,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        lm=None,
        character_map=None,
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        bottom_activation=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=True,
        energy_normalizer=None,  # softmax is th edefault set in SequenceContentAndConvAttention
        **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        bottom_activation = bottom_activation
        post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if len(dims_bottom) else num_features, subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout is activation
                        # (because Maxout in a way is not one). However
                        # a single layer Maxout network works with the trick below.
                        # For deeper Maxout network one has to use the
                        # Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        self.batch_inputs = [
            self.recordings, self.recordings_source, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)
Beispiel #31
0
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        readout = Readout(
            source_names=['states', 'feedback', self.attention.take_glimpses.outputs[0]],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'], prototype=Linear())
        )

        self.children = [self.sequence_generator]

    @application(inputs=['representation', 'source_sentence_mask',
                         'target_sentence_mask', 'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, source_sentence_mask,
             target_sentence, target_sentence_mask):

        source_sentence_mask = source_sentence_mask.T
        target_sentence = target_sentence.T
        target_sentence_mask = target_sentence_mask.T

        # Get the cost matrix
        cost = self.sequence_generator.cost_matrix(
                    **{'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'attended': representation,
                       'attended_mask': source_sentence_mask}
        )

        return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]

    @application
    def generate(self, source_sentence, representation):
        return self.sequence_generator.generate(
            n_steps=2 * source_sentence.shape[1],
            batch_size=source_sentence.shape[0],
            attended=representation,
            attended_mask=tensor.ones(source_sentence.shape).T)
Beispiel #32
0
              state_names = source_names,
              state_dims = [hidden_size_recurrent],
              attended_dim = context_size,
              name = "attention")

#ipdb.set_trace()
# Verify source names
readout = Readout(
    readout_dim = hidden_size_recurrent,
    source_names =source_names + ['feedback'] + ['glimpses'],
    emitter=gmm_emitter,
    feedback_brick = feedback,
    name="readout")

generator = SequenceGenerator(readout=readout, 
                              transition=transition,
                              attention = attention,
                              name = "generator")

mlp_context = MLP(activations = activations_context,
                  dims = dims_context)

bricks = [mlp_context]

for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.push_initialization_config()
Beispiel #33
0
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (("%e"%x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim,
                                           int(dropout*10),
                                           shnum(learning_rate), batch_size,
                                           shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d'%max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g'%max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d'%int(uniform*100)

    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s"%old_model_name)

    #----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    fast=True, skip_connections=skip or top)
        if skip:
            source_names=['states'] + ['states_%d'%d for d in range(1,depth)]
        else:
            source_names=['states_%d'%(depth-1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names=['states']

    emitter = SketchEmitter(mix_dim=mix_dim,
                            epsilon=epsilon,
                            name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    normal_inputs = [name for name in transition.apply.sequences
                     if 'mask' not in name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform*2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length,batch_size,3)).astype(floatX)
    x = x[:max_length,:,:]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value
                     in params.items()],
                    width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d"%model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=transitions,
                                        name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown sttep method %s'%step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, params=cg.parameters,
        step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(
            param.norm(2), name + "_norm"))
        observables.append(named_copy(
            algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource+'.hdf5')

    train_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                             which_set='train', sources=('features',),
                             load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                            which_set='test', sources=('features',),
                            load_in_memory=True)
    test_stream  = DataStream(test_ds,
                              iteration_scheme=SequentialScheme(
                                  test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [Timing(every_n_batches=10),
                   TrainingDataMonitoring(
                       observables, prefix="train",
                       every_n_batches=10),
                   DataStreamMonitoring(
                       [cost],  # without dropout
                       test_stream,
                       prefix="test",
                       on_resumption=True,
                       after_epoch=False,  # by default this is True
                       every_n_batches=100),
                   # all monitored data is ready so print it...
                   # (next steps may take more time and we want to see the
                   # results as soon as possible so print as soon as you can)
                   Printing(every_n_batches=10),
                   # perform multiple dumps at different intervals
                   # so if one of them breaks (has nan) we can hopefully
                   # find a model from few batches ago in the other
                   Dump(jobname, every_n_batches=11),
                   Dump(jobname+'.test', every_n_batches=100),
                   Sample(generator, steps=max_length,
                          path=jobname+'.test',
                          every_n_batches=100),
                   ProgressBar(),
                   FinishAfter(after_n_epochs=epochs)
                    # This shows a way to handle NaN emerging during
                    # training: simply finish it.
                    .add_condition("after_batch", _is_nan),
                   ]

    if bokeh:
        from blocks.extensions.plot import Plot
        extensions.append(Plot(
            'sketch',
            channels=[
                ['cost'],]))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
        )

    main_loop.run()
Beispiel #34
0
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them

    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(
        features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(
        batch_cost,
        features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(
        batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_parameters().items()],
                        width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost,
            parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.), 
                AdaDelta(epsilon=1e-8) #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects)
                                     ]))
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)
        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm], 
            prefix="average",
            every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid",
            every_n_batches=1500, before_training=False,
            data_stream=valid_stream)
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path, ),
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(algorithm.total_step_norm)],
                      [average_monitoring.record_name(algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
            words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] 
                     for w in raw_words]
        max_word_length = max([len(w) for w in words])
        
        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features, mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function([features, features_mask, initial_states], 
                                       [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_.get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i+1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space, numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
                    print("Average cost", total_word_cost / num_words)
                    print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)

        # Embed questions and cntext
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        bricks.append(embed)

        qembed = embed.apply(question)
        cembed = embed.apply(context)

        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + qlstms + clstms

        # Calculate question encoding (concatenate layer1)
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        if config.ctx_skip_connections: #default yes
            cenc_dim = 2*sum(config.ctx_lstm_size) #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP           activation: Tanh, identity
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') #Wum
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') # Wym
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
        attended.name = 'attended'

        print("attended shape: %d" %attended.shape)

        dimension = qenc_dim + cenc_dim
        transition = SimpleRecurrent(activation=Tanh(),dim=dimension, name="transition")

        readout = Readout(
            readout_dim=vocab_size,
            source_names=[transition.apply.states[0]],
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(vocab_size, dimension),
            name="readout")

        generator = SequenceGenerator(
            readout=readout, transition=transition,
            name="generator")

        self.generator = generator
        bricks += [generator]


        cost = self.generator.cost()




        # Now we can calculate our output
        out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                      activations=config.out_mlp_activations + [Identity()],
                      name='out_mlp')
        bricks += [out_mlp]
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :],
                                 tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
        probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Calculate prediction, cost and error rate
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Apply dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Beispiel #36
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    args = parser.parse_args()

    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8

    transition = GatedRecurrent(name="transition", activation=Tanh(), dim=dim)
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_states, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_states, feedback_dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" +
                 pprint.pformat(
                     [(key, value.get_value().shape) for key, value
                      in Selector(generator).get_params().items()],
                     width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()
        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state, None)
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator,  args.prefix + "model.npz")

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        assert False
Beispiel #37
0
source_names=['states']
readout = Readout(
    readout_dim = hidden_size_recurrent,
    source_names =source_names,
    emitter=emitter,
    feedback_brick = feedback,
    name="readout")

attention = SimpleSequenceAttention(
              state_names = source_names,
              state_dims = [hidden_size_recurrent],
              attended_dim = context_size)

generator = SequenceGenerator(readout=readout, 
                              transition=transition,
                              attention = attention,
                              name = "generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.initialize()

mlp_context.weights_init = IsotropicGaussian(0.01)
mlp_context.biases_init = Constant(0.)
mlp_context.initialize()

#ipdb.set_trace()
cost_matrix = generator.cost_matrix(x, x_mask,
        attended = mlp_context.apply(context))
cost = cost_matrix.sum()/x_mask.sum()
cost.name = "sequence_log_likelihood"
Beispiel #38
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of language modeling with RNN",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "state", nargs="?", default="",
        help="Changes to Groundhog state")
    parser.add_argument("--path", help="Path to a language dataset")
    parser.add_argument("--dict", help="Path to the dataset dictionary")
    parser.add_argument("--restart", help="Start anew")
    parser.add_argument(
        "--reset", action="store_true", default=False,
        help="Reset the hidden state between batches")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot for the 'sample' mode"
             " OR training sequence length for the 'train' mode.")
    args = parser.parse_args()
    logger.debug("Args:\n" + str(args))

    dim = 200
    num_chars = 50

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=dim,
        weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_chars, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_chars, dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" +
                 pprint.pformat(
                     [(key, value.get_value().shape) for key, value
                      in Selector(generator).get_params().items()],
                     width=120))

    if args.mode == "train":
        batch_size = 1
        seq_len = args.steps

        generator.initialize()

        # Build cost computation graph that uses the saved hidden states.
        # An issue: for Groundhog this is completely transparent, that's
        # why it does not carry the hidden state over the period when
        # validation in done. We should find a way to fix in the future.
        x = tensor.lmatrix('x')
        init_states = shared_floatx_zeros((batch_size, dim),
                                          name='init_states')
        reset = tensor.scalar('reset')
        cost = ComputationGraph(
            generator.cost(x, states=init_states * reset).sum())
        # TODO: better search routine
        states = [v for v in cost.variables
                  if hasattr(v.tag, 'application_call')
                  and v.tag.application_call.brick == generator.transition
                  and (v.tag.application_call.application ==
                       generator.transition.apply)
                  and v.tag.role == VariableRole.OUTPUT
                  and v.tag.name == 'states']
        assert len(states) == 1
        states = states[0]

        gh_model = GroundhogModel(generator, cost)
        gh_model.properties.append(
            ('bpc', cost.outputs[0] * numpy.log(2) / seq_len))
        gh_model.properties.append(('mean_init_state', init_states.mean()))
        gh_model.properties.append(('reset', reset))
        if not args.reset:
            gh_model.updates.append((init_states, states[-1]))

        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        changes = eval("dict({})".format(args.state))
        state.update(changes)

        def output_format(x, y, reset):
            return dict(x=x[:, None], reset=reset)
        train, valid, test = [
            LMIterator(batch_size=batch_size,
                       use_infinite_loop=mode == 'train',
                       path=args.path,
                       seq_len=seq_len,
                       mode=mode,
                       chunks='chars',
                       output_format=output_format,
                       can_fit=True)
            for mode in ['train', 'valid', 'test']]

        trainer = SGD(gh_model, state, train)
        state['on_nan'] = 'warn'
        state['cutoff'] = 1.

        main_loop = MainLoop(train, valid, None, gh_model,
                             trainer, state, None)
        if not args.restart:
            main_loop.load()
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator,  args.prefix + "model.npz")

        chars = numpy.load(args.dict)['unique_chars']

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=10, iterate=True)).function()

        states, outputs, costs = sample()

        for i in range(10):
            print("Generation cost: {}".format(costs[:, i].sum()))
            print("".join([chars[o] for o in outputs[:, i]]))
    else:
        assert False
Beispiel #39
0
from play.bricks.recurrent import SimpleSequenceAttention

attention = SimpleSequenceAttention(
    state_names=source_names, state_dims=[hidden_size_recurrent], attended_dim=context_size, name="attention"
)

readout = Readout(
    readout_dim=hidden_size_recurrent,
    source_names=source_names + ["feedback"] + ["glimpses"],
    emitter=emitter,
    feedback_brick=feedback,
    name="readout",
)

generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.0)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()

generator.initialize()

lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# states = {}
def test_with_attention():
    """Test a sequence generator with continuous outputs and attention."""
    rng = numpy.random.RandomState(1234)

    inp_dim = 2
    inp_len = 10
    attended_dim = 3
    attended_len = 11
    batch_size = 4
    n_steps = 30

    # For values
    def rand(size):
        return rng.uniform(size=size).astype(floatX)

    # For masks
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=floatX)
        # To make it look like read data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    output_vals = rand((inp_len, batch_size, inp_dim))
    output_mask_vals = generate_mask(inp_len, batch_size)
    attended_vals = rand((attended_len, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_len, batch_size)

    transition = TestTransition(
        dim=inp_dim, attended_dim=attended_dim, activation=Identity())
    attention = SequenceContentAttention(
        state_names=transition.apply.states, match_dim=inp_dim)
    generator = SequenceGenerator(
        Readout(
            readout_dim=inp_dim,
            source_names=[transition.apply.states[0],
                          attention.take_glimpses.outputs[0]],
            emitter=TestEmitter()),
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        add_contexts=False, seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    outputs = tensor.tensor3('outputs')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(outputs, mask,
                                  attended=attended,
                                  attended_mask=attended_mask)
    costs_vals = costs.eval({outputs: output_vals,
                             mask: output_mask_vals,
                             attended: attended_vals,
                             attended_mask: attended_mask_vals})
    assert costs_vals.shape == (inp_len, batch_size)
    assert_allclose(costs_vals.sum(), 13.5042, rtol=1e-5)

    # Test `generate` method
    results = (
        generator.generate(n_steps=n_steps, batch_size=attended.shape[1],
                           attended=attended, attended_mask=attended_mask))
    assert len(results) == 5
    states_vals, outputs_vals, glimpses_vals, weights_vals, costs_vals = (
        theano.function([attended, attended_mask], results)
        (attended_vals, attended_mask_vals))
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
    assert_allclose(states_vals.sum(), 23.4172, rtol=1e-5)
    # There is no generation cost in this case, since generation is
    # deterministic
    assert_allclose(costs_vals.sum(), 0.0, rtol=1e-5)
    assert_allclose(weights_vals.sum(), 120.0, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 199.2402, rtol=1e-5)
    assert_allclose(outputs_vals.sum(), -11.6008, rtol=1e-5)
Beispiel #41
0
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'], outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {key: value.dimshuffle('x', 0, 1)
                    if key not in self.transition.apply.states else value
                    for key, value
                    in self.fork.apply(representation, as_dict=True).items()}
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim,
                                              feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test generate
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph(states + outputs + costs)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # Test masks agnostic results of cost
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
lstm1 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal())
lstm2 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal())

rnn = RecurrentStack([lstm1, lstm2], name="transition")

readout = Readout(readout_dim=alphabet_size,
                  source_names=["states"],
                  emitter=SoftmaxEmitter(name="emitter"),
                  feedback_brick=LookupFeedback(alphabet_size,
                                                feedback_dim=alphabet_size,
                                                name="feedback"),
                  name="readout")

seq_gen = SequenceGenerator(readout=readout,
                            transition=rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0),
                            name="generator")

seq_gen.push_initialization_config()
rnn.weights_init = Orthogonal()
seq_gen.initialize()

# z markov_tutorial
x = tensor.lvector('features')
x = x.reshape((x.shape[0], 1))
cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1])
cost.name = "negative log-likelihood"
cost_cg = ComputationGraph(cost)

print VariableFilter(roles=[WEIGHT])(cost_cg.variables)
Beispiel #44
0
def test_attention_transition():
    inp_dim = 2
    inp_len = 10
    attended_dim = 3
    attended_len = 11
    batch_size = 4
    n_steps = 30

    transition = TestTransition(dim=inp_dim,
                                attended_dim=attended_dim,
                                name="transition")
    attention = SequenceContentAttention(transition.apply.states,
                                         match_dim=inp_dim,
                                         name="attention")
    mixer = Mixer(
        [name for name in transition.apply.sequences if name != 'mask'],
        attention.take_look.outputs[0],
        name="mixer")
    att_trans = AttentionTransition(transition,
                                    attention,
                                    mixer,
                                    name="att_trans")
    att_trans.weights_init = IsotropicGaussian(0.01)
    att_trans.biases_init = Constant(0)
    att_trans.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    states, glimpses, weights = att_trans.apply(input_=inputs,
                                                mask=inputs_mask,
                                                attended=attended,
                                                attended_mask=attended_mask)
    assert states.ndim == 3
    assert glimpses.ndim == 3
    assert weights.ndim == 3

    input_vals = numpy.zeros((inp_len, batch_size, inp_dim), dtype=floatX)
    input_mask_vals = numpy.ones((inp_len, batch_size), dtype=floatX)
    attended_vals = numpy.zeros((attended_len, batch_size, attended_dim),
                                dtype=floatX)
    attended_mask_vals = numpy.ones((attended_len, batch_size), dtype=floatX)

    func = theano.function([inputs, inputs_mask, attended, attended_mask],
                           [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(input_vals, input_mask_vals,
                                                   attended_vals,
                                                   attended_mask_vals)

    assert states_vals.shape == input_vals.shape
    assert glimpses_vals.shape == (inp_len, batch_size, attended_dim)
    assert weight_vals.shape == (inp_len, batch_size, attended_len)

    # Test SequenceGenerator using AttentionTransition
    generator = SequenceGenerator(LinearReadout(
        readout_dim=inp_dim,
        source_names=["state"],
        emitter=TestEmitter(name="emitter"),
        name="readout"),
                                  transition=transition,
                                  attention=attention,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")

    outputs = tensor.tensor3('outputs')
    costs = generator.cost(outputs,
                           attended=attended,
                           attended_mask=attended_mask)
    costs_vals = costs.eval({
        outputs: input_vals,
        attended: attended_vals,
        attended_mask: attended_mask_vals
    })
    assert costs_vals.shape == (inp_len, batch_size)

    results = (generator.generate(n_steps=n_steps,
                                  batch_size=attended.shape[1],
                                  attended=attended,
                                  attended_mask=attended_mask))
    assert len(results) == 5
    states_vals, outputs_vals, glimpses_vals, weights_vals, costs_vals = (
        theano.function([attended, attended_mask],
                        results)(attended_vals, attended_mask_vals))
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
Beispiel #45
0
class PyramidLayer(Initializable):
    """Basic unit for the pyramid model.

    """
    def __init__(self,
				 batch_size,
				 frame_size,
				 k,
				 depth,
				 size,
				  **kwargs):
		super(PyramidLayer, self).__init__(**kwargs)

		target_size = frame_size * k

		depth_x = depth
		hidden_size_mlp_x = 32*size

		depth_transition = depth-1

		depth_theta = depth
		hidden_size_mlp_theta = 32*size
		hidden_size_recurrent = 32*size*3

		depth_context = depth
		hidden_size_mlp_context = 32*size
		context_size = 32*size

		activations_x = [Rectifier()]*depth_x

		dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \
		         [4*hidden_size_recurrent]

		activations_theta = [Rectifier()]*depth_theta

		dims_theta = [hidden_size_recurrent] + \
		             [hidden_size_mlp_theta]*depth_theta

		activations_context = [Rectifier()]*depth_context

		dims_context = [frame_size] + [hidden_size_mlp_context]*(depth_context-1) + \
		         [context_size]

		mlp_x = MLP(activations = activations_x,
		            dims = dims_x,
		            name = "mlp_x")

		feedback = DeepTransitionFeedback(mlp = mlp_x)

		transition = [GatedRecurrent(dim=hidden_size_recurrent, 
		                   use_bias = True,
		                   name = "gru_{}".format(i) ) for i in range(depth_transition)]

		transition = RecurrentStack( transition,
		            name="transition", skip_connections = True)

		self.transition = transition

		mlp_theta = MLP( activations = activations_theta,
		             dims = dims_theta,
		             name = "mlp_theta")

		mlp_gmm = GMMMLP(mlp = mlp_theta,
		                  dim = target_size,
		                  k = k,
		                  const = 0.00001,
		                  name = "gmm_wrap")

		gmm_emitter = GMMEmitter(gmmmlp = mlp_gmm,
		  output_size = frame_size, k = k)

		source_names = [name for name in transition.apply.states if 'states' in name]

		attention = SimpleSequenceAttention(
		              state_names = source_names,
		              state_dims = [hidden_size_recurrent],
		              attended_dim = context_size,
		              name = "attention")

		#ipdb.set_trace()
		# Verify source names
		readout = Readout(
		    readout_dim = hidden_size_recurrent,
		    source_names =source_names + ['feedback'] + ['glimpses'],
		    emitter=gmm_emitter,
		    feedback_brick = feedback,
		    name="readout")

		self.generator = SequenceGenerator(readout=readout, 
		                              transition=transition,
		                              attention = attention,
		                              name = "generator")

		self.mlp_context = MLP(activations = activations_context,
		                  dims = dims_context)

		self.children = [self.generator, self.mlp_context]
		self.final_states = []
	

    def monitoring_vars(self, cg):

        readout = self.generator.readout
        readouts = VariableFilter( applications = [readout.readout],
            name_regex = "output")(cg.variables)[0]

        mu, sigma, coeff = readout.emitter.components(readouts)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma,
            min_mu, max_mu, mean_mu, max_sigma]

        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        cost_matrix = self.generator.cost_matrix(
                x, attended=self.mlp_context.apply(context),
                **kwargs)

        return cost_matrix.mean()

    @application
    def generate(context):
        return self.generator.generate(
          attended = self.mlp_context.apply(context),
          n_steps = context.shape[0],
          batch_size = context.shape[1],
          iterate = True)
Beispiel #46
0
mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001)

emitter = GMMEmitter(gmmmlp=mlp_gmm,
                     output_size=frame_size,
                     k=k,
                     name="emitter")

source_names = [name for name in transition.apply.states if 'states' in name]
readout = Readout(readout_dim=hidden_size_recurrent,
                  source_names=source_names,
                  emitter=emitter,
                  feedback_brick=feedback,
                  name="readout")

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()

generator.initialize()

states = {}
states = generator.transition.apply.outputs

states = {
Beispiel #47
0
    def __init__(self,
				 batch_size,
				 frame_size,
				 k,
				 depth,
				 size,
				  **kwargs):
		super(PyramidLayer, self).__init__(**kwargs)

		target_size = frame_size * k

		depth_x = depth
		hidden_size_mlp_x = 32*size

		depth_transition = depth-1

		depth_theta = depth
		hidden_size_mlp_theta = 32*size
		hidden_size_recurrent = 32*size*3

		depth_context = depth
		hidden_size_mlp_context = 32*size
		context_size = 32*size

		activations_x = [Rectifier()]*depth_x

		dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \
		         [4*hidden_size_recurrent]

		activations_theta = [Rectifier()]*depth_theta

		dims_theta = [hidden_size_recurrent] + \
		             [hidden_size_mlp_theta]*depth_theta

		activations_context = [Rectifier()]*depth_context

		dims_context = [frame_size] + [hidden_size_mlp_context]*(depth_context-1) + \
		         [context_size]

		mlp_x = MLP(activations = activations_x,
		            dims = dims_x,
		            name = "mlp_x")

		feedback = DeepTransitionFeedback(mlp = mlp_x)

		transition = [GatedRecurrent(dim=hidden_size_recurrent, 
		                   use_bias = True,
		                   name = "gru_{}".format(i) ) for i in range(depth_transition)]

		transition = RecurrentStack( transition,
		            name="transition", skip_connections = True)

		self.transition = transition

		mlp_theta = MLP( activations = activations_theta,
		             dims = dims_theta,
		             name = "mlp_theta")

		mlp_gmm = GMMMLP(mlp = mlp_theta,
		                  dim = target_size,
		                  k = k,
		                  const = 0.00001,
		                  name = "gmm_wrap")

		gmm_emitter = GMMEmitter(gmmmlp = mlp_gmm,
		  output_size = frame_size, k = k)

		source_names = [name for name in transition.apply.states if 'states' in name]

		attention = SimpleSequenceAttention(
		              state_names = source_names,
		              state_dims = [hidden_size_recurrent],
		              attended_dim = context_size,
		              name = "attention")

		#ipdb.set_trace()
		# Verify source names
		readout = Readout(
		    readout_dim = hidden_size_recurrent,
		    source_names =source_names + ['feedback'] + ['glimpses'],
		    emitter=gmm_emitter,
		    feedback_brick = feedback,
		    name="readout")

		self.generator = SequenceGenerator(readout=readout, 
		                              transition=transition,
		                              attention = attention,
		                              name = "generator")

		self.mlp_context = MLP(activations = activations_context,
		                  dims = dims_context)

		self.children = [self.generator, self.mlp_context]
		self.final_states = []
Beispiel #48
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 att_dim,
                 maxout_dim,
                 representation_dim,
                 attention_strategy='content',
                 attention_sources='s',
                 readout_sources='sfa',
                 memory='none',
                 memory_size=500,
                 seq_len=50,
                 init_strategy='last',
                 theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick without embedding.
        
        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            att_dim (int): Size of attention match vector
            maxout_dim (int): Size of maxout layer
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         cf.  ``_initialize_attention``
            attention_sources (string): Defines the sources used by the 
                                        attention model 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the 
                                      readout network. 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf.  ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf.  ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(NoLookupDecoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        att_dim = att_dim if att_dim > 0 else state_dim
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition, representation_dim,
            att_dim, attention_sources, readout_sources, memory, memory_size)

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
        readout = Readout(
            source_names=src_names,
            readout_dim=embedding_dim,
            emitter=NoLookupEmitter(initial_output=-1,
                                    readout_dim=embedding_dim,
                                    cost_brick=SquaredError()),
            #                        cost_brick=CategoricalCrossEntropy()),
            feedback_brick=TrivialFeedback(output_dim=embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=maxout_dim, name='maxout_bias').apply,
                Maxout(num_pieces=2, name='maxout').apply,
                Linear(input_dim=maxout_dim / 2,
                       output_dim=embedding_dim,
                       use_bias=False,
                       name='softmax0').apply,
                Logistic(name='softmax1').apply
            ]),
            merged_dim=maxout_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Beispiel #49
0
emitter = GMMEmitter(gmmmlp = mlp_gmm,
                     output_size = frame_size,
                     k = k,
                     name = "emitter")

source_names = [name for name in transition.apply.states if 'states' in name]
readout = Readout(
    readout_dim = hidden_size_recurrent,
    source_names =source_names,
    emitter=emitter,
    feedback_brick = feedback,
    name="readout")

generator = SequenceGenerator(readout=readout, 
                              transition=transition,
                              name = "generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01,1)
generator.transition.push_initialization_config()

generator.initialize()

cost_matrix = generator.cost_matrix(x)
cost = cost_matrix.mean()
cost.name = "sequence_log_likelihood"
Beispiel #50
0
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99],
                                char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([
            name
            for name in encoder.prototype.apply.sequences if name != 'mask'
        ],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension,
                             dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension,
            name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(VariableFilter(application=readout.readout,
                                         name="output")(cg.variables),
                          singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule([
                                        GradientClipping(10.0),
                                        SteepestDescent(0.01)
                                    ]))

        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(
                    observables, prefix="average", every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
Beispiel #51
0
mlp_theta = MLP( activations = activations_theta,
             dims = dims_theta)

emitter = SPF0Emitter(mlp = mlp_theta,
                      name = "emitter")

source_names = [name for name in transition.apply.states if 'states' in name]
readout = Readout(
    readout_dim = hidden_size_recurrent,
    source_names =source_names,
    emitter=emitter,
    #feedback_brick = feedback,
    name="readout")

generator = SequenceGenerator(readout=readout, 
                              transition=transition,
                              name = "generator")

#######################################

steps = 2048
n_samples = 10

from blocks.utils import shared_floatx_zeros, shared_floatx

states = generator.transition.apply.outputs
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states}

from theano import tensor, function
f0 = tensor.matrix('f0')
voiced = tensor.matrix('voiced')
Beispiel #52
0
                     k=k,
                     name="emitter")

source_names = ['states']
readout = Readout(readout_dim=hidden_size_recurrent,
                  source_names=source_names,
                  emitter=emitter,
                  feedback_brick=feedback,
                  name="readout")

attention = SimpleSequenceAttention(state_names=source_names,
                                    state_dims=[hidden_size_recurrent],
                                    attended_dim=context_size)

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              attention=attention,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.initialize()

mlp_context.weights_init = IsotropicGaussian(0.01)
mlp_context.biases_init = Constant(0.)
mlp_context.initialize()

#ipdb.set_trace()
cost_matrix = generator.cost_matrix(x,
                                    x_mask,
                                    attended=mlp_context.apply(context))
cost = cost_matrix.sum() / x_mask.sum()
Beispiel #53
0
def train():

    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len, hidden_size, name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout'
        )

        transition = GatedRecurrent(
            activation=Tanh(),
            dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                name='sequencegenerator')

        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(
                train_set.num_examples, batch_size=128))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)
            ])

    main.run()