Ejemplo n.º 1
0
    def __init__(self, dimension, alphabet_size, **kwargs):
        super(WordReverser, self).__init__(**kwargs)
        encoder = Bidirectional(
            SimpleRecurrent(dim=dimension, activation=Tanh()))
        fork = Fork([name for name in encoder.prototype.apply.sequences
                    if name != 'mask'])
        fork.input_dim = dimension
        fork.output_dims = [dimension for name in fork.input_names]
        lookup = LookupTable(alphabet_size, dimension)
        transition = SimpleRecurrent(
            activation=Tanh(),
            dim=dimension, name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dimension, match_dim=dimension, name="attention")
        readout = Readout(
            readout_dim=alphabet_size,
            source_names=[transition.apply.states[0],
                          attention.take_glimpses.outputs[0]],
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(alphabet_size, dimension),
            name="readout")
        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            name="generator")

        self.lookup = lookup
        self.fork = fork
        self.encoder = encoder
        self.generator = generator
        self.children = [lookup, fork, encoder, generator]
Ejemplo n.º 2
0
	def __init__(self, dimen, vocab_size): #{
		# No idea what this is doing, but otherwise "allocated" is not set
		super(MorphGen, self).__init__(self)

		# The encoder 
		encoder = Bidirectional(SimpleRecurrent(dim=dimen, activation=Tanh()))

		# What is this doing ? 
		fork = Fork([name for name in encoder.prototype.apply.sequences if name != 'mask'])
		fork.input_dim = dimen
		fork.output_dims = [encoder.prototype.get_dim(name) for name in fork.input_names]

		lookup = LookupTable(vocab_size, dimen)

		transition = SimpleRecurrent(dim=dimen, activation=Tanh(), name="transition")

		atten = SequenceContentAttention(state_names=transition.apply.states,attended_dim=2*dimen, match_dim=dimen, name="attention")

		readout = Readout(
			readout_dim=vocab_size,
			source_names=[transition.apply.states[0],
			atten.take_glimpses.outputs[0]],
			emitter=SoftmaxEmitter(name="emitter"),
			feedback_brick=LookupFeedback(vocab_size, dimen),
			name="readout");

		generator = SequenceGenerator(readout=readout, transition=transition, attention=atten,name="generator")
	
		self.lookup = lookup
		self.fork = fork
		self.encoder = encoder
		self.generator = generator
		self.children = [lookup, fork, encoder, generator]
    def __init__(self, path, nn_char_map, no_transition_cost=1e12, **kwargs):
        # Since we currently support only type, it is ignored.
        # if type_ != 'fst':
        #    raise ValueError("Supports only FST's so far.")
        fst = FST(path)
        fst_char_map = dict(fst.fst.isyms.items())
        del fst_char_map['<eps>']
        if not len(fst_char_map) == len(nn_char_map):
            raise ValueError()
        remap_table = {
            nn_char_map[character]: fst_code
            for character, fst_code in fst_char_map.items()
        }
        transition = FSTTransition(fst, remap_table, no_transition_cost)

        # This SequenceGenerator will be used only in a very limited way.
        # That's why it is sufficient to equip it with a completely
        # fake readout.
        dummy_readout = Readout(source_names=['add'],
                                readout_dim=len(remap_table),
                                merge=Merge(input_names=['costs'],
                                            prototype=Identity()),
                                post_merge=Identity(),
                                emitter=SoftmaxEmitter())
        super(LanguageModel,
              self).__init__(transition=transition,
                             fork=Fork(output_names=[
                                 name for name in transition.apply.sequences
                                 if name != 'mask'
                             ],
                                       prototype=Identity()),
                             readout=dummy_readout,
                             **kwargs)
Ejemplo n.º 4
0
    def __init__(self, dimension, alphabet_size, **kwargs):
        super(SimpleGenerator, self).__init__(**kwargs)
        lookup = LookupTable(alphabet_size, dimension)
        transition = SimpleRecurrent(activation=Tanh(),
                                     dim=dimension,
                                     name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dimension,
            match_dim=dimension,
            name="attention")
        readout = Readout(readout_dim=alphabet_size,
                          source_names=[
                              transition.apply.states[0],
                              attention.take_glimpses.outputs[0]
                          ],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedback_brick=LookupFeedback(
                              alphabet_size, dimension),
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        self.lookup = lookup
        self.generator = generator
        self.children = [lookup, generator]
Ejemplo n.º 5
0
def getRnnGenerator(vocab_size,hidden_dim,input_dim=512):
    """
    "Apply" the RNN to the input x
    For initializing the network, the vocab size needs to be known
    Default of the hidden layer is set tot 512 like Karpathy
    """
    generator = SequenceGenerator(
        Readout(readout_dim = vocab_size,
                source_names = ["states"], # transition.apply.states ???
                emitter = SoftmaxEmitter(name="emitter"),
                feedback_brick = LookupFeedback(
                    vocab_size,
                    input_dim,
                    name = 'feedback'
                ),
                name = "readout"
        ),
        MySimpleRecurrent(
            name = "transition",
            activation = Tanh(),
            dim = hidden_dim
        ),
        weights_init = IsotropicGaussian(0.01),
        biases_init  = Constant(0),
        name = "generator"
    )
    generator.push_initialization_config()
    generator.transition.weights_init = IsotropicGaussian(0.01)
    generator.initialize()
    
    return generator
Ejemplo n.º 6
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim,topical_dim,theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        #self.topical_dim=topical_dim;

        # Initialize gru with special initial state
        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')


        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        self.topical_attention=SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=topical_dim,
            match_dim=state_dim, name="topical_attention")#not sure whether the match dim would be correct.


        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=['states', 'feedback',
                          self.attention.take_glimpses.outputs[0]],#check!
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            topical_attention=self.topical_attention,
            topical_name='topical_embeddingq',
            content_name='content_embedding',
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'], prototype=Linear())
        )

        self.children = [self.sequence_generator]
Ejemplo n.º 7
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(attended_dim=state_dim,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim,
                          merge_prototype=Linear(use_bias=True))

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Ejemplo n.º 8
0
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition",
                                activation=Tanh(),
                                dim=dim,
                                weights_init=Orthogonal())
    generator = SequenceGenerator(LinearReadout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(readout_dim, feedback_dim),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask],
                                [costs])(numpy.zeros((n_steps, batch_size),
                                                     dtype='int64'),
                                         numpy.ones((n_steps, batch_size),
                                                    dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(iterate=True,
                                                batch_size=batch_size,
                                                n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
Ejemplo n.º 9
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=1000).apply,
                Maxout(num_pieces=2).apply,
                Linear(input_dim=state_dim / 2, output_dim=100,
                       use_bias=False).apply,
                Linear(input_dim=100).apply
            ]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(),
                                                    dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([
            name for name in self.transition.apply.contexts +
            self.transition.apply.states if name != 'readout_context'
        ],
                         prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]
Ejemplo n.º 10
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 dgru_state_dim,
                 igru_state_dim,
                 state_dim,
                 representation_dim,
                 transition_depth,
                 trg_igru_depth,
                 trg_dgru_depth,
                 trg_space_idx,
                 trg_bos,
                 theano_seed=None,
                 **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.igru_state_dim = igru_state_dim
        self.state_dim = state_dim
        self.trg_space_idx = trg_space_idx
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = RecurrentStack([
            GRUInitialState(attended_dim=state_dim,
                            dim=state_dim,
                            activation=Tanh(),
                            name='decoder_gru_withinit')
        ] + [
            GatedRecurrent(
                dim=state_dim, activation=Tanh(), name='decoder_gru' + str(i))
            for i in range(1, transition_depth)
        ],
                                         skip_connections=False)

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        self.interpolator = Interpolator(
            vocab_size=vocab_size,
            embedding_dim=embedding_dim,
            igru_state_dim=igru_state_dim,
            igru_depth=trg_igru_depth,
            trg_dgru_depth=trg_dgru_depth,
            source_names=[
                'states', 'feedback', self.attention.take_glimpses.outputs[0]
            ],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=trg_bos,
                                   theano_seed=theano_seed),
            feedback_brick=TargetWordEncoder(vocab_size, embedding_dim,
                                             self.dgru_state_dim,
                                             trg_dgru_depth))

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGeneratorDCNMT(
            trg_space_idx=self.trg_space_idx,
            readout=self.interpolator,
            transition=self.transition,
            attention=self.attention,
            transition_depth=transition_depth,
            igru_depth=trg_igru_depth,
            trg_dgru_depth=trg_dgru_depth,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))
        self.children = [self.sequence_generator]
Ejemplo n.º 11
0
    def __init__(self,
                 vocab_size,
                 topicWord_size,
                 embedding_dim,
                 state_dim,
                 topical_dim,
                 representation_dim,
                 match_function='SumMacthFunction',
                 use_doubly_stochastic=False,
                 lambda_ds=0.001,
                 use_local_attention=False,
                 window_size=10,
                 use_step_decay_cost=False,
                 use_concentration_cost=False,
                 lambda_ct=10,
                 use_stablilizer=False,
                 lambda_st=50,
                 theano_seed=None,
                 **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.topicWord_size = topicWord_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRU(attended_dim=state_dim,
                              dim=state_dim,
                              activation=Tanh(),
                              name='decoder')

        self.energy_computer = globals()[match_function](name='energy_comp')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            energy_computer=self.energy_computer,
            use_local_attention=use_local_attention,
            window_size=window_size,
            name="attention")

        self.topical_attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=topical_dim,
            match_dim=state_dim,
            energy_computer=self.energy_computer,
            use_local_attention=use_local_attention,
            window_size=window_size,
            name="topical_attention"
        )  #not sure whether the match dim would be correct.

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1,
                                                 theano_seed=theano_seed),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim,
                          name='readout')

        # calculate the readout of topic word,
        # no specific feedback brick, use the trival feedback break
        # no post_merge and merge, use Bias and Linear
        topicWordReadout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                                   readout_dim=self.topicWord_size,
                                   emitter=SoftmaxEmitter(
                                       initial_output=-1,
                                       theano_seed=theano_seed),
                                   name='twReadout')

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            topicWordReadout=topicWordReadout,
            topic_vector_names=['topicSumVector'],
            transition=self.transition,
            attention=self.attention,
            topical_attention=self.topical_attention,
            q_dim=self.state_dim,
            #q_name='topic_embedding',
            topical_name='topic_embedding',
            content_name='content_embedding',
            use_step_decay_cost=use_step_decay_cost,
            use_doubly_stochastic=use_doubly_stochastic,
            lambda_ds=lambda_ds,
            use_concentration_cost=use_concentration_cost,
            lambda_ct=lambda_ct,
            use_stablilizer=use_stablilizer,
            lambda_st=lambda_st,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Ejemplo n.º 12
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of language modeling with RNN",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode",
        choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
        " and `sample` to sample a sequence generated by an"
        " existing one.")
    parser.add_argument("prefix",
                        default="sine",
                        help="The prefix for model, timing and state files")
    parser.add_argument("state",
                        nargs="?",
                        default="",
                        help="Changes to Groundhog state")
    parser.add_argument("--path", help="Path to a language dataset")
    parser.add_argument("--dict", help="Path to the dataset dictionary")
    parser.add_argument("--restart", help="Start anew")
    parser.add_argument("--reset",
                        action="store_true",
                        default=False,
                        help="Reset the hidden state between batches")
    parser.add_argument("--steps",
                        type=int,
                        default=100,
                        help="Number of steps to plot for the 'sample' mode"
                        " OR training sequence length for the 'train' mode.")
    args = parser.parse_args()
    logger.debug("Args:\n" + str(args))

    dim = 200
    num_chars = 50

    transition = GatedRecurrent(name="transition",
                                activation=Tanh(),
                                dim=dim,
                                weights_init=Orthogonal())
    generator = SequenceGenerator(LinearReadout(
        readout_dim=num_chars,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(num_chars, dim, name='feedback'),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape)
         for key, value in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        batch_size = 1
        seq_len = args.steps

        generator.initialize()

        # Build cost computation graph that uses the saved hidden states.
        # An issue: for Groundhog this is completely transparent, that's
        # why it does not carry the hidden state over the period when
        # validation in done. We should find a way to fix in the future.
        x = tensor.lmatrix('x')
        init_states = shared_floatx_zeros((batch_size, dim),
                                          name='init_states')
        reset = tensor.scalar('reset')
        cost = ComputationGraph(
            generator.cost(x, states=init_states * reset).sum())
        # TODO: better search routine
        states = [
            v for v in cost.variables if hasattr(v.tag, 'application_call')
            and v.tag.application_call.brick == generator.transition and
            (v.tag.application_call.application == generator.transition.apply)
            and v.tag.role == VariableRole.OUTPUT and v.tag.name == 'states'
        ]
        assert len(states) == 1
        states = states[0]

        gh_model = GroundhogModel(generator, cost)
        gh_model.properties.append(
            ('bpc', cost.outputs[0] * numpy.log(2) / seq_len))
        gh_model.properties.append(('mean_init_state', init_states.mean()))
        gh_model.properties.append(('reset', reset))
        if not args.reset:
            gh_model.updates.append((init_states, states[-1]))

        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        changes = eval("dict({})".format(args.state))
        state.update(changes)

        def output_format(x, y, reset):
            return dict(x=x[:, None], reset=reset)

        train, valid, test = [
            LMIterator(batch_size=batch_size,
                       use_infinite_loop=mode == 'train',
                       path=args.path,
                       seq_len=seq_len,
                       mode=mode,
                       chunks='chars',
                       output_format=output_format,
                       can_fit=True) for mode in ['train', 'valid', 'test']
        ]

        trainer = SGD(gh_model, state, train)
        state['on_nan'] = 'warn'
        state['cutoff'] = 1.

        main_loop = MainLoop(train, valid, None, gh_model, trainer, state,
                             None)
        if not args.restart:
            main_loop.load()
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        chars = numpy.load(args.dict)['unique_chars']

        sample = ComputationGraph(
            generator.generate(n_steps=args.steps, batch_size=10,
                               iterate=True)).function()

        states, outputs, costs = sample()

        for i in range(10):
            print("Generation cost: {}".format(costs[:, i].sum()))
            print("".join([chars[o] for o in outputs[:, i]]))
    else:
        assert False
Ejemplo n.º 13
0
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99],
                                char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([
            name
            for name in encoder.prototype.apply.sequences if name != 'mask'
        ],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension,
                             dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension,
            name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(VariableFilter(application=readout.readout,
                                         name="output")(cg.variables),
                          singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule([
                                        GradientClipping(10.0),
                                        SteepestDescent(0.01)
                                    ]))

        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(
                    observables, prefix="average", every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
Ejemplo n.º 14
0
                      eos_token=None,
                      unk_token='<UNK>',
                      level='character')

alphabet_size = 4

lstm_dim = 2

lstm1 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal())
lstm2 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal())

rnn = RecurrentStack([lstm1, lstm2], name="transition")

readout = Readout(readout_dim=alphabet_size,
                  source_names=["states"],
                  emitter=SoftmaxEmitter(name="emitter"),
                  feedback_brick=LookupFeedback(alphabet_size,
                                                feedback_dim=alphabet_size,
                                                name="feedback"),
                  name="readout")

seq_gen = SequenceGenerator(readout=readout,
                            transition=rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0),
                            name="generator")

seq_gen.push_initialization_config()
rnn.weights_init = Orthogonal()
seq_gen.initialize()
Ejemplo n.º 15
0
 def __init__(self, initial_output='\n'):
     self.theano_rng = MRG_RandomStreams(seed=random.randint(0,100000))
     SoftmaxEmitter.__init__(self, initial_output=initial_output)
Ejemplo n.º 16
0
    def __init__(
            self,
            input_dims,
            input_num_chars,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            lm=None,
            character_map=None,
            bidir=True,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=1,
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_dims=input_dims,
                              input_num_chars=input_num_chars,
                              name='bottom',
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample,
                          bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        generators = [None, None]
        for i in range(2):
            # The top part, on top of BiRNN but before the attention
            if dims_top:
                top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                          name="top{}".format(i))
            else:
                top = Identity(name='top{}'.format(i))

            if dec_stack == 1:
                transition = self.dec_transition(dim=dim_dec,
                                                 activation=Tanh(),
                                                 name="transition{}".format(i))
            else:
                transitions = [
                    self.dec_transition(dim=dim_dec,
                                        activation=Tanh(),
                                        name="transition_{}_{}".format(
                                            i, trans_level))
                    for trans_level in xrange(dec_stack)
                ]
                transition = RecurrentStack(transitions=transitions,
                                            skip_connections=True)
            # Choose attention mechanism according to the configuration
            if attention_type == "content":
                attention = SequenceContentAttention(
                    state_names=transition.apply.states,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    name="cont_att" + i)
            elif attention_type == "content_and_conv":
                attention = SequenceContentAndConvAttention(
                    state_names=transition.apply.states,
                    conv_n=conv_n,
                    conv_num_filters=conv_num_filters,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    prior=prior,
                    energy_normalizer=energy_normalizer,
                    name="conv_att{}".format(i))
            else:
                raise ValueError(
                    "Unknown attention type {}".format(attention_type))
            if embed_outputs:
                feedback = LookupFeedback(
                    num_phonemes + 1, dim_dec
                    if dim_output_embedding is None else dim_output_embedding)
            else:
                feedback = OneOfNFeedback(num_phonemes + 1)
            if criterion['name'] == 'log_likelihood':
                emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                         name="emitter{}".format(i))
                if lm:
                    # In case we use LM it is Readout that is responsible
                    # for normalization.
                    emitter = LMEmitter()
            elif criterion['name'].startswith('mse'):
                emitter = RewardRegressionEmitter(criterion['name'],
                                                  eos_label,
                                                  num_phonemes,
                                                  criterion.get(
                                                      'min_reward', -1.0),
                                                  name="emitter")
            else:
                raise ValueError("Unknown criterion {}".format(
                    criterion['name']))
            readout_config = dict(
                readout_dim=num_phonemes,
                source_names=(transition.apply.states if use_states_for_readout
                              else []) + [attention.take_glimpses.outputs[0]],
                emitter=emitter,
                feedback_brick=feedback,
                name="readout{}".format(i))
            if post_merge_dims:
                readout_config['merged_dim'] = post_merge_dims[0]
                readout_config['post_merge'] = InitializableSequence(
                    [
                        Bias(post_merge_dims[0]).apply,
                        post_merge_activation.apply,
                        MLP(
                            [post_merge_activation] *
                            (len(post_merge_dims) - 1) + [Identity()],
                            # MLP was designed to support Maxout is activation
                            # (because Maxout in a way is not one). However
                            # a single layer Maxout network works with the trick below.
                            # For deeper Maxout network one has to use the
                            # Sequence brick.
                            [
                                d //
                                getattr(post_merge_activation, 'num_pieces', 1)
                                for d in post_merge_dims
                            ] + [num_phonemes]).apply,
                    ],
                    name='post_merge{}'.format(i))
            readout = Readout(**readout_config)

            language_model = None
            if lm and lm.get('path'):
                lm_weight = lm.pop('weight', 0.0)
                normalize_am_weights = lm.pop('normalize_am_weights', True)
                normalize_lm_weights = lm.pop('normalize_lm_weights', False)
                normalize_tot_weights = lm.pop('normalize_tot_weights', False)
                am_beta = lm.pop('am_beta', 1.0)
                if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                    logger.warn(
                        "Beam search is prone to fail with no log-prob normalization"
                    )
                language_model = LanguageModel(nn_char_map=character_map, **lm)
                readout = ShallowFusionReadout(
                    lm_costs_name='lm_add',
                    lm_weight=lm_weight,
                    normalize_am_weights=normalize_am_weights,
                    normalize_lm_weights=normalize_lm_weights,
                    normalize_tot_weights=normalize_tot_weights,
                    am_beta=am_beta,
                    **readout_config)

            generators[i] = SequenceGenerator(readout=readout,
                                              transition=transition,
                                              attention=attention,
                                              language_model=language_model,
                                              name="generator{}".format(i))

        self.generator = generators[0]

        self.forward_to_backward = Linear(dim_dec, dim_dec)

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generators = generators
        self.children = [self.forward_to_backward, encoder, top, bottom
                         ] + generators

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')
Ejemplo n.º 17
0
def test_sequence_generator_with_lm():
    floatX = theano.config.floatX
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim,
                                activation=Tanh(),
                                weights_init=Orthogonal())
    language_model = SequenceGenerator(Readout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(theano_seed=1234),
        feedback_brick=LookupFeedback(readout_dim, dim, name='feedback')),
                                       SimpleRecurrent(dim, Tanh()),
                                       name='language_model')
    generator = SequenceGenerator(Readout(
        readout_dim=readout_dim,
        source_names=["states", "lm_states"],
        emitter=SoftmaxEmitter(theano_seed=1234),
        feedback_brick=LookupFeedback(readout_dim, feedback_dim)),
                                  transition,
                                  language_model=language_model,
                                  weights_init=IsotropicGaussian(0.1),
                                  biases_init=Constant(0),
                                  seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    y.tag.test_value = numpy.zeros((15, batch_size), dtype='int64')
    mask = tensor.matrix('mask')
    mask.tag.test_value = numpy.ones((15, batch_size))

    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 483.153, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], cost)(y_test, m_test)
    assert_allclose(cost_val, 16.105, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    cost_per_el = [
        el for el in var_filter(cg.variables) if el.name == aux_var_name
    ][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.61051, rtol=1e-5)

    # Test generate
    states, outputs, lm_states, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph([states, outputs, costs])
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs], updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -4.88367, rtol=1e-5)
    assert_allclose(costs_val.sum(), 486.681, rtol=1e-5)
    assert outputs_val.sum() == 627

    # Test masks agnostic results of cost
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]], [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
Ejemplo n.º 18
0
def main(config): 
	vocab_src, _ = text_to_dict([config['train_src'],
		config['dev_src'], config['test_src']])
	vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
		config['dev_tgt']])

	# Create Theano variables
	logger.info('Creating theano variables')
	source_sentence = tensor.lmatrix('source')
	source_sentence_mask = tensor.matrix('source_mask')
	target_sentence = tensor.lmatrix('target')
	target_sentence_mask = tensor.matrix('target_mask')
	source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
										[1, 4, 8, 4, 8, 4, 8],]
	source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
											[1, 0, 1, 0, 1, 0, 1],]
	target_sentence.tag.test_value = [[0,1,1,5],
										[2,0,1,0],]
	target_sentence_mask.tag.test_value = [[0,1,1,0],
											[1,1,1,0],]


	logger.info('Building RNN encoder-decoder')
	### Building Encoder 
	embedder = LookupTable(
		length=len(vocab_src), 
		dim=config['embed_src'], 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='embedder')
	transformer = Linear(
		config['embed_src'], 
		config['hidden_src']*4, 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='transformer')

	lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src'])
	encoder = Bidirectional(
		LSTM(
			dim=config['hidden_src'], 
			weights_init=IsotropicGaussian(0.01),
			biases_init=Constant(lstminit)),
		name='encoderBiLSTM'
		)
	encoder.prototype.weights_init = Orthogonal()
	
	### Building Decoder 
	lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt'])
	transition = LSTM2GO(
		attended_dim=config['hidden_tgt'], 
		dim=config['hidden_tgt'], 
		weights_init=IsotropicGaussian(0.01),
		biases_init=Constant(lstminit), 
		name='decoderLSTM')

	attention = SequenceContentAttention( 
		state_names=transition.apply.states, # default activation is Tanh
		state_dims=[config['hidden_tgt']],
		attended_dim=config['hidden_src']*2,
		match_dim=config['hidden_tgt'], 
		name="attention")

	readout = Readout(
		source_names=['states', 
			'feedback', 
			attention.take_glimpses.outputs[0]],
		readout_dim=len(vocab_tgt),
		emitter = SoftmaxEmitter(
			name='emitter'), 
		feedback_brick = LookupFeedback(
			num_outputs=len(vocab_tgt), 
			feedback_dim=config['embed_tgt'], 
			name='feedback'), 
		post_merge=InitializableFeedforwardSequence([
			Bias(dim=config['hidden_tgt'], 
				name='softmax_bias').apply,
			Linear(input_dim=config['hidden_tgt'], 
				output_dim=config['embed_tgt'],
				use_bias=False, 
				name='softmax0').apply,
			Linear(input_dim=config['embed_tgt'], 
				name='softmax1').apply]),
		merged_dim=config['hidden_tgt'])

	decoder = SequenceGenerator(
		readout=readout, 
		transition=transition, 
		attention=attention, 
		weights_init=IsotropicGaussian(0.01), 
		biases_init=Constant(0),
		name="generator",
		fork=Fork(
			[name for name in transition.apply.sequences if name != 'mask'], 
			prototype=Linear()),
		add_contexts=True)
	decoder.transition.weights_init = Orthogonal()

	#printchildren(encoder, 1)
	# Initialize model
	logger.info('Initializing model')
	embedder.initialize()
	transformer.initialize()
	encoder.initialize()
	decoder.initialize()
	
	# Apply model 
	embedded = embedder.apply(source_sentence)
	tansformed = transformer.apply(embedded)
	encoded = encoder.apply(tansformed)[0]
	generated = decoder.generate(
		n_steps=2*source_sentence.shape[1], 
		batch_size=source_sentence.shape[0], 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask=tensor.ones(source_sentence.shape).T
		)
	print 'Generated: ', generated
	# generator_generate_outputs
	#samples = generated[1] # For GRU 
	samples = generated[2] # For LSTM
	samples.name = 'samples'
	#samples_cost = generated[4] # For GRU 
	samples_cost = generated[5] # For LSTM
	samples_cost = 'sampling_cost'
	cost = decoder.cost(
		mask = target_sentence_mask.T, 
		outputs = target_sentence.T, 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask = source_sentence_mask.T)
	cost.name = 'target_cost'
	cost.tag.aggregation_scheme = TakeLast(cost)
	model = Model(cost)
	
	logger.info('Creating computational graph')
	cg = ComputationGraph(cost)
	
	# apply dropout for regularization
	if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog
		logger.info('Applying dropout')
		dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output']
		cg = apply_dropout(cg, dropout_inputs, config['dropout'])

	######## 
	# Print shapes
	shapes = [param.get_value().shape for param in cg.parameters]
	logger.info("Parameter shapes: ")
	for shape, count in Counter(shapes).most_common():
		logger.info('	{:15}: {}'.format(shape, count))
	logger.info("Total number of parameters: {}".format(len(shapes)))

	printchildren(embedder, 1)
	printchildren(transformer, 1)
	printchildren(encoder, 1)
	printchildren(decoder, 1)
	# Print parameter names
	# enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters())
	# enc_dec_param_dict = merge(Selector(decoder).get_parameters())
	# logger.info("Parameter names: ")
	# for name, value in enc_dec_param_dict.items():
	# 	logger.info('	{:15}: {}'.format(value.get_value().shape, name))
	# logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))
	##########

	# Training data 
	train_stream = get_train_stream(config, 
		[config['train_src'],], [config['train_tgt'],], 
		vocab_src, vocab_tgt)
	dev_stream = get_dev_stream(
		[config['dev_src'],], [config['dev_tgt'],], 
		vocab_src, vocab_tgt)
	test_stream = get_test_stream([config['test_src'],], vocab_src)

	# Set extensions
	logger.info("Initializing extensions")
	extensions = [
		FinishAfter(after_n_batches=config['finish_after']),
		ProgressBar(),
		TrainingDataMonitoring([cost], 
			prefix="tra", 
			after_batch=True),
		DataStreamMonitoring(variables=[cost], 
			data_stream=dev_stream, 
			prefix="dev", 
			after_batch=True), 
		Sampler(
			model=Model(samples), 
			data_stream=dev_stream,
			vocab=cabvo,
			saveto=config['saveto']+'dev',
			every_n_batches=config['save_freq']), 
		Sampler(
			model=Model(samples), 
			data_stream=test_stream,
			vocab=cabvo,
			saveto=config['saveto']+'test',
			after_n_batches=1, 
			on_resumption=True,
			before_training=True), 
		Plotter(saveto=config['saveto'], after_batch=True),
		Printing(after_batch=True),
		Checkpoint(
			path=config['saveto'], 
			parameters = cg.parameters,
			save_main_loop=False,
			every_n_batches=config['save_freq'])]
	if BOKEH_AVAILABLE: 
		Plot('Training cost', channels=[['target_cost']], after_batch=True)
	if config['reload']: 
		extensions.append(Load(path=config['saveto'], 
			load_iteration_state=False, 
			load_log=False))
	else: 
		with open(config['saveto']+'.txt', 'w') as f: 
			pass 

	# Set up training algorithm
	logger.info("Initializing training algorithm")
	algorithm = GradientDescent(cost=cost, 
		parameters=cg.parameters,
		step_rule=CompositeRule([StepClipping(config['step_clipping']), 
			eval(config['step_rule'])()])
    )

	# Initialize main loop
	logger.info("Initializing main loop")
	main_loop = MainLoop(
		model=model,
		algorithm=algorithm,
		data_stream=train_stream,
		extensions=extensions)
	main_loop.run()
Ejemplo n.º 19
0
 def __init__(self):
     SoftmaxEmitter.__init__(self)
     self.theano_rng = MRG_RandomStreams(seed=random.randint(0, 100000))
 def __init__(self):
     SoftmaxEmitter.__init__(self)
     self.theano_rng = MRG_RandomStreams(seed=random.randint(0,100000))
Ejemplo n.º 21
0
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition",
                                    dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(Readout(
            readout_dim=num_states,
            source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(num_states,
                                          feedback_dim,
                                          name='feedback'),
            name="readout"),
                                      transition,
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape)
             for key, value in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(
            generator.cost_matrix(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(algorithm=algorithm,
                             data_stream=DataStream(
                                 MarkovChainDataset(rng, seq_len),
                                 iteration_scheme=ConstantScheme(batch_size)),
                             model=Model(cost),
                             extensions=[
                                 FinishAfter(after_n_batches=num_batches),
                                 TrainingDataMonitoring([cost],
                                                        prefix="this_step",
                                                        after_batch=True),
                                 TrainingDataMonitoring([cost],
                                                        prefix="average",
                                                        every_n_batches=100),
                                 Checkpoint(save_path, every_n_batches=500),
                                 Printing(every_n_batches=100)
                             ])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(
            generator.generate(n_steps=steps, batch_size=1,
                               iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)

        # Embed questions and cntext
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        bricks.append(embed)

        qembed = embed.apply(question)
        cembed = embed.apply(context)

        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + qlstms + clstms

        # Calculate question encoding (concatenate layer1)
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        if config.ctx_skip_connections: #default yes
            cenc_dim = 2*sum(config.ctx_lstm_size) #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP           activation: Tanh, identity
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') #Wum
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') # Wym
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
        attended.name = 'attended'

        print("attended shape: %d" %attended.shape)

        dimension = qenc_dim + cenc_dim
        transition = SimpleRecurrent(activation=Tanh(),dim=dimension, name="transition")

        readout = Readout(
            readout_dim=vocab_size,
            source_names=[transition.apply.states[0]],
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(vocab_size, dimension),
            name="readout")

        generator = SequenceGenerator(
            readout=readout, transition=transition,
            name="generator")

        self.generator = generator
        bricks += [generator]


        cost = self.generator.cost()




        # Now we can calculate our output
        out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                      activations=config.out_mlp_activations + [Identity()],
                      name='out_mlp')
        bricks += [out_mlp]
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :],
                                 tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
        probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Calculate prediction, cost and error rate
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Apply dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim,
                                              feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test generate
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph(states + outputs + costs)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # Test masks agnostic results of cost
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def test_softmax_emitter_initial_outputs():
    emitter = SoftmaxEmitter(3)
    emitter.readout_dim = 0
    assert_equal(emitter.initial_outputs(2).eval(),
                 3 * numpy.ones((2,), dtype='int64'))
Ejemplo n.º 25
0
def main(mode, save_path, steps, time_budget, reset):

    num_states = ChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_params().items()],
                        width=120))
        logger.info("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        if os.path.isfile(save_path) and not reset:
            model = Pylearn2Model.load(save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: would be probably nicer to make cost part of the model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=save_path, save_freq=10)
        train.main_loop(time_budget=time_budget)
    elif mode == "sample":
        model = Pylearn2Model.load(save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
Ejemplo n.º 26
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 representation_dim,
                 context_dim,
                 target_transition,
                 theano_seed=None,
                 loss_function='cross_entropy',
                 **kwargs):
        super(InitialContextDecoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = target_transition(attended_dim=state_dim,
                                            context_dim=context_dim,
                                            dim=state_dim,
                                            activation=Tanh(),
                                            name='decoder')

        # self.transition = GRUInitialStateWithInitialStateConcatContext(
        #     attended_dim=state_dim, context_dim=context_dim, dim=state_dim,
        #     activation=Tanh(), name='decoder')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=[
                'states',
                'feedback',
                # Chris: it's key that we're taking the first output of self.attention.take_glimpses.outputs
                # Chris: the first output is the weighted avgs, the second is the weights in (batch, time)
                self.attention.take_glimpses.outputs[0]
            ],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=state_dim, name='maxout_bias').apply,
                Maxout(num_pieces=2, name='maxout').apply,
                Linear(input_dim=state_dim / 2,
                       output_dim=embedding_dim,
                       use_bias=False,
                       name='softmax0').apply,
                Linear(input_dim=embedding_dim, name='softmax1').apply
            ]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        if loss_function == 'cross_entropy':
            self.sequence_generator = InitialContextSequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))
        elif loss_function == 'min_risk':
            self.sequence_generator = MinRiskInitialContextSequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))
            # the name is important, because it lets us match the brick hierarchy names for the vanilla SequenceGenerator
            # to load pretrained models
            # TODO: quick hack to fix bug
            self.sequence_generator.name = 'initialcontextsequencegenerator'

        else:
            raise ValueError(
                'The decoder does not support the loss function: {}'.format(
                    loss_function))

        # TODO: uncomment this!!
        # self.sequence_generator.name = 'sequencegenerator'

        self.children = [self.sequence_generator]
Ejemplo n.º 27
0
    def __init__(
        self,
        recordings_source,
        labels_source,
        eos_label,
        num_features,
        num_phonemes,
        dim_dec,
        dims_bidir,
        dims_bottom,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        lm=None,
        character_map=None,
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        bottom_activation=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=True,
        energy_normalizer=None,  # softmax is th edefault set in SequenceContentAndConvAttention
        **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        bottom_activation = bottom_activation
        post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if len(dims_bottom) else num_features, subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout is activation
                        # (because Maxout in a way is not one). However
                        # a single layer Maxout network works with the trick below.
                        # For deeper Maxout network one has to use the
                        # Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        self.batch_inputs = [
            self.recordings, self.recordings_source, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)
Ejemplo n.º 28
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 att_dim,
                 maxout_dim,
                 representation_dim,
                 attention_strategy='content',
                 attention_sources='s',
                 readout_sources='sfa',
                 memory='none',
                 memory_size=500,
                 seq_len=50,
                 init_strategy='last',
                 make_prunable=False,
                 theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick.
        
        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            att_dim (int): Size of attention match vector
            maxout_dim (int): Size of maxout layer
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         cf.  ``_initialize_attention``
            attention_sources (string): Defines the sources used by the 
                                        attention model 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the 
                                      readout network. 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf.  ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf.  ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=representation_dim / 2,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        att_dim = att_dim if att_dim > 0 else state_dim
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition, representation_dim,
            att_dim, attention_sources, readout_sources, memory, memory_size)

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
        post_layers = [
            Bias(dim=maxout_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            Linear(input_dim=maxout_dim / 2,
                   output_dim=embedding_dim,
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply
        ]
        if make_prunable:
            post_merge = PrunableInitializableFeedforwardSequence(post_layers)
        else:
            post_merge = InitializableFeedforwardSequence(post_layers)
        readout = Readout(source_names=src_names,
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1,
                                                 theano_seed=theano_seed),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=post_merge,
                          merged_dim=maxout_dim)

        # Build sequence generator accordingly
        if make_prunable:
            self.sequence_generator = PrunableSequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))
        else:
            self.sequence_generator = SequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))

        self.children = [self.sequence_generator]
Ejemplo n.º 29
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode",
        choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
        " and `sample` to sample a sequence generated by an"
        " existing one.")
    parser.add_argument("prefix",
                        default="sine",
                        help="The prefix for model, timing and state files")
    parser.add_argument("--steps",
                        type=int,
                        default=100,
                        help="Number of steps to plot")
    args = parser.parse_args()

    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8

    transition = GatedRecurrent(name="transition", activation=Tanh(), dim=dim)
    generator = SequenceGenerator(LinearReadout(
        readout_dim=num_states,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(num_states, feedback_dim, name='feedback'),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape)
         for key, value in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()
        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state, None)
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        sample = ComputationGraph(
            generator.generate(n_steps=args.steps, batch_size=1,
                               iterate=True)).function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        assert False