Example #1
def create_lm_model(self):
    """
    Create a language model, based on the encoder-decoder, that generates
    a target sequence given an input sequence.
    """
    if hasattr(self, 'lm_model'):
        return self.lm_model
    self.lm_model = LM_Model(
        cost_layer=self.predictions,
        sample_fn=self.create_sampler(),
        weight_noise_amount=self.state['weight_noise_amount'],
        word_dict=self.state['word_indx_trgt'],
        word_dict_src=self.state['word_indx'],
        rng=self.rng)
    self.lm_model.load_dict(self.state)
    logger.debug("Params of Language Model:\n{}".format(
        pprint.pformat(sorted([p.name for p in self.lm_model.params]))))
    return self.lm_model
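
A minimal usage sketch (an assumption based on the RNNEncoderDecoder class shown in Example #5, not code from this example): create_lm_model is only meaningful after build() has created self.predictions.

# hedged usage sketch; `state` is a groundhog state dict with the keys referenced above
encdec = RNNEncoderDecoder(state, numpy.random.RandomState(state['seed']))
encdec.build()                        # builds self.predictions (the cost layer)
lm_model = encdec.create_lm_model()   # wraps it in an LM_Model; cached on self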
Example #2
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(rng,
                           n_in=state['n_in'],
                           n_hids=eval(state['inp_nhids']),
                           activation=eval(state['inp_activ']),
                           init_fn='sample_weights_classic',
                           weight_noise=state['weight_noise'],
                           rank_n_approx=state['rank_n_approx'],
                           scale=state['inp_scale'],
                           sparsity=state['inp_sparse'],
                           learn_bias=True,
                           bias_scale=eval(state['inp_bias']),
                           name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
        rng,
        eval(state['nhids']),
        activation=eval(state['rec_activ']),
        #activation = 'TT.nnet.sigmoid',
        bias_scale=eval(state['rec_bias']),
        scale=eval(state['rec_scale']),
        sparsity=eval(state['rec_sparse']),
        init_fn=eval(state['rec_init']),
        weight_noise=state['weight_noise'],
        name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb,
                    n_steps=x.shape[0],
                    init_state=h0 * reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer
    emb_layer = MultiLayer(rng, )

    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer

    #### Exercise (2)
    #### TODO: Define a dropout layer
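    #### Sketch of one possible solution (NOT part of the original exercise file).
    #### The layer names emb_state, emb_words_out, outhid_activ and outhid_dropout
    #### are taken from the comment above and from sample_fn further down; the
    #### state keys 'dout_activ' and 'dropout' are assumptions.
    ##### hidden state -> intermediate layer (completes the emb_layer stub above)
    emb_state = MultiLayer(rng,
                           n_in=eval(state['nhids'])[-1],
                           n_hids=eval(state['dout_nhid']),
                           activation='lambda x: x',
                           init_fn='sample_weights_classic',
                           weight_noise=state['weight_noise'],
                           learn_bias=True,
                           name='emb_state')
    ##### input word -> intermediate layer
    emb_words_out = MultiLayer(rng,
                               n_in=state['n_in'],
                               n_hids=eval(state['dout_nhid']),
                               activation='lambda x: x',
                               init_fn='sample_weights_classic',
                               weight_noise=state['weight_noise'],
                               learn_bias=True,
                               name='emb_words_out')
    ##### activation applied to the combined intermediate representation
    outhid_activ = UnaryOp(activation=eval(state['dout_activ']))
    ##### dropout on the intermediate representation
    outhid_dropout = DropOp(rng=rng, dropout=state['dropout'])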

    #### Softmax Layer
    output_layer = SoftmaxLayer(rng,
                                eval(state['dout_nhid']),
                                state['n_out'],
                                scale=state['out_scale'],
                                bias_scale=state['out_bias_scale'],
                                init_fn="sample_weights_classic",
                                weight_noise=state['weight_noise'],
                                sparsity=state['out_sparse'],
                                sum_over_time=True,
                                name='out')

    ### A Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(rng,
                              n_in=state['n_in'],
                              n_hids=eval(state['inpout_nhids']),
                              activations=eval(state['inpout_activ']),
                              init_fn='sample_weights_classic',
                              weight_noise=state['weight_noise'],
                              scale=eval(state['inpout_scale']),
                              sparsity=eval(state['inpout_sparse']),
                              learn_bias=eval(state['inpout_learn_bias']),
                              bias_scale=eval(state['inpout_bias']),
                              name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']

    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'],
                      int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr'] / (1 + time / obj.state['lr_beta'])
            obj.lr = new_lr

    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer
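    ##### Sketch of one possible solution (assumes the emb_state / emb_words_out /
    ##### outhid_activ / outhid_dropout layers sketched above)
    outhid = outhid_dropout(outhid_activ(emb_state(rec_layer) + emb_words_out(x)))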

    train_model = output_layer(outhid,
                               no_noise_bias=state['no_noise_bias'],
                               additional_inputs=additional_inputs).train(
                                   target=y,
                                   scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps=x.shape[0],
                    batch_size=1,
                    init_state=h0val * reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise
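    ##### Sketch of one possible solution (noise disabled for validation;
    ##### assumes the layers sketched in the training section above)
    outhid = outhid_activ(emb_state(rec_layer, use_noise=False) +
                          emb_words_out(x, use_noise=False))
    outhid = outhid_dropout(outhid, use_noise=False)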

    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs = [rec_layer]
    valid_model = output_layer(outhid,
                               additional_inputs=additional_inputs,
                               use_noise=False).validate(target=y,
                                                         sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]

    valid_fn = theano.function([x, y, reset],
                               valid_model.cost,
                               name='valid_fn',
                               updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise=False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(outhid_activ(
            emb_state(h0, use_noise=False, one_step=True) +
            emb_words_out(word_tm1, use_noise=False, one_step=True),
            one_step=True),
                                use_noise=False,
                                one_step=True)
        word = output_layer.get_sample(state_below=outhid,
                                       additional_inputs=[h0],
                                       temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(sample_fn,
                                         states=[
                                             TT.alloc(numpy.int64(0),
                                                      state['sample_steps']),
                                             TT.alloc(numpy.float32(0), 1,
                                                      eval(state['nhids'])[-1])
                                         ],
                                         n_steps=state['sample_steps'],
                                         name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']

    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(cost_layer=train_model,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     clean_before_noise_fn=False,
                     noise_fn=None,
                     rng=rng)

    if state['reload']:
        model.load(state['prefix'] + 'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)
    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost=False,
                    hooks=hook_fn,
                    validate_postprocess=eval(state['validate_postprocess']))
    ## Run!
    main.main()
Example #3
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(
        rng,
        n_in=state['n_in'],
        n_hids=eval(state['inp_nhids']),
        activation=eval(state['inp_activ']),
        init_fn='sample_weights_classic',
        weight_noise=state['weight_noise'],
        rank_n_approx = state['rank_n_approx'],
        scale=state['inp_scale'],
        sparsity=state['inp_sparse'],
        learn_bias = True,
        bias_scale=eval(state['inp_bias']),
        name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
            rng,
            eval(state['nhids']),
            activation = eval(state['rec_activ']),
            #activation = 'TT.nnet.sigmoid',
            bias_scale = eval(state['rec_bias']),
            scale=eval(state['rec_scale']),
            sparsity=eval(state['rec_sparse']),
            init_fn=eval(state['rec_init']),
            weight_noise=state['weight_noise'],
            name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb, n_steps=x.shape[0],
                    init_state=h0*reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer

    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer

    #### Exercise (2)
    #### TODO: Define a dropout layer

    #### Softmax Layer
    output_layer = SoftmaxLayer(
        rng,
        eval(state['dout_nhid']),
        state['n_out'],
        scale=state['out_scale'],
        bias_scale=state['out_bias_scale'],
        init_fn="sample_weights_classic",
        weight_noise=state['weight_noise'],
        sparsity=state['out_sparse'],
        sum_over_time=True,
        name='out')

    ### A Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(
            rng,
            n_in=state['n_in'],
            n_hids=eval(state['inpout_nhids']),
            activations=eval(state['inpout_activ']),
            init_fn='sample_weights_classic',
            weight_noise = state['weight_noise'],
            scale=eval(state['inpout_scale']),
            sparsity=eval(state['inpout_sparse']),
            learn_bias=eval(state['inpout_learn_bias']),
            bias_scale=eval(state['inpout_bias']),
            name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']
    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr']/(1+time/obj.state['lr_beta'])
            obj.lr = new_lr
    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer

    train_model = output_layer(outhid,
                               no_noise_bias=state['no_noise_bias'],
                               additional_inputs=additional_inputs).train(target=y,
            scale=numpy.float32(1./state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps = x.shape[0],
                    batch_size=1,
                    init_state=h0val*reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise

    if state['shortcut_inpout']:
        additional_inputs=[rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs=[rec_layer]
    valid_model = output_layer(outhid,
            additional_inputs=additional_inputs,
            use_noise=False).validate(target=y, sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]

    valid_fn = theano.function([x,y, reset], valid_model.out,
          name='valid_fn', updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise = False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(outhid_activ(emb_state(h0, use_noise=False, one_step=True) +
            emb_words_out(word_tm1, use_noise=False, one_step=True), one_step=True), 
            use_noise=False, one_step=True)
        word = output_layer.get_sample(state_below=outhid, additional_inputs=[h0], temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(sample_fn,
                      states = [
                          TT.alloc(numpy.int64(0), state['sample_steps']),
                          TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1])],
                      n_steps= state['sample_steps'],
                      name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
        updates=updates, profile=False, name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']
    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(
        cost_layer = train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn = valid_fn,
        clean_before_noise_fn = False,
        noise_fn = None,
        rng = rng)

    if state['reload']:
        model.load(state['prefix']+'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)
    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost = False,
                    hooks = hook_fn,
                    validate_postprocess =  eval(state['validate_postprocess']))
    ## Run!
    main.main()
Example #4
valid_fn = theano.function(
    [x, y, reset],
    valid_model.cost,
    name='valid_fn',
    updates=[],  #No updates
    on_unused_input='warn')

model_path = state['prefix'] + 'model.npz'
timings_path = state['prefix'] + 'timing.npz'

try:
    print "Loading model"
    model = LM_Model(cost_layer=train_model,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     clean_before_noise_fn=False,
                     noise_fn=None,
                     test_verbosity=0,
                     cost_per_sample=True,
                     indx_word=state['dictionary'],
                     rng=rng)
    model.load(model_path)
    print "Model loaded"
except Exception:
    print 'mainLoop: Corrupted model file'
    traceback.print_exc()
try:
    timings = dict(numpy.load(timings_path).iteritems())
except Exception:
    print 'mainLoop: Corrupted timings file'
    traceback.print_exc()
Example #5
class RNNEncoderDecoder(object):
    """This class encapsulates the main translation model.

    The expected usage pattern is:
    >>> encdec = RNNEncoderDecoder(...)
    >>> encdec.build(...)
    >>> useful_smth = encdec.create_useful_smth(...)

    Functions from the create_smth family (except create_lm_model),
    when called, compile and return functions that do useful stuff.
    """

    def __init__(self, state, rng,
            skip_init=False,
            compute_alignment=False):
        """Constructor.

        :param state:
            A state in the usual groundhog sense.
        :param rng:
            Random number generator. Something like numpy.random.RandomState(seed).
        :param skip_init:
            If True, all the layers are initialized with zeros. Saves time spent on
            parameter initialization if they are loaded later anyway.
        :param compute_alignment:
            If True, the alignment is returned by the decoder.
        """

        self.state = state
        self.rng = rng
        self.skip_init = skip_init
        self.compute_alignment = compute_alignment

    def build(self):
        """
        Build the computation graph of an RNN Encoder-Decoder. (From input sequence x to predictions/costs)
        It builds two graphs: a log-likelihood computation graph for evaluating costs during training, 
        and a sampling compuation graph for sampling target sequences during sampling.
        """
        logger.debug("Create input variables")
        self.x = TT.lmatrix('x')
        self.x_mask = TT.matrix('x_mask')
        self.y = TT.lmatrix('y')
        self.y_mask = TT.matrix('y_mask')
        self.inputs = [self.x, self.y, self.x_mask, self.y_mask]

        # Graph1: for the log-likelihood computation (evaluating cost of <x,y> pairs during training)
        training_c_components = []

        logger.debug("Create encoder")
        self.encoder = Encoder(self.state, self.rng,
                prefix="enc",
                skip_init=self.skip_init)
        self.encoder.create_layers()

        logger.debug("Build encoding computation graph")
        forward_training_c = self.encoder.build_encoder(
                self.x, self.x_mask,
                use_noise=True,
                return_hidden_layers=True)
        

        logger.debug("Create backward encoder")
        self.backward_encoder = Encoder(self.state, self.rng,
                prefix="back_enc",
                skip_init=self.skip_init)
        self.backward_encoder.create_layers()

        logger.debug("Build backward encoding computation graph")
        backward_training_c = self.backward_encoder.build_encoder(
                self.x[::-1],  # [::-1] reverses the input along the time axis
                self.x_mask[::-1],
                use_noise=True,
                approx_embeddings=self.encoder.approx_embedder(self.x[::-1]),
                return_hidden_layers=True)
        # Reverse time for backward representations.
        backward_training_c.out = backward_training_c.out[::-1]

        if self.state['forward']:
            training_c_components.append(forward_training_c)
        if self.state['last_forward']:
            training_c_components.append(
                    ReplicateLayer(self.x.shape[0])(forward_training_c[-1]))
        if self.state['backward']:
            training_c_components.append(backward_training_c)
        if self.state['last_backward']:
            training_c_components.append(ReplicateLayer(self.x.shape[0])
                    (backward_training_c[0]))
        # The real dimension of the context vector is the sum of the dimensions
        # of all c components.
        self.state['c_dim'] = len(training_c_components) * self.state['dim']

        logger.debug("Create decoder")
        self.decoder = Decoder(self.state, self.rng,
                skip_init=self.skip_init, compute_alignment=self.compute_alignment)
        self.decoder.create_layers()
        logger.debug("Build log-likelihood computation graph")
        self.predictions, self.alignment = self.decoder.build_decoder(
                c=Concatenate(axis=2)(*training_c_components), c_mask=self.x_mask,
                y=self.y, y_mask=self.y_mask)  # returns the cost of source/target pairs

        # theano.printing.pydotprint(self.x, outfile="./pics/lm_model.png", var_with_name_simple=True)

        # Graph2: for sampling from a source sentence x to a target sample y
        sampling_c_components = []

        logger.debug("Build sampling computation graph")
        self.sampling_x = TT.lvector("sampling_x")
        self.n_samples = TT.lscalar("n_samples")
        self.n_steps = TT.lscalar("n_steps")
        self.T = TT.scalar("T")
        self.forward_sampling_c = self.encoder.build_encoder(
                self.sampling_x,
                return_hidden_layers=True).out
        self.backward_sampling_c = self.backward_encoder.build_encoder(
                self.sampling_x[::-1],
                approx_embeddings=self.encoder.approx_embedder(self.sampling_x[::-1]),
                return_hidden_layers=True).out[::-1]
        if self.state['forward']:
            sampling_c_components.append(self.forward_sampling_c)
        if self.state['last_forward']:
            sampling_c_components.append(ReplicateLayer(self.sampling_x.shape[0])
                    (self.forward_sampling_c[-1]))
        if self.state['backward']:
            sampling_c_components.append(self.backward_sampling_c)
        if self.state['last_backward']:
            sampling_c_components.append(ReplicateLayer(self.sampling_x.shape[0])
                    (self.backward_sampling_c[0]))

        self.sampling_c = Concatenate(axis=1)(*sampling_c_components).out
        (self.sample, self.sample_log_prob), self.sampling_updates =\
            self.decoder.build_sampler(self.n_samples, self.n_steps, self.T,
                    c=self.sampling_c)

        logger.debug("Create auxiliary variables")
        self.c = TT.matrix("c")
        self.step_num = TT.lscalar("step_num")
        self.current_states = [TT.matrix("cur_{}".format(i))
                for i in range(self.decoder.num_levels)]
        self.gen_y = TT.lvector("gen_y")

    def create_lm_model(self):
        """
        create language model based on the encoder-decoder
        to generate target sequence given an input sequence.
        """
        if hasattr(self, 'lm_model'):
            return self.lm_model
        self.lm_model = LM_Model(
            cost_layer=self.predictions,
            sample_fn=self.create_sampler(),
            weight_noise_amount=self.state['weight_noise_amount'],
            word_dict=self.state['word_indx_trgt'],
            word_dict_src=self.state['word_indx'],
            rng=self.rng)
        self.lm_model.load_dict(self.state)
        logger.debug("Params of Language Model:\n{}".format(
            pprint.pformat(sorted([p.name for p in self.lm_model.params]))))
        return self.lm_model

    def create_representation_computer(self):
        """
        function to encode a sequence to a context vector (c)
        """
        if not hasattr(self, "repr_fn"):
            '''repr_fn is a function to encode a sequence to a context vector (c)'''
            self.repr_fn = theano.function(
                    inputs=[self.sampling_x],
                    outputs=[self.sampling_c],
                    name="repr_fn")
        return self.repr_fn

    def create_initializers(self):
        if not hasattr(self, "init_fn"):
            init_c = self.sampling_c[0, -self.state['dim']:]
            self.init_fn = theano.function(
                    inputs=[self.sampling_c],
                    outputs=self.decoder.build_initializers(init_c),
                    name="init_fn")
        return self.init_fn

    def create_sampler(self, many_samples=False):
        if hasattr(self, 'sample_fn'):
            return self.sample_fn
        logger.debug("Compile sampler")
        self.sample_fn = theano.function(
                inputs=[self.n_samples, self.n_steps, self.T, self.sampling_x],
                outputs=[self.sampling_c, self.sample, self.sample_log_prob],
                updates=self.sampling_updates,
                name="sample_fn")
        if not many_samples:
            def sampler(*args):
                return map(lambda x : x.squeeze(), self.sample_fn(1, *args))
            return sampler
        return self.sample_fn

    def create_scorer(self, batch=False):
        # A scorer returns the average prediction confidence (-cost_per_sample)
        # for an input sequence pair.
        if not hasattr(self, 'score_fn'):
            logger.debug("Compile scorer")
            self.score_fn = theano.function(
                    inputs=self.inputs,
                    outputs=[-self.predictions.cost_per_sample],
                    name="score_fn")
        if batch:
            return self.score_fn
        def scorer(x, y):
            x_mask = numpy.ones(x.shape[0], dtype="float32")
            y_mask = numpy.ones(y.shape[0], dtype="float32")
            return self.score_fn(x[:, None], y[:, None],
                    x_mask[:, None], y_mask[:, None])
        return scorer

    def create_next_probs_computer(self):
        if not hasattr(self, 'next_probs_fn'):
            self.next_probs_fn = theano.function(
                    inputs=[self.c, self.step_num, self.gen_y] + self.current_states,
                    outputs=[self.decoder.build_next_probs_predictor(
                        self.c, self.step_num, self.gen_y, self.current_states)],
                    name="next_probs_fn",on_unused_input='warn')
        return self.next_probs_fn

    def create_next_states_computer(self):
        if not hasattr(self, 'next_states_fn'):
            self.next_states_fn = theano.function(
                    inputs=[self.c, self.step_num, self.gen_y] + self.current_states,
                    outputs=self.decoder.build_next_states_computer(
                        self.c, self.step_num, self.gen_y, self.current_states),
                    name="next_states_fn",on_unused_input='warn')
        return self.next_states_fn


    def create_probs_computer(self, return_alignment=False):
        if not hasattr(self, 'probs_fn'):
            logger.debug("Compile probs computer")
            self.probs_fn = theano.function(
                    inputs=self.inputs,
                    outputs=[self.predictions.word_probs, self.alignment],
                    name="probs_fn",on_unused_input='warn')
        def probs_computer(x, y):
            x_mask = numpy.ones(x.shape[0], dtype="float32")
            y_mask = numpy.ones(y.shape[0], dtype="float32")
            probs, alignment = self.probs_fn(x[:, None], y[:, None],
                    x_mask[:, None], y_mask[:, None])
            if return_alignment:
                return probs, alignment
            else:
                return probs
        return probs_computer
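
A hedged usage sketch of the compiled helpers returned by this class (the toy index arrays below are illustrative, not from the original code):

# hedged usage sketch of create_scorer / create_probs_computer
encdec = RNNEncoderDecoder(state, numpy.random.RandomState(state['seed']))
encdec.build()
scorer = encdec.create_scorer()                 # scorer(x, y) -> [-cost_per_sample]
probs_computer = encdec.create_probs_computer() # probs_computer(x, y) -> word probabilities
x_indices = numpy.array([3, 7, 42], dtype="int64")   # toy source word indices
y_indices = numpy.array([5, 11, 2], dtype="int64")   # toy target word indices
score = scorer(x_indices, y_indices)[0]
word_probs = probs_computer(x_indices, y_indices)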
Example #6
model_path = state['prefix'] + 'model.npz'
timings_path = state['prefix'] + 'timing.npz'


 
try:
    print "Loading model" 
    model = LM_Model(
        cost_layer = train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn =  valid_fn,
        clean_before_noise_fn = False,
        noise_fn = None,
        test_verbosity = 0,
        cost_per_sample=True,
        indx_word=state['dictionary'],
        rng = rng)
    model.load(model_path)
    print "Model loaded"
except Exception:
    print 'mainLoop: Corrupted model file'
    traceback.print_exc()
try:
    timings = dict(numpy.load(timings_path).iteritems())
except Exception:
    print 'mainLoop: Corrupted timings file'
    traceback.print_exc()
Example #7
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(rng,
                     n_in=state['nins'],
                     n_hids=[state['rank_n_approx']],
                     activation=[state['rank_n_activ']],
                     init_fn=state['weight_init_fn'],
                     weight_noise=state['weight_noise'],
                     scale=state['weight_scale'],
                     name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_%d' % si))
        if state['rec_gating']:
            gater_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_%d' % si))
        if state['rec_reseting']:
            reseter_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_%d' % si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_reseter_%d' % si))

        add_rec_step.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_%d' % si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si=0,
                state_below=None,
                gater_below=None,
                reseter_below=None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si - 1](state_below,
                                     one_step=one_step,
                                     use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si - 1](state_below,
                                               one_step=one_step,
                                               use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg

        if not one_step:
            rval = add_rec_step[si](rval,
                                    nsteps=seqlen,
                                    batch_size=bs,
                                    mask=words_mask,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        else:
            rval = add_rec_step[si](rval,
                                    mask=words_mask,
                                    state_before=prev_val,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        return rval

    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(rng,
                       n_in=state['nouts'],
                       n_hids=[state['rank_n_approx']],
                       activation=[state['rank_n_activ']],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_t_%d' % si))
        if state['rec_gating']:
            gater_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_t_%d' % si))
        if state['rec_reseting']:
            reseter_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_t_%d' % si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='proj_everything_t_%d' % si,
                       learn_bias=False))
        if state['rec_gating']:
            gater_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='gater_everything_t_%d' % si,
                           learn_bias=False))
        if state['rec_reseting']:
            reseter_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='reseter_everything_t_%d' % si,
                           learn_bias=False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_t_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_reseter_%d' % si))

        add_rec_step_t.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_t_%d' % si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim'] * state['maxout_part']],
                           activation=['lambda x: x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='encoder_proj_%d' % si,
                           learn_bias=(si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                                    indim=indim,
                                    pieces=pieces,
                                    rng=rng)

    def _add_t_op(words_embeddings,
                  everything=None,
                  words_mask=None,
                  prev_val=None,
                  one_step=False,
                  bs=1,
                  init_state=None,
                  use_noise=True,
                  gater_below=None,
                  reseter_below=None,
                  si=0,
                  state_below=None):
        seqlen = words_embeddings.out.shape[0] // bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si - 1](state_below,
                                       one_step=one_step,
                                       use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si - 1](state_below,
                                                   one_step=one_step,
                                                   use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything,
                                                one_step=one_step,
                                                use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything,
                                                  one_step=one_step,
                                                  use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](rval,
                                      nsteps=seqlen,
                                      batch_size=bs,
                                      mask=words_mask,
                                      one_step=one_step,
                                      init_state=init_state,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        else:
            rval = add_rec_step_t[si](rval,
                                      mask=words_mask,
                                      state_before=prev_val,
                                      one_step=one_step,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        return rval

    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=[state['activ']],
                           bias_scale=[state['bias']],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           name='bias_code_%d' % si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(rng,
                               n_in=word_code_nin,
                               n_hids=[outdim],
                               activation='lambda x:x',
                               bias_scale=[state['bias_mlp'] / 3],
                               scale=state['weight_scale'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               learn_bias=False,
                               name='word_code')

    proj_code = MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[outdim],
                           activation='lambda x: x',
                           bias_scale=[state['bias_mlp'] / 3],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           learn_bias=False,
                           name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[outdim],
                       activation='lambda x: x',
                       bias_scale=[state['bias_mlp'] / 3],
                       scale=state['weight_scale'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       name='proj_h_%d' % si))

    if state['bigram']:
        proj_word = MultiLayer(rng,
                               n_in=state['rank_n_approx'],
                               n_hids=[outdim],
                               activation=['lambda x:x'],
                               bias_scale=[state['bias_mlp'] / 3],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(rng,
                                indim,
                                state['nouts'],
                                state['weight_scale'],
                                -1,
                                rank_n_approx=rank_n_approx,
                                rank_n_activ=rank_n_activ,
                                weight_noise=state['weight_noise'],
                                init_fn=state['weight_init_fn'],
                                name='out')

    def _pop_op(everything,
                accum,
                everything_max=None,
                everything_min=None,
                word=None,
                aword=None,
                one_step=False,
                use_noise=True):

        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1, state['decoder_stack']):
            rval += proj_h[si](accum[si],
                               one_step=one_step,
                               use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape(
                    [rshape[0] / shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise),
                                  one_step=one_step,
                                  use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1],
                                                   outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)
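    # A minimal numpy sketch (hypothetical helper, never called) of what the
    # readout in _pop_op computes for one time step when state['deep_out'] is
    # enabled: sum the projected decoder states, add (or multiply in) the
    # context and the previous-word projection, then apply the maxout
    # non-linearity (dropout is omitted here). Assumes maxout_part evenly
    # divides the feature dimension.
    def _readout_sketch_np(proj_states, context, prev_word_proj, maxout_part=2):
        import numpy
        pre = sum(proj_states) + context + prev_word_proj    # (batch, dim_mlp)
        b, d = pre.shape
        # maxout: keep the max over every group of maxout_part units
        return pre.reshape(b, d // maxout_part, maxout_part).max(axis=2)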

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x)),
               x_mask,
               bs=x_mask.shape[1],
               si=0,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x)),
                   x_mask,
                   bs=x_mask.shape[1],
                   si=si,
                   state_below=encoder_acts[-1],
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True, n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape(
            [1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True, n=y.shape[0])(everything)
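    # With a single-layer encoder, the context `everything` is simply the last
    # encoder hidden state replicated once per target position by
    # LastState(ntimes=True, n=y.shape[0]); with a deeper stack, the summed
    # per-layer projections are first passed through encoder_act_layer and then
    # replicated in the same way.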

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None] * state['decoder_stack']

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape(
            [shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None
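    # The branch above averages the source word embeddings over time while
    # ignoring padded positions. A rough numpy equivalent (hypothetical helper,
    # never called), for embeddings emb_x of shape (time, batch, dim) and a 0/1
    # mask of shape (time, batch):
    def _masked_mean_sketch_np(emb_x, mask):
        import numpy
        summed = (emb_x * mask[:, :, None]).sum(axis=0)           # (batch, dim)
        counts = numpy.maximum(1.0, mask.sum(axis=0))[:, None]    # (batch, 1)
        return summed / counts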

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [
        add_t_op(emb_words_t[0](emb_t(y0)),
                 everything,
                 y_mask,
                 bs=y_mask.shape[1],
                 gater_below=gater_below,
                 reseter_below=reseter_below,
                 init_state=init_state[0],
                 si=0)
    ]
    for si in xrange(1, state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(
            add_t_op(emb_words_t[si](emb_t(y0)),
                     everything,
                     y_mask,
                     bs=y_mask.shape[1],
                     state_below=has_said[-1],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     init_state=init_state[si],
                     si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword=aword)

    nll = output_layer.train(
        state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(
            y.shape[0] * y.shape[1], 'float32')
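    # The training cost is the summed negative log-likelihood of the target
    # words divided by the total number of target positions
    # (y.shape[0] * y.shape[1]), i.e. it is reported per token rather than per
    # minibatch.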

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x), use_noise=False),
               si=0,
               use_noise=False,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x), use_noise=False),
                   si=si,
                   state_below=encoder_acts[-1],
                   use_noise=False,
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]),
                                           use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(
                TT.reshape(bias_code[si](everything, use_noise=False),
                           [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x, use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    def sample_fn(*args):
        aidx = 0
        word_tm1 = args[aidx]
        aidx += 1
        prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1
            has_said_tm1.append(args[aidx])
        aidx += 1
        ctx = args[aidx]
        awrd = None
        if state['avg_word']:
            aidx += 1
            awrd = args[aidx]

        val = pop_op(proj_code(ctx),
                     has_said_tm1,
                     word=word_tm1,
                     aword=awrd,
                     one_step=True,
                     use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(state_below=val.out.reshape(
            [1, TT.cast(output_layer.n_in, 'int64')]),
                                     temp=temp,
                                     target=sample.reshape([1, 1]),
                                     use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [
            add_t_op(emb_words_t[0](emb_t(sample)),
                     ctx,
                     prev_val=has_said_tm1[0],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     one_step=True,
                     use_noise=True,
                     si=0)
        ]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(
                add_t_op(emb_words_t[si](emb_t(sample)),
                         ctx,
                         prev_val=has_said_tm1[si],
                         gater_below=gater_below,
                         reseter_below=reseter_below,
                         one_step=True,
                         use_noise=True,
                         si=si,
                         state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
                            states=states,
                            params=sampler_params,
                            n_steps=n_steps,
                            name='sampler_scan')
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function([n_steps, temp, x],
                                [samples, probs.sum()],
                                updates=updates,
                                profile=False,
                                name='sample_fn')
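    # Hypothetical usage sketch (values made up): given a vector `src` of
    # source word indices, draw up to 30 target words at inverse temperature
    # 1.0 and get the summed per-word score of the sample:
    #   samples, score = sample_fn(30, 1.0, src)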

    model = LM_Model(cost_layer=nll,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     sample_fn=sample_fn,
                     clean_before_noise_fn=False,
                     noise_fn=noise_fn,
                     indx_word=state['indx_word_target'],
                     indx_word_src=state['indx_word'],
                     character_level=False,
                     rng=rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:, idx].shape[0]):
                        print model.word_indxs_src[x[:, idx][k]],
                        if model.word_indxs_src[x[:, idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:, idx].shape[0]):
                        print model.word_indxs[y[:, idx][k]],
                        if model.word_indxs[y[:, idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:, idx])
                    if len(numpy.where(masks[:, idx] == 0)[0]) > 0:
                        senlen = numpy.where(masks[:, idx] == 0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen'] + 1, 1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen'] + 1, 1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data,
                    valid_data,
                    None,
                    model,
                    algo,
                    state,
                    channel,
                    reset=state['reset'],
                    hooks=hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word = pkl.load(open(state['word_indx'], 'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen + 1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000 
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(
        rng,
        n_in=state['nins'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_%d'%si))
        if state['rec_gating']:
            gater_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='gater_words_%d'%si))
        if state['rec_reseting']:
            reseter_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='reseter_words_%d'%si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias = False, 
                    name='rec_proj_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias = False, 
                    name='rec_proj_reseter_%d'%si))

        add_rec_step.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_%d'%si))

    def _add_op(words_embeddings, 
                words_mask=None,
                prev_val=None,
                si = 0, 
                state_below = None,
                gater_below = None,
                reseter_below = None,
                one_step=False, 
                bs=1, 
                init_state=None, 
                use_noise=True):
        seqlen = words_embeddings.out.shape[0]//bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si-1](state_below, one_step=one_step, 
                    use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
            
        if not one_step:
            rval= add_rec_step[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        else:
            rval= add_rec_step[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        return rval
    add_op = Operator(_add_op)
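    # Assuming state['rec_layer'] names a gated recurrent (GRU-style) layer,
    # one hidden-state update inside add_rec_step roughly follows this numpy
    # sketch (hypothetical helper, never called; U, U_z and U_r are stand-in
    # recurrent weight matrices):
    def _gru_step_sketch_np(below, gater_below, reseter_below, h_prev, U, U_z, U_r):
        import numpy
        sigmoid = lambda v: 1.0 / (1.0 + numpy.exp(-v))
        z = sigmoid(gater_below + numpy.dot(h_prev, U_z))        # update gate
        r = sigmoid(reseter_below + numpy.dot(h_prev, U_r))      # reset gate
        h_tilde = numpy.tanh(below + numpy.dot(r * h_prev, U))   # candidate state
        return (1.0 - z) * h_prev + z * h_tilde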

    # Target Sentence
    emb_t = MultiLayer(
        rng,
        n_in=state['nouts'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_t_%d'%si))
        if state['rec_gating']:
            gater_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='gater_words_t_%d'%si))
        if state['rec_reseting']:
            reseter_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='reseter_words_t_%d'%si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='proj_everything_t_%d'%si,
            learn_bias = False))
        if state['rec_gating']:
            gater_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='gater_everything_t_%d'%si,
                learn_bias = False))
        if state['rec_reseting']:
            reseter_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='reseter_everything_t_%d'%si,
                learn_bias = False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_t_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_reseter_%d'%si))

        add_rec_step_t.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_t_%d'%si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim'] * state['maxout_part']],
                activation=['lambda x: x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='encoder_proj_%d'%si,
                learn_bias = (si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), 
                indim = indim, pieces = pieces, rng=rng)

    def _add_t_op(words_embeddings, everything = None, words_mask=None,
                prev_val=None,one_step=False, bs=1, 
                init_state=None, use_noise=True,
                gater_below = None,
                reseter_below = None,
                si = 0, state_below = None):
        seqlen = words_embeddings.out.shape[0]//bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si-1](state_below, 
                    one_step=one_step, use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                one_step=one_step,
                init_state=init_state,
                gater_below = gater,
                reseter_below = reseter,
                use_noise = use_noise)
        else:
            rval = add_rec_step_t[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                one_step=one_step,
                gater_below = gater,
                reseter_below = reseter,
                use_noise = use_noise)
        return rval
    add_t_op = Operator(_add_t_op)
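    # Unlike the encoder step, every decoder step built by _add_t_op also
    # receives the fixed source representation `everything` (the context
    # vector): it is projected by proj_everything_t into the candidate
    # activation and, when gating or reseting is enabled, by
    # gater_everything_t / reseter_everything_t into the corresponding gates.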

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation = [state['activ']],
                bias_scale = [state['bias']],
                scale=state['weight_scale'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                name='bias_code_%d'%si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(
            rng,
            n_in=word_code_nin,
            n_hids=[outdim],
            activation = 'lambda x:x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            learn_bias = False,
            name='word_code')

    proj_code = MultiLayer(
        rng,
        n_in=state['dim'],
        n_hids=[outdim],
        activation = 'lambda x: x',
        bias_scale = [state['bias_mlp']/3],
        scale=state['weight_scale'],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        learn_bias = False,
        name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[outdim],
            activation = 'lambda x: x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            name='proj_h_%d'%si))

    if state['bigram']:
        proj_word = MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[outdim],
            activation=['lambda x:x'],
            bias_scale = [state['bias_mlp']/3],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            learn_bias = False,
            name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(
            rng,
            indim, 
            state['nouts'],
            state['weight_scale'],
            -1, 
            rank_n_approx = rank_n_approx,
            rank_n_activ = rank_n_activ,
            weight_noise=state['weight_noise'],
            init_fn=state['weight_init_fn'],
            name='out')

    def _pop_op(everything, accum, everything_max = None,
            everything_min = None, word = None, aword = None,
            one_step=False, use_noise=True):

        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1,state['decoder_stack']):
            rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape([rshape[0]/shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise), 
                            one_step=one_step, use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise), 
                            one_step=one_step, use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise), 
                        one_step=one_step, use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1], outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x)), x_mask, 
        bs=x_mask.shape[1], 
        si=0, gater_below=gater_below, reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x)), 
                x_mask, bs=x_mask.shape[1], 
                si=si, state_below=encoder_acts[-1], 
                gater_below=gater_below,
                reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True,n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape([1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True,n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None] * state['decoder_stack']

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape([shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [add_t_op(emb_words_t[0](emb_t(y0)), 
            everything,
            y_mask, bs=y_mask.shape[1], 
            gater_below = gater_below,
            reseter_below = reseter_below,
            init_state=init_state[0], 
            si=0)]
    for si in xrange(1,state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(add_t_op(emb_words_t[si](emb_t(y0)), 
                everything,
                y_mask, bs=y_mask.shape[1], 
                state_below = has_said[-1],
                gater_below = gater_below,
                reseter_below = reseter_below,
                init_state=init_state[si], 
                si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword = aword)

    nll = output_layer.train(state_below=model, target=y0,
                 mask=y_mask, reg=None) / TT.cast(y.shape[0]*y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x),use_noise=False), 
            si=0, 
            use_noise=False, 
            gater_below=gater_below,
            reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x),use_noise=False), 
                si=si, 
                state_below=encoder_acts[-1], use_noise=False,
                gater_below = gater_below, 
                reseter_below = reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(TT.reshape(bias_code[si](everything, 
                use_noise=False), [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x,use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    def sample_fn(*args):
        aidx = 0; word_tm1 = args[aidx]
        aidx += 1; prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1; has_said_tm1.append(args[aidx])
        aidx += 1; ctx = args[aidx]
        awrd = None
        if state['avg_word']:
            aidx += 1; awrd = args[aidx]
        
        val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1,
                aword=awrd, one_step=True, use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
                state_below=val.out.reshape([1, TT.cast(output_layer.n_in, 'int64')]), 
                temp=temp, target=sample.reshape([1,1]), use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)), 
                ctx,
                prev_val=has_said_tm1[0], 
                gater_below=gater_below,
                reseter_below=reseter_below,
                one_step=True, use_noise=True,
                si=0)]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)), 
                    ctx,
                    prev_val=has_said_tm1[si], 
                    gater_below=gater_below,
                    reseter_below=reseter_below,
                    one_step=True, use_noise=True,
                    si=si, state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
            states = states,
            params = sampler_params,
            n_steps= n_steps,
            name='sampler_scan'
            )
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function(
        [n_steps, temp, x], [samples, probs.sum()],
        updates=updates,
        profile=False, name='sample_fn')

    model = LM_Model(
        cost_layer = nll,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn = valid_fn,
        sample_fn  = sample_fn,
        clean_before_noise_fn = False,
        noise_fn = noise_fn,
        indx_word=state['indx_word_target'],
        indx_word_src=state['indx_word'],
        character_level = False,
        rng = rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:,idx].shape[0]):
                        print model.word_indxs_src[x[:,idx][k]],
                        if model.word_indxs_src[x[:,idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:,idx].shape[0]):
                        print model.word_indxs[y[:,idx][k]],
                        if model.word_indxs[y[:,idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:,idx])
                    if len(numpy.where(masks[:,idx]==0)[0]) > 0:
                        senlen = numpy.where(masks[:,idx]==0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen']+1,  1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen']+1,  1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data, valid_data, None, model, algo, state, channel,
            reset = state['reset'], hooks = hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word = pkl.load(open(state['word_indx'], 'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen+1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
def do_experiment(state, channel):
    logging.basicConfig(level=logging.DEBUG,
            format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("Starting state: {}".format(state))

    def maxout(x):
        shape = x.shape
        if x.ndim == 1:
            shape1 = TT.cast(shape[0] / state['maxout_part'], 'int64')
            shape2 = TT.cast(state['maxout_part'], 'int64')
            x = x.reshape([shape1, shape2])
            x = x.max(1)
        else:
            shape1 = TT.cast(shape[1] / state['maxout_part'], 'int64')
            shape2 = TT.cast(state['maxout_part'], 'int64')
            x = x.reshape([shape[0], shape1, shape2])
            x = x.max(2)
        return x
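    # For example (hypothetical shapes): with state['maxout_part'] == 2, a
    # (batch, 2000) pre-activation is reshaped to (batch, 1000, 2) and the max
    # over the last axis leaves a (batch, 1000) maxout output.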

    logger.info("Start loading")
    rng = numpy.random.RandomState(state['seed'])
    train_data, valid_data, test_data = get_data(state)

    logger.info("Build layers")
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')
    scoring_inputs = [x, x_mask, y, y_mask]

    bs = state['bs']

    # Dimensionality of word embedings.
    # The same as state['dim'] and in fact equals the number of hidden units.
    embdim = state['dim_mlp']

    logger.info("Source sentence")
    # Low-rank embeddings
    emb = MultiLayer(
        rng,
        n_in=state['nins'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    # si always stands for the number in stack of RNNs (which is actually 1)
    for si in xrange(state['encoder_stack']):
        # In paper it is multiplication by W
        emb_words.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_%d'%si))
        # In paper it is multiplication by W_z
        if state['rec_gating']:
            gater_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='gater_words_%d'%si))
        # In paper it is multiplication by W_r
        if state['rec_reseting']:
            reseter_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='reseter_words_%d'%si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_reseter_%d'%si))

        # This should be U from paper
        add_rec_step.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_%d'%si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si = 0,
                state_below = None,
                gater_below = None,
                reseter_below = None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
        seqlen = words_embeddings.out.shape[0]//bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si-1](state_below, one_step=one_step,
                    use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si-1](state_below, one_step=one_step,
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si-1](state_below, one_step=one_step,
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg

        if not one_step:
            rval= add_rec_step[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        else:
            # Here we link the Encoder part
            rval= add_rec_step[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        return rval
    add_op = Operator(_add_op)

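    # Target side: emb_t embeds target word indices into a rank_n_approx-dimensional
    # space; the per-level projections below map these embeddings into the decoder
    # input and into its gates.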
    logger.info("Target sequence")
    emb_t = MultiLayer(
        rng,
        n_in=state['nouts'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_t_%d'%si))
        if state['rec_gating']:
            gater_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='gater_words_t_%d'%si))
        if state['rec_reseting']:
            reseter_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='reseter_words_t_%d'%si))

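    # Projections of the encoder summary into each decoder level: proj_everything_t
    # (the matrix C mentioned below) feeds the recurrent input of the level, while
    # the gater/reseter variants feed the update and reset gates.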
    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        # This stands for the matrix C from the text
        proj_everything_t.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='proj_everything_t_%d'%si,
            learn_bias = False))
        if state['rec_gating']:
            gater_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='gater_everything_t_%d'%si,
                learn_bias = False))
        if state['rec_reseting']:
            reseter_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='reseter_everything_t_%d'%si,
                learn_bias = False))

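    # Decoder recurrent transitions, one per level, mirroring the encoder stack:
    # levels above the first also receive a projection of the state of the level
    # below.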
    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_t_%d'%si))
            if state['rec_gating']:
                rec_proj_t_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_reseter_%d'%si))

        # This handles gating, resetting and the non-linearity of the decoder hidden state
        add_rec_step_t.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_t_%d'%si))

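    # With more than one encoder level, the last state of every level will be
    # projected, summed, and passed through a unary activation (presumably a
    # maxout-style non-linearity, given maxout_part) to obtain the fixed-length
    # source representation.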
    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim'] * state['maxout_part']],
                activation=['lambda x: x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='encoder_proj_%d'%si,
                learn_bias = (si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                indim=indim, pieces=pieces, rng=rng)

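    # The decoder counterpart of _add_op: target embeddings, the projection of the
    # level below and the projected context ("everything") all feed the recurrent
    # update, with optional gater/reseter inputs for each of them.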
    # Actually add the target-side op
    def _add_t_op(words_embeddings, everything = None, words_mask=None,
                prev_val=None,one_step=False, bs=1,
                init_state=None, use_noise=True,
                gater_below = None,
                reseter_below = None,
                si = 0, state_below = None):
        seqlen = words_embeddings.out.shape[0]//bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si-1](state_below,
                    one_step=one_step, use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si-1](state_below, one_step=one_step,
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si-1](state_below, one_step=one_step,
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                one_step=one_step,
                init_state=init_state,
                gater_below=gater,
                reseter_below=reseter,
                use_noise=use_noise)
        else:
            # Here we link the Decoder part
            rval = add_rec_step_t[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                one_step=one_step,
                gater_below=gater,
                reseter_below=reseter,
                use_noise=use_noise)
        return rval
    add_t_op = Operator(_add_t_op)

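    # Readout dimensionality: with a deep output layer the pre-softmax features have
    # size dim_mlp (reduced by maxout_part further below); otherwise they stay in the
    # low-rank space of size rank_n_approx.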
    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation = [state['activ']],
                bias_scale = [state['bias']],
                scale=state['weight_scale'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                name='bias_code_%d'%si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(
            rng,
            n_in=word_code_nin,
            n_hids=[outdim],
            activation = 'lambda x:x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            learn_bias = False,
            name='word_code')

    proj_code = MultiLayer(
        rng,
        n_in=state['dim'],
        n_hids=[outdim],
        activation = 'lambda x: x',
        bias_scale = [state['bias_mlp']/3],
        scale=state['weight_scale'],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        learn_bias = False,
        name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[outdim],
            activation = 'lambda x: x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            name='proj_h_%d'%si))

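    # Optional "bigram" connection: the embedding of the previous target word is
    # projected into the readout, so the next-word distribution is conditioned
    # directly on the last emitted word as well as on the hidden states and context.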
    if state['bigram']:
        proj_word = MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[outdim],
            activation=['lambda x:x'],
            bias_scale = [state['bias_mlp']/3],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            learn_bias = False,
            name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(
            rng,
            indim,
            state['nouts'],
            state['weight_scale'],
            -1,
            rank_n_approx = rank_n_approx,
            rank_n_activ = rank_n_activ,
            weight_noise=state['weight_noise'],
            init_fn=state['weight_init_fn'],
            name='out')

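    # _pop_op builds the readout: it combines (by sum or product, depending on
    # state['mult_out']) the projected decoder states, the projected context, the
    # averaged source-word code and the previous-word projection, then optionally
    # applies the deep-output activation and dropout before the softmax.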
    def _pop_op(everything, accum, everything_max = None,
            everything_min = None, word = None, aword = None,
            one_step=False, use_noise=True):

        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1,state['decoder_stack']):
            rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape([rshape[0]/shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise),
                            one_step=one_step, use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise),
                            one_step=one_step, use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.out.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise),
                        one_step=one_step, use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1], outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

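    # Training graph: run every encoder level over the source sentence, combine the
    # final states into the context vector "everything", and unroll the decoder over
    # the target sequence.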
    logger.info("Construct the model")
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x)), x_mask,
        bs=x_mask.shape[1],
        si=0, gater_below=gater_below, reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x)),
                x_mask, bs=x_mask.shape[1],
                si=si, state_below=encoder_acts[-1],
                gater_below=gater_below,
                reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True,n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape([1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True,n=y.shape[0])(everything)

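    # Initial decoder states are computed from the final context (bias_code) or left
    # unset when state['bias_code'] is off; avg_word additionally provides a
    # mask-weighted average of the source word embeddings for the readout.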
    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None for si in xrange(state['decoder_stack'])]

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape([shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [add_t_op(emb_words_t[0](emb_t(y0)),
            everything,
            y_mask, bs=y_mask.shape[1],
            gater_below = gater_below,
            reseter_below = reseter_below,
            init_state=init_state[0],
            si=0)]
    for si in xrange(1,state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(add_t_op(emb_words_t[si](emb_t(y0)),
                everything,
                y_mask, bs=y_mask.shape[1],
                state_below = has_said[-1],
                gater_below = gater_below,
                reseter_below = reseter_below,
                init_state=init_state[si],
                si=si))

    # has_said holds the decoder hidden states; below they are shifted one step in
    # time and the first step is replaced by init_state, so the readout for a given
    # position only depends on the previous target words.

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], shape[1], state['dim_mlp']]))
            has_said[si] = TT.set_subtensor(has_said[si][0, :, :], init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si][0, :, :], init_state[si])

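    # Readout and cost: combine everything through pop_op and compute the negative
    # log-likelihood of the target words, normalised by the total number of target
    # positions in the batch.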
    model = pop_op(proj_code(everything), has_said, word=y0, aword = aword)

    nll = output_layer.train(state_below=model, target=y0,
                 mask=y_mask, reg=None) / TT.cast(y.shape[0]*y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

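    # Sampling graph: rebuild the encoder on a fresh input variable with noise
    # disabled, so the trained parameters can be reused for generation.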
    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x),use_noise=False),
            si=0,
            use_noise=False,
            gater_below=gater_below,
            reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x),use_noise=False),
                si=si,
                state_below=encoder_acts[-1], use_noise=False,
                gater_below = gater_below,
                reseter_below = reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(TT.reshape(bias_code[si](everything,
                use_noise=False), [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x,use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

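    # One sampling step: given the previous word, the previous decoder states and
    # the context, compute the readout, draw the next word from the softmax with
    # temperature parameter temp, score it, and advance every decoder level by one
    # step.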
    def sample_fn(*args):
        aidx = 0; word_tm1 = args[aidx]
        aidx += 1; prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1; has_said_tm1.append(args[aidx])
        aidx += 1; ctx = args[aidx]
        if state['avg_word']:
            aidx += 1; awrd = args[aidx]
        else:
            awrd = None

        val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1,
                aword=awrd, one_step=True, use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
                state_below=val.out.reshape([1, TT.cast(output_layer.n_in, 'int64')]),
                temp=temp, target=sample.reshape([1,1]), use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)),
                ctx,
                prev_val=has_said_tm1[0],
                gater_below=gater_below,
                reseter_below=reseter_below,
                one_step=True, use_noise=True,
                si=0)]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)),
                    ctx,
                    prev_val=has_said_tm1[si],
                    gater_below=gater_below,
                    reseter_below=reseter_below,
                    one_step=True, use_noise=True,
                    si=si, state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

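    # Drive the one-step sampler with a scan loop of n_steps iterations; the first
    # two state slots accumulate the sampled word indices and their log-probabilities.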
    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
            states = states,
            params = sampler_params,
            n_steps= n_steps,
            name='sampler_scan'
            )
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function(
        [n_steps, temp, x], [samples, probs.sum()],
        updates=updates,
        profile=False, name='sample_fn')

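    # Wrap the cost into an LM_Model trained with SGD; hook_fn (given to the main
    # loop below) prints a few source/target pairs from the training data together
    # with samples from the current model.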
    model = LM_Model(
        cost_layer=nll,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn=valid_fn,
        sample_fn=sample_fn,
        clean_before_noise_fn = False,
        noise_fn=noise_fn,
        indx_word=state['indx_word_target'],
        indx_word_src=state['indx_word'],
        character_level=False,
        rng=rng)

    algo = SGD(model, state, train_data)

    def hook_fn():
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:,idx].shape[0]):
                        print model.word_indxs_src[x[:,idx][k]],
                        if model.word_indxs_src[x[:,idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:,idx].shape[0]):
                        print model.word_indxs[y[:,idx][k]],
                        if model.word_indxs[y[:,idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:,idx])
                    if len(numpy.where(masks[:,idx]==0)[0]) > 0:
                        senlen = numpy.where(masks[:,idx]==0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen']+1,  1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen']+1,  1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data, valid_data, None, model, algo, state, channel,
            reset = state['reset'], hooks = hook_fn)
    if state['reload']: main.load()
    main.main()

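    # Optional scoring mode: compile a function that returns the per-sample
    # log-likelihood and write one score per line of the training set to
    # state['score_file'].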
    if state["scoring"]:
        score_file = open(state["score_file"], "w")

        logger.info("Compiling score function")
        score_fn = theano.function(scoring_inputs, [-nll.cost_per_sample])

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for batch in train_data:
            if batch is None:
                continue
            if batch['x'].shape[0] <= 0 or \
                    batch['x_mask'].shape[0] <= 0 or \
                    batch['y'].shape[0] <= 0 or \
                    batch['y_mask'].shape[0] <= 0:
                logger.error('Skipping malformed batch')
                continue
            st = time.time()
            [scores] = score_fn(batch['x'], batch['x_mask'],
                    batch['y'], batch['y_mask'])
            up_time = time.time() - st
            for s in scores:
                print >>score_file, "{:.5f}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if state['flush_scores'] >= 1 and count % state['flush_scores'] == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format(
                count, n_samples, up_time/scores.shape[0], scores[:5]))
        logger.info("Done")
        score_file.flush()
        score_file.close()

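    # Optional interactive test: read a source sentence from stdin, map it to word
    # indices (unknown words fall back to state['oov']), sample n_samples
    # translations and print them ordered by the model score.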
    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word=pkl.load(open(state['word_indx'],'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen+1, dtype='int64')
                    for idx,sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something is wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx,"(%f):"%(-all_probs[pidx]),sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'