Beispiel #1
0
 def create_lm_model(self):
     """
     Return the language model wrapping this encoder-decoder,
     building it on the first call.

     The model is stored on ``self.lm_model``; once that attribute
     exists, later calls return it without rebuilding anything.
     """
     if not hasattr(self, 'lm_model'):
         self.lm_model = LM_Model(
             cost_layer=self.predictions,
             sample_fn=self.create_sampler(),
             weight_noise_amount=self.state['weight_noise_amount'],
             word_dict=self.state['word_indx_trgt'],
             word_dict_src=self.state['word_indx'],
             rng=self.rng)
         self.lm_model.load_dict(self.state)
         # Log the parameter names in sorted order.
         param_names = sorted(p.name for p in self.lm_model.params)
         logger.debug("Params of Language Model:\n{}".format(
             pprint.pformat(param_names)))
     return self.lm_model
Beispiel #2
0
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalies of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(rng,
                           n_in=state['n_in'],
                           n_hids=eval(state['inp_nhids']),
                           activation=eval(state['inp_activ']),
                           init_fn='sample_weights_classic',
                           weight_noise=state['weight_noise'],
                           rank_n_approx=state['rank_n_approx'],
                           scale=state['inp_scale'],
                           sparsity=state['inp_sparse'],
                           learn_bias=True,
                           bias_scale=eval(state['inp_bias']),
                           name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
        rng,
        eval(state['nhids']),
        activation=eval(state['rec_activ']),
        #activation = 'TT.nnet.sigmoid',
        bias_scale=eval(state['rec_bias']),
        scale=eval(state['rec_scale']),
        sparsity=eval(state['rec_sparse']),
        init_fn=eval(state['rec_init']),
        weight_noise=state['weight_noise'],
        name='rec')

    #### Stiching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb,
                    n_steps=x.shape[0],
                    init_state=h0 * reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer
    emb_layer = MultiLayer(rng, )

    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer

    #### Exercise (2)
    #### TODO: Define a dropout layer

    #### Softmax Layer
    output_layer = SoftmaxLayer(rng,
                                eval(state['dout_nhid']),
                                state['n_out'],
                                scale=state['out_scale'],
                                bias_scale=state['out_bias_scale'],
                                init_fn="sample_weights_classic",
                                weight_noise=state['weight_noise'],
                                sparsity=state['out_sparse'],
                                sum_over_time=True,
                                name='out')

    ### Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(rng,
                              n_in=state['n_in'],
                              n_hids=eval(state['inpout_nhids']),
                              activations=eval(state['inpout_activ']),
                              init_fn='sample_weights_classic',
                              weight_noise=state['weight_noise'],
                              scale=eval(state['inpout_scale']),
                              sparsity=eval(state['inpout_sparse']),
                              learn_bias=eval(state['inpout_learn_bias']),
                              bias_scale=eval(state['inpout_bias']),
                              name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']

    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'],
                      int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr'] / (1 + time / obj.state['lr_beta'])
            obj.lr = new_lr

    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer

    train_model = output_layer(outhid,
                               no_noise_bias=state['no_noise_bias'],
                               additional_inputs=additional_inputs).train(
                                   target=y,
                                   scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps=x.shape[0],
                    batch_size=1,
                    init_state=h0val * reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise

    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs = [rec_layer]
    valid_model = output_layer(outhid,
                               additional_inputs=additional_inputs,
                               use_noise=False).validate(target=y,
                                                         sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]

    valid_fn = theano.function([x, y, reset],
                               valid_model.cost,
                               name='valid_fn',
                               updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise=False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(outhid_activ(
            emb_state(h0, use_noise=False, one_step=True) +
            emb_words_out(word_tm1, use_noise=False, one_step=True),
            one_step=True),
                                use_noise=False,
                                one_step=True)
        word = output_layer.get_sample(state_below=outhid,
                                       additional_inputs=[h0],
                                       temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(sample_fn,
                                         states=[
                                             TT.alloc(numpy.int64(0),
                                                      state['sample_steps']),
                                             TT.alloc(numpy.float32(0), 1,
                                                      eval(state['nhids'])[-1])
                                         ],
                                         n_steps=state['sample_steps'],
                                         name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']

    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(cost_layer=train_model,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     clean_before_noise_fn=False,
                     noise_fn=None,
                     rng=rng)

    if state['reload']:
        model.load(state['prefix'] + 'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)
    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost=False,
                    hooks=hook_fn,
                    validate_postprocess=eval(state['validate_postprocess']))
    ## Run!
    main.main()
Beispiel #3
0
    [x, y, reset],
    valid_model.cost,
    name='valid_fn',
    updates=[],  #No updates
    on_unused_input='warn')

# Both checkpoint files are stored under the experiment prefix.
model_path = state['prefix'] + 'model.npz'
timings_path = state['prefix'] + 'timing.npz'

# Best-effort restore of the model: any failure is reported with a
# traceback and the script carries on.
try:
    print("Loading model")
    model = LM_Model(
        cost_layer=train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn=valid_fn,
        clean_before_noise_fn=False,
        noise_fn=None,
        test_verbosity=0,
        cost_per_sample=True,
        indx_word=state['dictionary'],
        rng=rng,
    )
    model.load(model_path)
    print("Model loaded")
except Exception:
    print('mainLoop: Corrupted model file')
    traceback.print_exc()

# Best-effort restore of the saved timings into a plain dict.
try:
    timings = {
        key: value
        for key, value in numpy.load(timings_path).iteritems()
    }
except Exception:
    print('mainLoop: Corrupted timings file')
    traceback.print_exc()
Beispiel #4
0
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(rng,
                     n_in=state['nins'],
                     n_hids=[state['rank_n_approx']],
                     activation=[state['rank_n_activ']],
                     init_fn=state['weight_init_fn'],
                     weight_noise=state['weight_noise'],
                     scale=state['weight_scale'],
                     name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_%d' % si))
        if state['rec_gating']:
            gater_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_%d' % si))
        if state['rec_reseting']:
            reseter_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_%d' % si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_reseter_%d' % si))

        add_rec_step.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_%d' % si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si=0,
                state_below=None,
                gater_below=None,
                reseter_below=None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
        """Apply encoder recurrent level ``si`` to ``words_embeddings``.

        This is a closure over ``state``, ``rec_proj``, ``rec_proj_gater``,
        ``rec_proj_reseter`` and ``add_rec_step`` from the enclosing scope.

        Args:
            words_embeddings: input to the recurrent layer; its ``.out``
                is used to derive the number of steps.
            words_mask: forwarded as ``mask`` to the recurrent layer.
            prev_val: previous state, used only when ``one_step`` is true.
            si: stack level. For ``si > 0`` the projections of
                ``state_below`` are added to the input and gate signals.
            state_below: output of the level below; read only if ``si > 0``.
            gater_below: gater input, used only if ``state['rec_gating']``.
            reseter_below: reseter input, used only if
                ``state['rec_reseting']``.
            one_step: selects the single-step call of the recurrent layer.
            bs: batch size; the first dimension of the input is divided
                by it to obtain the number of steps.
            init_state: forwarded to the recurrent layer.
            use_noise: forwarded to every layer called here.

        Returns:
            The result of calling ``add_rec_step[si]``.
        """
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si - 1](state_below,
                                     one_step=one_step,
                                     use_noise=use_noise)
            # "Absent" is represented by None, so test identity rather
            # than truthiness, which a layer or tensor type may not
            # define the way a plain object does.
            if state['rec_gating']:
                projg = rec_proj_gater[si - 1](state_below,
                                               one_step=one_step,
                                               use_noise=use_noise)
                if gater is not None:
                    gater += projg
                else:
                    gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if reseter is not None:
                    reseter += projg
                else:
                    reseter = projg

        if not one_step:
            # Full-sequence mode: unroll over `seqlen` steps.
            rval = add_rec_step[si](rval,
                                    nsteps=seqlen,
                                    batch_size=bs,
                                    mask=words_mask,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        else:
            # Single-step mode: continue from the supplied previous state.
            rval = add_rec_step[si](rval,
                                    mask=words_mask,
                                    state_before=prev_val,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        return rval

    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(rng,
                       n_in=state['nouts'],
                       n_hids=[state['rank_n_approx']],
                       activation=[state['rank_n_activ']],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_t_%d' % si))
        if state['rec_gating']:
            gater_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_t_%d' % si))
        if state['rec_reseting']:
            reseter_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_t_%d' % si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='proj_everything_t_%d' % si,
                       learn_bias=False))
        if state['rec_gating']:
            gater_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='gater_everything_t_%d' % si,
                           learn_bias=False))
        if state['rec_reseting']:
            reseter_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='reseter_everything_t_%d' % si,
                           learn_bias=False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_t_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_reseter_%d' % si))

        add_rec_step_t.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_t_%d' % si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim'] * state['maxout_part']],
                           activation=['lambda x: x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='encoder_proj_%d' % si,
                           learn_bias=(si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                                    indim=indim,
                                    pieces=pieces,
                                    rng=rng)

    def _add_t_op(words_embeddings,
                  everything=None,
                  words_mask=None,
                  prev_val=None,
                  one_step=False,
                  bs=1,
                  init_state=None,
                  use_noise=True,
                  gater_below=None,
                  reseter_below=None,
                  si=0,
                  state_below=None):
        """Apply decoder recurrent level ``si`` to ``words_embeddings``.

        This is a closure over ``state``, ``rec_proj_t``,
        ``rec_proj_t_gater``, ``rec_proj_t_reseter``, ``proj_everything_t``,
        ``gater_everything_t``, ``reseter_everything_t`` and
        ``add_rec_step_t`` from the enclosing scope.

        Args:
            words_embeddings: input to the recurrent layer; its ``.out``
                is used to derive the number of steps.
            everything: optional extra input. When given, its projections
                are added to the input and to the gate signals.
            words_mask: forwarded as ``mask`` to the recurrent layer.
            prev_val: previous state, used only when ``one_step`` is true.
            one_step: selects the single-step call of the recurrent layer.
            bs: batch size; the first dimension of the input is divided
                by it to obtain the number of steps.
            init_state: forwarded to the recurrent layer in full-sequence
                mode only.
            use_noise: forwarded to the layers called here.
            gater_below: gater input, used only if ``state['rec_gating']``.
            reseter_below: reseter input, used only if
                ``state['rec_reseting']``.
            si: stack level. For ``si > 0`` the projections of
                ``state_below`` are added as well.
            state_below: output of the level below; if it is a list, its
                last element is used.

        Returns:
            The result of calling ``add_rec_step_t[si]``.
        """
        seqlen = words_embeddings.out.shape[0] // bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si - 1](state_below,
                                       one_step=one_step,
                                       use_noise=use_noise)
            # "Absent" is represented by None, so test identity rather
            # than truthiness, which a layer or tensor type may not
            # define the way a plain object does.
            if state['rec_gating']:
                projg = rec_proj_t_gater[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if gater is not None:
                    gater += projg
                else:
                    gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si - 1](state_below,
                                                   one_step=one_step,
                                                   use_noise=use_noise)
                if reseter is not None:
                    reseter += projg
                else:
                    reseter = projg
        if everything is not None:
            # NOTE(review): unlike the gate projections below, this call
            # does not forward one_step/use_noise; kept as in the original.
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything,
                                                one_step=one_step,
                                                use_noise=use_noise)
                if gater is not None:
                    gater += everyg
                else:
                    gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything,
                                                  one_step=one_step,
                                                  use_noise=use_noise)
                if reseter is not None:
                    reseter += everyg
                else:
                    reseter = everyg

        if not one_step:
            # Full-sequence mode: unroll over `seqlen` steps.
            rval = add_rec_step_t[si](rval,
                                      nsteps=seqlen,
                                      batch_size=bs,
                                      mask=words_mask,
                                      one_step=one_step,
                                      init_state=init_state,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        else:
            # Single-step mode: continue from the supplied previous state.
            rval = add_rec_step_t[si](rval,
                                      mask=words_mask,
                                      state_before=prev_val,
                                      one_step=one_step,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        return rval

    # Wrap the target-side recurrent step so it can be applied like a layer
    # (see the add_t_op(...) calls further down).
    add_t_op = Operator(_add_t_op)

    # Width shared by every projection that _pop_op combines: dim_mlp when a
    # deep output is used, otherwise rank_n_approx (which is then also the
    # input width of the softmax below).
    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    # NOTE: the layers below are created in a fixed order because each
    # constructor receives the shared `rng`; do not reorder them.

    if state['bias_code']:
        # One dim -> dim layer per decoder level; later applied to the
        # encoder summary to obtain that level's initial hidden state.
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=[state['activ']],
                           bias_scale=[state['bias']],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           name='bias_code_%d' % si))

    if state['avg_word']:
        # Identity-activation projection of the averaged source embedding
        # (rank_n_approx -> outdim), without a learned bias.
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(rng,
                               n_in=word_code_nin,
                               n_hids=[outdim],
                               activation='lambda x:x',
                               bias_scale=[state['bias_mlp'] / 3],
                               scale=state['weight_scale'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               learn_bias=False,
                               name='word_code')

    # Identity-activation projection of the encoder summary (dim -> outdim).
    proj_code = MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[outdim],
                           activation='lambda x: x',
                           bias_scale=[state['bias_mlp'] / 3],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           learn_bias=False,
                           name='proj_code')

    # One identity-activation projection per decoder level (dim -> outdim);
    # _pop_op sums their outputs.
    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[outdim],
                       activation='lambda x: x',
                       bias_scale=[state['bias_mlp'] / 3],
                       scale=state['weight_scale'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       name='proj_h_%d' % si))

    if state['bigram']:
        # Projection of the previous target word's embedding
        # (rank_n_approx -> outdim), used by the bigram term in _pop_op.
        proj_word = MultiLayer(rng,
                               n_in=state['rank_n_approx'],
                               n_hids=[outdim],
                               activation=['lambda x:x'],
                               bias_scale=[state['bias_mlp'] / 3],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='emb_words_lm')

    if state['deep_out']:
        # `pieces` is not read anywhere in the visible part of this function;
        # kept in case code outside this section refers to it.
        pieces = 0
        # NOTE(review): eval() on a configuration string -- only safe as long
        # as `state` comes from a trusted experiment description.
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])
        # Floor division keeps the layer width an integer even if true
        # division is in effect for this module.
        indim = state['dim_mlp'] // state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        # Shallow output: the softmax reads the rank_n_approx-wide projection
        # directly and no low-rank factorisation is requested.
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(rng,
                                indim,
                                state['nouts'],
                                state['weight_scale'],
                                -1,
                                rank_n_approx=rank_n_approx,
                                rank_n_activ=rank_n_activ,
                                weight_noise=state['weight_noise'],
                                init_fn=state['weight_init_fn'],
                                name='out')

    def _pop_op(everything,
                accum,
                everything_max=None,
                everything_min=None,
                word=None,
                aword=None,
                one_step=False,
                use_noise=True):
        """
        Combine the decoder states with the context into the pre-softmax
        representation.

        The projections of every decoder level in `accum` are summed, then
        merged (multiplied if state['mult_out'], added otherwise) with
        `everything`, with `aword` when state['avg_word'] is set, and with
        the projected previous word when state['bigram'] is set. With
        state['deep_out'] the result goes through act_layer and drop_layer.
        `everything_max` and `everything_min` are accepted but not used.
        """
        multiplicative = state['mult_out']

        # Sum of the per-level projections of the decoder states.
        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for level in xrange(1, state['decoder_stack']):
            rval += proj_h[level](accum[level],
                                  one_step=one_step,
                                  use_noise=use_noise)

        if multiplicative:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if not one_step:
                # Split the leading axis of rval into (rshape[0] / n, n), with
                # n the leading size of wcode, so that wcode broadcasts over
                # the new first axis; undone again after combining.
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape(
                    [rshape[0] / shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
            if multiplicative:
                rval = rval * wcode
            else:
                rval = rval + wcode
            if not one_step:
                rval = rval.reshape(rshape)

        if word and state['bigram']:
            pword = proj_word(emb_t(word, use_noise=use_noise),
                              one_step=one_step,
                              use_noise=use_noise)
            if not one_step:
                # Whole-sequence mode: give pword an explicit 3-d layout,
                # apply Shift() to it and restore its original shape.
                shape = word.shape
                if isinstance(word, TT.TensorVariable):
                    ndim = word.ndim
                else:
                    ndim = word.out.ndim
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1],
                                                   outdim]))
                pword = pword.reshape(shape_pword)
            if multiplicative:
                rval *= pword
            else:
                rval += pword

        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    # Wrap the output combination so it can be applied like a layer.
    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    #
    # Encoder: level 0 reads only the embedded source words; every further
    # level additionally receives the previous level's activations through
    # state_below.
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x)),
               x_mask,
               bs=x_mask.shape[1],
               si=0,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    # With more than one encoder level the summary is the sum of the
    # projected last states of all levels.
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x)),
                   x_mask,
                   bs=x_mask.shape[1],
                   si=si,
                   state_below=encoder_acts[-1],
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    # Repeat the summary y.shape[0] times (one copy per target position).
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True, n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape(
            [1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True, n=y.shape[0])(everything)

    # Initial decoder state of every level.
    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        # `bias_code` only exists when state['bias_code'] is set, so the list
        # length has to come from the configuration instead.
        init_state = [None] * state['decoder_stack']

    # Optional average of the (masked) source word embeddings.
    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape(
            [shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        # The maximum() guards against dividing by zero for all-masked columns.
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    # Decoder: same stacking scheme as the encoder, driven by the target
    # words y0 and conditioned on `everything`.
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [
        add_t_op(emb_words_t[0](emb_t(y0)),
                 everything,
                 y_mask,
                 bs=y_mask.shape[1],
                 gater_below=gater_below,
                 reseter_below=reseter_below,
                 init_state=init_state[0],
                 si=0)
    ]
    for si in xrange(1, state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(
            add_t_op(emb_words_t[si](emb_t(y0)),
                     everything,
                     y_mask,
                     bs=y_mask.shape[1],
                     state_below=has_said[-1],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     init_state=init_state[si],
                     si=si))

    # Value written into time step 0 of the shifted decoder states. Without
    # bias_code there is no learned initial state; zeros are used instead,
    # matching the zero initial state the sampler builds for that case.
    first_step = []
    for level_init in init_state:
        if level_init is None:
            level_init = numpy.float32(0)
        first_step.append(level_init)

    # Apply Shift() to the decoder states along the first axis and overwrite
    # the first step with the initial state.
    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            # Flattened states: expand to 3-d, shift, then restore the shape.
            shape_hs = has_said[si].shape
            shape = y0.shape
            if y0.ndim == 1:
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], 1, state['dim_mlp']]))
            else:
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                first_step[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                first_step[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword=aword)

    # Training cost, divided by the number of entries of y.
    nll = output_layer.train(
        state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(
            y.shape[0] * y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    # Sampling graph: a fresh symbolic source vector plus the number of
    # sampling steps and the temperature handed to the output layer.
    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')

    # Encoder, level 0.
    gater_below = gater_words[0](emb(x)) if state['rec_gating'] else None
    reseter_below = (reseter_words[0](emb(x))
                     if state['rec_reseting'] else None)
    encoder_acts = [
        add_op(emb_words[0](emb(x), use_noise=False),
               si=0,
               use_noise=False,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)

    # Remaining encoder levels. The loop body only runs when there is more
    # than one level, so the summary is always accumulated here.
    for si in xrange(1, state['encoder_stack']):
        gater_below = gater_words[si](emb(x)) if state['rec_gating'] else None
        reseter_below = (reseter_words[si](emb(x))
                         if state['rec_reseting'] else None)
        encoder_acts.append(
            add_op(emb_words[si](emb(x), use_noise=False),
                   si=si,
                   state_below=encoder_acts[-1],
                   use_noise=False,
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        everything += encoder_proj[si](last(encoder_acts[-1]),
                                       use_noise=False)

    if state['encoder_stack'] > 1:
        everything = encoder_act_layer(everything)
    else:
        encoder = encoder_acts[-1]
        everything = last(encoder)

    # Initial decoder state per level: either computed from the encoder
    # summary or a (1, dim) block of zeros.
    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            layer_init = TT.reshape(bias_code[si](everything, use_noise=False),
                                    [1, state['dim']])
        else:
            layer_init = TT.alloc(numpy.float32(0), 1, state['dim'])
        init_state.append(layer_init)

    # Optional projected mean of the source word embeddings.
    aword = None
    if state['avg_word']:
        aword = word_code(emb(x, use_noise=False).out.mean(0),
                          use_noise=False)

    def sample_fn(*args):
        """
        One sampling step, used as the step function of the scan below.

        Positional arguments, in order: the previously sampled word, the
        previous log-probability entry (not used), one previous hidden state
        per decoder level, the encoder context and, only when
        state['avg_word'] is set, the averaged-word code.

        Returns [sample, log-probability cast to float32] followed by the new
        hidden state of every decoder level.
        """
        aidx = 0
        word_tm1 = args[aidx]
        aidx += 1
        prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1
            has_said_tm1.append(args[aidx])
        aidx += 1
        ctx = args[aidx]
        # Default to None so that pop_op can be called when avg_word is
        # disabled; previously `awrd` was unbound in that configuration.
        awrd = None
        if state['avg_word']:
            aidx += 1
            awrd = args[aidx]

        # Distribution over the next word given the previous states.
        val = pop_op(proj_code(ctx),
                     has_said_tm1,
                     word=word_tm1,
                     aword=awrd,
                     one_step=True,
                     use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(state_below=val.out.reshape(
            [1, TT.cast(output_layer.n_in, 'int64')]),
                                     temp=temp,
                                     target=sample.reshape([1, 1]),
                                     use_noise=False)

        # Advance every decoder level by one step on the sampled word.
        # NOTE(review): these steps run with use_noise=True while the rest of
        # the sampler disables noise -- confirm this is intended.
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [
            add_t_op(emb_words_t[0](emb_t(sample)),
                     ctx,
                     prev_val=has_said_tm1[0],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     one_step=True,
                     use_noise=True,
                     si=0)
        ]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(
                add_t_op(emb_words_t[si](emb_t(sample)),
                         ctx,
                         prev_val=has_said_tm1[si],
                         gater_below=gater_below,
                         reseter_below=reseter_below,
                         one_step=True,
                         use_noise=True,
                         si=si,
                         state_below=has_said_t[-1]))
        # A level may hand back a list; only its last element is kept.
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    # Values passed unchanged to every sampling step: the encoder summary
    # and, if enabled, the averaged-word code.
    sampler_params = [everything] + ([aword] if state['avg_word'] else [])

    # Recurrent outputs of the scan: sampled word indices, their
    # log-probabilities and one hidden state per decoder level.
    states = [
        TT.alloc(numpy.int64(0), n_steps),
        TT.alloc(numpy.float32(0), n_steps),
    ] + init_state

    outputs, updates = scan(sample_fn,
                            states=states,
                            params=sampler_params,
                            n_steps=n_steps,
                            name='sampler_scan')
    samples, probs = outputs[0], outputs[1]

    # From here on `sample_fn` names the compiled sampler, which returns the
    # sampled indices and the sum of the per-step values in `probs`.
    sample_fn = theano.function([n_steps, temp, x],
                                [samples, probs.sum()],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    model = LM_Model(cost_layer=nll,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     sample_fn=sample_fn,
                     clean_before_noise_fn=False,
                     noise_fn=noise_fn,
                     indx_word=state['indx_word_target'],
                     indx_word_src=state['indx_word'],
                     character_level=False,
                     rng=rng)

    # A trainer is only needed when training iterations were requested.
    algo = SGD(model, state, train_data) if state['loopIters'] > 0 else None

    def hook_fn():
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:, idx].shape[0]):
                        print model.word_indxs_src[x[:, idx][k]],
                        if model.word_indxs_src[x[:, idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:, idx].shape[0]):
                        print model.word_indxs[y[:, idx][k]],
                        if model.word_indxs[y[:, idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:, idx])
                    if len(numpy.where(masks[:, idx] == 0)[0]) > 0:
                        senlen = numpy.where(masks[:, idx] == 0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen'] + 1, 1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen'] + 1, 1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data,
                    valid_data,
                    None,
                    model,
                    algo,
                    state,
                    channel,
                    reset=state['reset'],
                    hooks=hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word = pkl.load(open(state['word_indx'], 'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen + 1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass