def perplexity(f_cost, lines, char_dict, word_dict, opts):
    """
    compute perplexity over the validation/test data

    The data is cut into contiguous batches, each batch is converted with
    txt_to_inps and scored with f_cost.  The batch cost is scaled by
    x_f_.shape[1] (presumably the number of sentences in the batch -- TODO
    confirm against txt_to_inps) and accumulated; the result is
    exp(accumulated cost / accumulated (1 - x_last_chars_).sum()).

    Parameters
    ----------
    f_cost     : compiled function, computation for the forward pass
    lines      : list of string, validation/test data
    char_dict  : OrderedDict, {character: index}
    word_dict  : OrderedDict, {word: index}
    opts       : dictionary, {hyperparameter: value}

    Returns
    -------
    cost       : numpy float32, perplexity

    Raises
    ------
    ValueError : if `lines` is empty

    """
    n_lines = len(lines)
    if n_lines == 0:
        raise ValueError('perplexity() needs at least one line of text')

    batch_size = 64
    n_folds = n_lines // (batch_size - 1)
    if n_folds > 1:
        # only the held-out (second) index set of every fold is used, which
        # with shuffle=False walks through the data in contiguous chunks
        batches = [fold for _, fold in
                   KFold(n_lines, n_folds=n_folds, shuffle=False)]
    else:
        # NOTE(review): assuming KFold is the legacy scikit-learn
        # cross_validation.KFold, it rejects n_folds < 2, so data sets with
        # fewer than 2 * (batch_size - 1) lines used to fail here; they are
        # now evaluated as one single batch instead.
        batches = [range(n_lines)]

    cost = 0.
    total_n_words = 0.
    for index in batches:
        x = [lines[i] for i in index]
        x_f_, x_r_, x_spaces_, x_last_chars_, x_word_input_, label_words_ \
                                                      = txt_to_inps(x, char_dict, word_dict, opts)
        batch_cost = f_cost(x_f_, x_r_, x_spaces_, x_last_chars_,
                            x_word_input_, label_words_)
        cost += batch_cost * x_f_.shape[1]
        total_n_words += (1 - x_last_chars_).sum()

    return numpy.exp(cost / total_n_words)
def train(opts):
    """ training process starts here """

    print '==> Training a language model'
    if opts['model_type'] == 'gate':
        if opts['pretrain'] > 0:
            print '    [Gated Word & Char with Pretraining]'
        else:
            print '    [Gated Word & Char]'
    else:
        if opts['pretrain'] > 0:
            print '    [Concat Word & Char with Pretraining]'
        else:
            print '    [Concat Word & Char]'

    #---------------------------------------------------------
    # prepare ingredients
    #---------------------------------------------------------

    print '==> Loading dictionaries: ',

    # load word dictionary
    print 'word dict,',
    if opts['word_dictionary']:
        with open(opts['word_dictionary'], 'rb') as f:
            word_dict = pkl.load(f)  # word -> index
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk  # index -> word

    # load character dictionary
    print 'char dict,',
    if opts['char_dictionary']:
        with open(opts['char_dictionary'], 'rb') as f:
            char_dict = pkl.load(f)  # char -> index
            char_dict[opts['bos']] = len(char_dict)  # add the BOS symbol
        char_idict = dict()
        for kk, vv in char_dict.iteritems():
            char_idict[vv] = kk  # index -> char
    print 'Done'

    # reload options
    if opts['reload_'] and os.path.exists(opts['saveto']):
        with open('%s.pkl' % opts['saveto'], 'rb') as f:
            reloaded_options = pkl.load(f)
            opts.update(reloaded_options)

    # load training data
    train = load_data(path=opts['train_text'])

    # initialize params
    print '==> Building model:'
    params = init_params(opts)

    # reload parameters
    if opts['reload_'] and os.path.exists(opts['saveto']):
        params = load_params(opts['saveto'], params)

    # convert params to Theano shared variabel
    tparams = init_tparams(params)

    # build computational graph
    trng, is_train, pretrain_mode, x_f, x_r, x_spaces, x_last_chars, x_word_input, label_words, cost \
                                                                              = build_model(tparams, opts)
    inps = [x_f, x_r, x_spaces, x_last_chars, x_word_input, label_words]

    print '==> Building f_cost...',
    f_cost = theano.function(inps, cost)
    print 'Done'

    # get gradients
    print '==> Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # gradient clipping
    print 'gradient clipping...',
    grad_norm = tensor.sqrt(tensor.sum([tensor.sum(g**2.) for g in grads]))
    tau = opts['gradclip']
    grad_clipped = []
    for g in grads:
        grad_clipped.append(
            tensor.switch(tensor.ge(grad_norm, tau), g * tau / grad_norm, g))
    print 'Done'

    # build optimizer
    lr = tensor.scalar(name='lr')
    print '==> Building optimizers...',
    f_grad_shared, f_update = eval(opts['optimizer'])(lr, tparams,
                                                      grad_clipped, inps, cost)
    print 'Done'

    #---------------------------------------------------------
    # start optimization
    #---------------------------------------------------------

    print '==> Optimization:'

    # reload history
    history_errs = []
    if opts['reload_'] and os.path.exists(opts['saveto']):
        history_errs = list(numpy.load(opts['saveto'])['history_errs'])
    best_p = None
    bad_counter = 0

    # load validation and test data
    if opts['valid_text']:
        valid_lines = []
        with open(opts['valid_text'], 'r') as f:
            for l in f:
                valid_lines.append(l)
        n_valid_lines = len(valid_lines)
    if opts['test_text']:
        test_lines = []
        with open(opts['test_text'], 'r') as f:
            for l in f:
                test_lines.append(l)
        n_test_lines = len(test_lines)

    # initialize some values
    uidx = 0  # update counter
    estop = False  # early stopping flag
    m = opts['pretrain']  # pretrain for m epochs using word/char only
    lrate = opts['lrate']
    lr_decayed = opts['lrate']
    batch_size = opts['batch_size']

    # outer loop: epochs
    for eidx in xrange(opts['max_epochs']):

        n_samples = 0  # sample counter

        # shuffle training data every epoch
        print '==> Shuffling sentences...',
        shuffle(train)
        print 'Done'

        # learning rate decay
        if eidx >= opts['lr_decay_start']:
            lr_decayed /= opts['lr_decay']

        # set pretraining mode
        if eidx in [e for e in range(m)]:
            pretrain_mode.set_value(0.)
        elif eidx in [e + m for e in range(m)]:
            lrate = .1
            pretrain_mode.set_value(1.)
        else:
            lrate = lr_decayed
            pretrain_mode.set_value(2.)
        print 'pretrain_mode = ', pretrain_mode.get_value(
        ), 'epoch = ', eidx, 'lr = ', lrate

        # training iterator
        kf_train = KFold(len(train),
                         n_folds=len(train) / (batch_size - 1),
                         shuffle=False)

        # inner loop: batches
        for _, index in kf_train:
            n_samples += len(index)
            uidx += 1

            # is_train=1 at training time
            is_train.set_value(1.)

            # get a batch
            x = [train[i] for i in index]

            # format input data
            x_f_, x_r_, x_spaces_, x_last_chars_, x_word_input_, label_words_ \
                                                      = txt_to_inps(x, char_dict, word_dict, opts)

            # compute cost
            cost = f_grad_shared(x_f_, x_r_, x_spaces_, x_last_chars_,
                                 x_word_input_, label_words_)

            # update parameters
            f_update(lrate)

            # check cost
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # display cost
            if numpy.mod(uidx, opts['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

            # save params
            if numpy.mod(uidx, opts['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(opts['saveto'],
                            history_errs=history_errs,
                            **params)
                pkl.dump(opts, open('%s.pkl' % opts['saveto'], 'wb'))
                print 'Done'

            # compute validation/test perplexity
            if numpy.mod(uidx, opts['validFreq']) == 0:
                print "Computing Dev/Test Perplexity"

                # is_train=0 at valid/test time
                is_train.set_value(0.)
                valid_err = perplexity(f_cost, valid_lines, char_dict,
                                       word_dict, opts)
                test_err = perplexity(f_cost, test_lines, char_dict, word_dict,
                                      opts)
                history_errs.append([valid_err, test_err])

                # save the best params
                if len(history_errs) > 1:
                    if uidx == 0 or valid_err <= numpy.array(
                            history_errs)[:, 0].min():
                        best_p = unzip(tparams)
                        print 'Saving best params...',
                        numpy.savez(opts['savebestto'],
                                    history_errs=history_errs,
                                    **params)
                        pkl.dump(opts, open('%s.pkl' % opts['savebestto'],
                                            'wb'))
                        print 'Done'
                        bad_counter = 0
                    if len(history_errs
                           ) > opts['patience'] and valid_err >= numpy.array(
                               history_errs)[:-opts['patience'], 0].min():
                        bad_counter += 1
                        if bad_counter > opts['patience']:
                            print 'Early Stop!'
                            estop = True
                            break

                print 'Valid ', valid_err, 'Test ', test_err

        # inner loop: end

        print 'Seen %d samples' % n_samples

        # early stopping
        if estop:
            break

    # outer loop: end

    if best_p is not None:
        zipp(best_p, tparams)

    # compute validation/test perplexity at the end of training
    is_train.set_value(0.)
    valid_err = perplexity(f_cost, valid_lines, char_dict, word_dict, opts)
    test_err = perplexity(f_cost, test_lines, char_dict, word_dict, opts)
    print 'Valid ', valid_err, 'Test ', test_err

    # save everithing
    params = copy.copy(best_p)
    numpy.savez(opts['saveto'],
                zipped_params=best_p,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **params)

    return valid_err, test_err
def train(opts):
    """ training process starts here """
    
    print '==> Training a language model'  
    if opts['model_type'] == 'gate':
        if opts['pretrain'] > 0:
            print '    [Gated Word & Char with Pretraining]'
        else:
            print '    [Gated Word & Char]'
    else:
        if opts['pretrain'] > 0:
            print '    [Concat Word & Char with Pretraining]'
        else:
            print '    [Concat Word & Char]'

    #---------------------------------------------------------
    # prepare ingredients
    #---------------------------------------------------------   

    print '==> Loading dictionaries: ',
    
    # load word dictionary
    print 'word dict,',
    if opts['word_dictionary']:
        with open(opts['word_dictionary'], 'rb') as f:
            word_dict = pkl.load(f) # word -> index 
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk     # index -> word
    
    # load character dictionary
    print 'char dict,',
    if opts['char_dictionary']:
        with open(opts['char_dictionary'], 'rb') as f:
            char_dict = pkl.load(f) # char -> index
            char_dict[opts['bos']] = len(char_dict) # add the BOS symbol 
        char_idict = dict()
        for kk, vv in char_dict.iteritems():
            char_idict[vv] = kk     # index -> char
    print 'Done'        
    
    # reload options 
    if opts['reload_'] and os.path.exists(opts['saveto']):
        with open('%s.pkl' % opts['saveto'], 'rb') as f:
            reloaded_options = pkl.load(f)
            opts.update(reloaded_options)
   
    # load training data
    train = load_data(path=opts['train_text'])
 
    # initialize params
    print '==> Building model:'
    params = init_params(opts)

    # reload parameters
    if opts['reload_'] and os.path.exists(opts['saveto']):
        params = load_params(opts['saveto'], params)

    # convert params to Theano shared variabel 
    tparams = init_tparams(params)
    
    # build computational graph 
    trng, is_train, pretrain_mode, x_f, x_r, x_spaces, x_last_chars, x_word_input, label_words, cost \
                                                                              = build_model(tparams, opts)
    inps = [x_f, x_r, x_spaces, x_last_chars, x_word_input, label_words]

    print '==> Building f_cost...',
    f_cost = theano.function(inps, cost)
    print 'Done'

    # get gradients
    print '==> Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # gradient clipping
    print 'gradient clipping...',
    grad_norm = tensor.sqrt(tensor.sum([tensor.sum(g**2.) for g in grads]))
    tau = opts['gradclip']
    grad_clipped = []
    for g in grads:
        grad_clipped.append(tensor.switch(tensor.ge(grad_norm, tau), g * tau / grad_norm, g))
    print 'Done'

    # build optimizer
    lr = tensor.scalar(name='lr')
    print '==> Building optimizers...',
    f_grad_shared, f_update = eval(opts['optimizer'])(lr, tparams, grad_clipped, inps, cost)
    print 'Done'
 
    #---------------------------------------------------------
    # start optimization
    #---------------------------------------------------------   

    print '==> Optimization:'

    # reload history
    history_errs = []
    if opts['reload_'] and os.path.exists(opts['saveto']):
        history_errs = list(numpy.load(opts['saveto'])['history_errs'])
    best_p = None
    bad_counter = 0

    # load validation and test data
    if opts['valid_text']:
        valid_lines = []
        with open(opts['valid_text'], 'r') as f:
            for l in f:
                valid_lines.append(l)
        n_valid_lines = len(valid_lines)
    if opts['test_text']:
        test_lines = []
        with open(opts['test_text'], 'r') as f:
            for l in f:
                test_lines.append(l)
        n_test_lines = len(test_lines)
    
    # initialize some values
    uidx = 0                 # update counter
    estop = False            # early stopping flag
    m = opts['pretrain']     # pretrain for m epochs using word/char only
    lrate = opts['lrate']
    lr_decayed = opts['lrate']
    batch_size = opts['batch_size']

    # outer loop: epochs
    for eidx in xrange(opts['max_epochs']):
        
        n_samples = 0  # sample counter
              
        # shuffle training data every epoch
        print '==> Shuffling sentences...',
        shuffle(train)
        print 'Done'
      
        # learning rate decay
        if eidx >= opts['lr_decay_start']:
            lr_decayed /= opts['lr_decay'] 

        # set pretraining mode
        if eidx in [e for e in range(m)]:
            pretrain_mode.set_value(0.)
        elif eidx in [e + m for e in range(m)]:
            lrate = .1
            pretrain_mode.set_value(1.)
        else:
            lrate = lr_decayed
            pretrain_mode.set_value(2.)      
        print 'pretrain_mode = ', pretrain_mode.get_value(), 'epoch = ', eidx, 'lr = ', lrate
 
        # training iterator 
        kf_train = KFold(len(train), n_folds=len(train)/(batch_size-1), shuffle=False)
  
        # inner loop: batches
        for _, index in kf_train:
            n_samples += len(index)
            uidx += 1

            # is_train=1 at training time
            is_train.set_value(1.)

            # get a batch
            x = [train[i] for i in index]
                
            # format input data
            x_f_, x_r_, x_spaces_, x_last_chars_, x_word_input_, label_words_ \
                                                      = txt_to_inps(x, char_dict, word_dict, opts) 
            
            # compute cost 
            cost = f_grad_shared(x_f_, x_r_, x_spaces_, x_last_chars_, x_word_input_, label_words_)     

            # update parameters 
            f_update(lrate)

            # check cost  
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.
  
            # display cost
            if numpy.mod(uidx, opts['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

            # save params
            if numpy.mod(uidx, opts['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(opts['saveto'], history_errs=history_errs, **params)
                pkl.dump(opts, open('%s.pkl' % opts['saveto'], 'wb'))
                print 'Done'

            # compute validation/test perplexity
            if numpy.mod(uidx, opts['validFreq']) == 0:
                print "Computing Dev/Test Perplexity"
                
                # is_train=0 at valid/test time
                is_train.set_value(0.)                  
                valid_err = perplexity(f_cost, valid_lines, char_dict, word_dict, opts)               
                test_err = perplexity(f_cost, test_lines, char_dict, word_dict, opts)
                history_errs.append([valid_err, test_err])
                
                # save the best params
                if len(history_errs) > 1:
                    if uidx == 0 or valid_err <= numpy.array(
                            history_errs)[:, 0].min():
                        best_p = unzip(tparams)
                        print 'Saving best params...',
                        numpy.savez(opts['savebestto'], history_errs=history_errs, **params)
                        pkl.dump(opts, open('%s.pkl' % opts['savebestto'], 'wb'))
                        print 'Done'
                        bad_counter = 0
                    if len(history_errs) > opts['patience'] and valid_err >= numpy.array(
                                history_errs)[:-opts['patience'], 0].min():
                        bad_counter += 1
                        if bad_counter > opts['patience']:
                            print 'Early Stop!'
                            estop = True
                            break

                print 'Valid ', valid_err, 'Test ', test_err 
   
        # inner loop: end
  
        print 'Seen %d samples' % n_samples

        # early stopping
        if estop:
            break
    
    # outer loop: end 
   
    if best_p is not None:
        zipp(best_p, tparams)
    
    # compute validation/test perplexity at the end of training
    is_train.set_value(0.)
    valid_err = perplexity(f_cost, valid_lines, char_dict, word_dict, opts)
    test_err = perplexity(f_cost, test_lines, char_dict, word_dict, opts)
    print 'Valid ', valid_err, 'Test ', test_err

    # save everithing
    params = copy.copy(best_p)
    numpy.savez(opts['saveto'], zipped_params=best_p, valid_err=valid_err, 
                test_err=test_err, history_errs=history_errs, **params)

    return valid_err, test_err