Beispiel #1
0
    def evaluate(self, ptb_path='PTB'):
        """Train a model on PTB, predict on all three splits, and print scores."""
        (train_data, dev_data, test_data,
         word_to_id, id_to_word) = load_dataset(ptb_path)

        print(datetime_str(), 'Training model ...')
        model = train(train_data, word_to_id, id_to_word)
        print(datetime_str(), 'Training model finished.')
        unigram_probs = self.train_unigram_model(train_data, word_to_id)

        # First row is the CSV-style header for the predictions table.
        header = ['prev', 'pred1', 'pred2', 'pred3',
                  'true_prob', 'true_rank', 'kl_uniform', 'kl_unigram']
        allpreds = [header]

        print(datetime_str(), 'Testing model ...')
        for split_data, split_name in ((train_data, 'train'),
                                       (dev_data, 'valid'),
                                       (test_data, 'test')):
            allpreds.extend(self.predict_probs(
                model, id_to_word, split_data, unigram_probs, split_name))
        print(datetime_str(), 'Testing model finished.')

        save_preds(allpreds, preds_fname=PREDS_FNAME)

        scores = score_preds(PREDS_FNAME, ptb_path)

        # Pretty-print every metric for every scoring method.
        for method_name, method_scores in scores.items():
            for metric, value in method_scores.items():
                print('{} {}: {}'.format(method_name.capitalize(), metric, value))
            print()
Beispiel #2
0
def main():
    """Parse CLI arguments, train on PTB, score predictions on every split."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--ptb_path', default='PTB', help='Path to PTB data')
    args = parser.parse_args()

    (train_data, dev_data, test_data,
     word_to_id, id_to_word) = load_dataset(args.ptb_path)

    model = train(train_data, word_to_id, id_to_word)
    unigram_probs = train_unigram_model(train_data, word_to_id)

    # First row is the header of the predictions table.
    header = ['prev', 'pred1', 'pred2', 'pred3', 'true_prob', 'true_rank',
              'kl_uniform', 'kl_unigram']
    allpreds = [header]
    for split_data, split_name in ((train_data, 'train'),
                                   (dev_data, 'valid'),
                                   (test_data, 'test')):
        allpreds.extend(predict_probs(model, id_to_word, split_data,
                                      unigram_probs, split_name))

    save_preds(allpreds, preds_fname=PREDS_FNAME)

    scores = score_preds(PREDS_FNAME, args.ptb_path)

    # Pretty-print every metric for every scoring method.
    for method_name, method_scores in scores.items():
        for metric, value in method_scores.items():
            print('{} {}: {}'.format(method_name.capitalize(), metric, value))
        print()
Beispiel #3
0
    def sampling(self, size,
                 ptb_path='PTB',
                 start_text=None,
                 batch_size=20,
                 temperature=1.0,
                 pretrained_model=None):
        """
            Sample text from a language model trained (or loaded) on PTB.

            Parameters:
                size (int): - number of generated tokens
                ptb_path (str): - path to the PTB data directory
                start_text (str): - list of tokens separated by a space
                batch_size (int): - number of sequences sampled in parallel
                temperature (float): - softmax temperature used when sampling
                pretrained_model (str): - path to pretrained model

            Returns:
                generated text (str), one sampled sequence per line
        """

        raw_data = load_dataset(ptb_path)
        train_data, dev_data, test_data, word_to_id, id_to_word = raw_data

        if pretrained_model is None:
            print(datetime_str(), 'Training model ...')
            model = train(train_data, word_to_id, id_to_word)
            print(datetime_str(), 'Training model finished.')
        else:
            # Reuse saved weights instead of training from scratch.
            model = train(train_data, word_to_id, id_to_word,
                          pretrained_model=pretrained_model)

        unigram_probs = self.train_unigram_model(train_data, word_to_id)

        generated_texts = ancestral_sampling_batch(model,
                                                   unigram_probs,
                                                   word_to_id,
                                                   id_to_word,
                                                   size=size,
                                                   batch_size=batch_size,
                                                   start_text=start_text,
                                                   temperature=temperature)

        text = '\n'.join(generated_texts)
        print(text)
        # Fix: the docstring promised a str return value but the original
        # returned None; returning here is backward-compatible for callers.
        return text
Beispiel #4
0
def main(job_id, params):
    """Tuner entry point: train one model with the given hyperparameters.

    Parameters:
        job_id: identifier assigned by the hyperparameter tuner (unused).
        params (dict): hyperparameter values, each wrapped in a one-element list.

    Returns:
        validation error, used by the tuner as the objective to minimize.
    """
    # Fix: Python 2 `print params` statement is a syntax error under Python 3;
    # use the print() function, consistent with the rest of the file.
    print(params)
    trainerr, validerr, testerr = train(saveto=params['model'][0],
                                        reload_=params['reload'][0],
                                        dim_word=params['dim_word'][0],
                                        dim=params['dim'][0],
                                        n_words=params['n-words'][0],
                                        decay_c=params['decay-c'][0],
                                        lrate=params['learning-rate'][0],
                                        optimizer=params['optimizer'][0],
                                        maxlen=30,
                                        batch_size=32,
                                        valid_batch_size=16,
                                        validFreq=5000,
                                        dispFreq=10,
                                        saveFreq=1000,
                                        sampleFreq=10,
                                        use_dropout=params['use-dropout'][0])
    return validerr
Beispiel #5
0
def main(job_id, params):
    """Tuner entry point: train a staff-data LM with the given hyperparameters.

    Parameters:
        job_id: identifier assigned by the hyperparameter tuner (unused).
        params (dict): hyperparameter values, each wrapped in a one-element list.

    Returns:
        validation error, used by the tuner as the objective to minimize.
    """
    # Fix: Python 2 `print params` statement is a syntax error under Python 3;
    # use the print() function, consistent with the rest of the file.
    print(params)
    validerr = train(
        saveto=params['model'][0],
        reload_=params['reload'][0],
        dim_word=params['dim_word'][0],
        dim=params['dim'][0],
        n_words=params['n-words'][0],
        decay_c=params['decay-c'][0],
        lrate=params['learning-rate'][0],
        optimizer=params['optimizer'][0],
        maxlen=30,
        batch_size=32,
        valid_batch_size=16,
        validFreq=5000,
        dispFreq=10,
        saveFreq=1000,
        sampleFreq=1000,
        dataset='staff.data.train.tok',
        valid_dataset='staff.data.valid.tok',
        dictionary='staff.data.train.pkl',
        use_dropout=params['use-dropout'][0])
    return validerr
Beispiel #6
0
def main(job_id, params):
    """Tuner entry point: train a PTB LM with the given hyperparameters.

    Parameters:
        job_id: identifier assigned by the hyperparameter tuner (unused).
        params (dict): hyperparameter values, each wrapped in a one-element list.

    Returns:
        validation error, used by the tuner as the objective to minimize.
    """
    # Fix: Python 2 `print params` statement is a syntax error under Python 3;
    # use the print() function, consistent with the rest of the file.
    print(params)
    validerr = train(
        saveto=params['model'][0],
        reload_=params['reload'][0],
        dim_word=params['dim_word'][0],
        dim=params['dim'][0],
        n_words=params['n-words'][0],
        decay_c=params['decay-c'][0],
        lrate=params['learning-rate'][0],
        optimizer=params['optimizer'][0],
        maxlen=30,
        batch_size=32,
        valid_batch_size=16,
        validFreq=5000,
        dispFreq=10,
        saveFreq=1000,
        sampleFreq=5,
        dataset='/data/lisatmp4/anirudhg/ptb/train.txt',
        valid_dataset='/data/lisatmp4/anirudhg/ptb/valid.txt',
        dictionary='/data/lisatmp4/anirudhg/ptb/dictionary_pbt.pkl',
        use_dropout=params['use-dropout'][0])
    return validerr
Beispiel #7
0
def main(job_id, params):
    """Tuner entry point: train a wiki-corpus LM with the given hyperparameters.

    Parameters:
        job_id: identifier assigned by the hyperparameter tuner (unused).
        params (dict): hyperparameter values, each wrapped in a one-element list.

    Returns:
        validation error, used by the tuner as the objective to minimize.
    """
    # Fix: Python 2 `print params` statement is a syntax error under Python 3;
    # use the print() function, consistent with the rest of the file.
    print(params)
    validerr = train(
        saveto=params['model'][0],
        reload_=params['reload'][0],
        dim_word=params['dim_word'][0],
        dim=params['dim'][0],
        n_words=params['n-words'][0],
        decay_c=params['decay-c'][0],
        lrate=params['learning-rate'][0],
        optimizer=params['optimizer'][0],
        maxlen=30,
        batch_size=32,
        valid_batch_size=16,
        validFreq=5000,
        dispFreq=10,
        saveFreq=1000,
        sampleFreq=1000,
        dataset='/ichec/work/dl4mt_data/nec_files/wiki.tok.txt.gz',
        valid_dataset='/ichec/work/dl4mt_data/nec_files/newstest2011.en.tok',
        dictionary='/ichec/work/dl4mt_data/nec_files/wiki.tok.txt.gz.pkl',
        use_dropout=params['use-dropout'][0])
    return validerr
Beispiel #8
0
import argparse
import logging
import pprint

import config_lm

from lm import train

logger = logging.getLogger(__name__)


if __name__ == '__main__':
    # Get the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--proto", help="Prototype config to use for model configuration")
    # Fix: help text was copy-pasted from --proto; this is a boolean switch.
    parser.add_argument("--platoon", action='store_true', help="Enable platoon mode")
    args = parser.parse_args()

    # Fix: without a configured handler, the logger.info() call below
    # would emit nothing.
    logging.basicConfig(level=logging.INFO)

    # Look up the named prototype function in config_lm and call it to
    # build the configuration dict, then record the CLI choices in it.
    config = getattr(config_lm, args.proto)()
    config['platoon'] = args.platoon
    config['name'] = args.proto
    logger.info("Model options:\n{}".format(pprint.pformat(config)))
    train(**config)
Beispiel #9
0
import numpy

from lm import train

# Hyperparameters and data paths for the Finnish morph 10k LM run,
# gathered in one place and unpacked into the training call.
_settings = dict(
    train_path='../data/morph/finnish.clean.train10k',
    validation_path='../data/morph/finnish.clean.test',
    dictionary_path='../data/morph/morph.vocab',
    model_path='/l/senarvi/theano-rnnlm/model-train10k-lstm.npz',
    reload_state=True,
    dim_word=256,
    dim=1024,
    n_words=30000,
    decay_c=0.,
    lrate=0.0001,
    optimizer='adam',
    maxlen=30,
    batch_size=32,
    valid_batch_size=16,
    validFreq=5000,
    dispFreq=20,
    saveFreq=40,
    sampleFreq=20,
)
trainerr, validerr, testerr = train(**_settings)
print(validerr)
Beispiel #10
0
import numpy

from lm import train

# Training configuration for the 10k-sentence Finnish morph corpus;
# built as a plain dict and expanded into the train() call.
_train_kwargs = {
    'train_path': '../data/morph/finnish.clean.train10k',
    'validation_path': '../data/morph/finnish.clean.test',
    'dictionary_path': '../data/morph/morph.vocab',
    'model_path': '/l/senarvi/theano-rnnlm/model-train10k-lstm.npz',
    'reload_state': True,
    'dim_word': 256,
    'dim': 1024,
    'n_words': 30000,
    'decay_c': 0.,
    'lrate': 0.0001,
    'optimizer': 'adam',
    'maxlen': 30,
    'batch_size': 32,
    'valid_batch_size': 16,
    'validFreq': 5000,
    'dispFreq': 20,
    'saveFreq': 40,
    'sampleFreq': 20,
}
trainerr, validerr, testerr = train(**_train_kwargs)
print(validerr)