Example n. 1
from models import NCELangModel
import os, re
import logging
import numpy as np

logging.basicConfig(level=logging.DEBUG)

trn_regex = re.compile(r'\d{3}\.bz2')  # training shards named like 012.bz2
dir_ = 'data/fake/test'
train_files = [
    os.path.join(dir_, f) for f in os.listdir(dir_) if trn_regex.match(f)
]
X = np.loadtxt(train_files[0], dtype='int32')  # integer token ids

model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128)
ins, _ = model.prepare_input(X, 0, None)
data = {model.input['idxes']: ins[0]}
model.compile()
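
# A minimal sketch (not the repo's get_unigram_probtable) of how an NCE
# noise table over token ids could be built: unigram counts with the
# word2vec-style 0.75 power smoothing. The helper name and the smoothing
# exponent are illustrative assumptions.
def build_unigram_table(tokens, vocab_size, power=0.75):
    counts = np.bincount(np.ravel(tokens), minlength=vocab_size).astype('float64')
    smoothed = counts ** power
    return smoothed / smoothed.sum()

# e.g. negprob_table = build_unigram_table(X, vocab_size=15)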
Example n. 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Yunchuan Chen'

from utils import get_unigram_probtable
from models import NCELangModel
from keras.optimizers import adam

NB_RUN_WORDS = 100000000   # words to train on (passed as train_nb_words)
NB_VOCAB = 10000           # vocabulary size
NB_RUN_VAL = 100000        # passed to train() as train_val_nb
NB_EVALUATE = 5000000      # words used for validation (passed as val_nb_words)
SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01.pkl'

DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'
BATCH_SIZE = 256
VAL_INTER = 1200           # passed to train() as validation_interval

unigram_table = get_unigram_probtable(nb_words=NB_VOCAB)

opt = adam(lr=0.01)
model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128,
                     negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH,
            save_path=SAVE_PATH,
            batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS,
            val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER)
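
# For reference, the objective NCE-based models optimize (Mnih & Teh,
# 2012); a generic numpy sketch, not the repo's implementation.
# s_pos/s_neg are unnormalized model scores for the true and sampled
# words, pn_* their probabilities under unigram_table, k = nb_negative.
import numpy as np

def nce_loss(s_pos, pn_pos, s_neg, pn_neg, k):
    def log_sigmoid(x):
        return -np.logaddexp(0.0, -x)  # numerically stable log(sigmoid(x))
    pos = log_sigmoid(s_pos - np.log(k * pn_pos))          # true words as "data"
    neg = log_sigmoid(np.log(k * pn_neg) - s_neg).sum(-1)  # samples as "noise"
    return -(pos + neg).mean()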
Example n. 3
if options.train_simple:  # guard reconstructed; flag name assumed, by analogy with train_nce below
    logging.info('Train simple language model')
    model = SimpleLangModel(vocab_size=15,
                            embed_dims=128,
                            context_dims=128,
                            optimizer=options.optimizer)
    model.compile()
    model.train_from_dir(data_path,
                         validation_split=0.05,
                         batch_size=options.batch_size,
                         verbose=options.verbose)

if options.train_nce:
    logging.info('Train NCE based language model')
    model = NCELangModel(vocab_size=15,
                         nb_negative=2,
                         embed_dims=128,
                         negprob_table=negprob_table,
                         optimizer=options.optimizer)
    model.compile()
    logging.debug('compile success')
    model.train_from_dir(data_path,
                         validation_split=0.05,
                         batch_size=options.batch_size,
                         verbose=options.verbose)

if options.train_nce1:
    logging.info('Train NCE based language model (1)')
    model = NCELangModelV1(vocab_size=15,
                           nb_negative=6,
                           embed_dims=128,
                           negprob_table=negprob_table,
                           # tail assumed: mirrors the train_nce branch above
                           optimizer=options.optimizer)
    model.compile()
    model.train_from_dir(data_path, validation_split=0.05,
                         batch_size=options.batch_size, verbose=options.verbose)
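
# For contrast with NCE: a full-softmax cross-entropy over the whole
# vocabulary, which is presumably what SimpleLangModel pays per step and
# what the NCE variants above avoid. Generic numpy sketch, not the
# repo's implementation.
import numpy as np

def softmax_xent(scores, targets):
    # scores: (batch, vocab_size) unnormalized; targets: (batch,) token ids
    z = scores - scores.max(axis=1, keepdims=True)
    log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(targets)), targets].mean()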
Example n. 4
if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
    opt = adam(lr=options.lr)

if options.log_file == '':
    log_file = None
else:
    log_file = options.log_file

if options.save == '':
    save_path = None
else:
    save_path = options.save

model = NCELangModel(vocab_size=nb_vocab,
                     nb_negative=options.negative,
                     embed_dims=options.embed_size,
                     context_dims=options.context_size,
                     negprob_table=unigram_table,
                     optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH,
            save_path=save_path,
            batch_size=BATCH_SIZE,
            train_nb_words=nb_run_words,
            val_nb_words=nb_evaluate,
            train_val_nb=nb_run_val,
            validation_interval=options.interval,
            log_file=log_file)
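
# AdamAnneal's signature (lr, lr_min, gamma) suggests a decaying
# learning rate floored at lr_min; the actual schedule is not shown in
# these snippets. One plausible rule it might implement:
def annealed_lr(lr0, lr_min, gamma, step):
    return max(lr_min, lr0 / (1.0 + gamma * step))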
Example n. 5
nb_evaluate = options.nb_evaluation

# unigram_table = get_unigram_probtable(nb_words=nb_vocab)
unigram_table = get_unigram_probtable(nb_words=nb_vocab,
                                      save_path='../data/wiki-unigram-prob-size%d.pkl' %
                                                nb_vocab)

if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
    opt = adam(lr=options.lr)

if options.log_file == '':
    log_file = None
else:
    log_file = options.log_file

if options.save == '':
    save_path = None
else:
    save_path = options.save

model = NCELangModel(vocab_size=nb_vocab, nb_negative=options.negative, 
                     embed_dims=options.embed_size, context_dims=options.context_size,
                     negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH,
            save_path=save_path,
            batch_size=BATCH_SIZE, train_nb_words=nb_run_words,
            val_nb_words=nb_evaluate, train_val_nb=nb_run_val,
            validation_interval=options.interval, log_file=log_file)
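
# The val_nb_words/validation_interval arguments imply periodic held-out
# evaluation; the standard language-model metric is perplexity, the
# exponentiated mean negative log-likelihood (generic formula, not the
# repo's code).
import numpy as np

def perplexity(neg_log_likelihoods):
    return float(np.exp(np.mean(neg_log_likelihoods)))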