Example #1
valid_data_dir = '/mnt/raid1/billion-word-corpus/1-billion-word-language-modeling-benchmark/heldout-monolingual.tokenized.shuffled/'
save_dir = '/home/ab455/language-model/checkpoints/'
num_words = None

seq_len = 25
batch_size = 256
valid_batch_size = 16 ## Needs to be smaller due to memory issues
embed_size = 128
num_epochs = 20
hidden_size = 256
num_layers = 2

dataset = Dataset(data_dir, num_words)  # data_dir is defined before this excerpt
dataset.set_batch_size(batch_size)
dataset.set_seq_len(seq_len)
dataset.save('./checkpoints_large/')

params = {}
params['vocab_size'] = dataset.vocab_size
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size

model = LargeLanguageModel(params)
model.compile()
eval_softmax = 5
for epoch in range(num_epochs):
    dataset.set_data_dir(data_dir)
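
The snippet breaks off inside the epoch loop. Below is a minimal sketch of how such a loop commonly continues; the train_epoch, evaluate and save calls are hypothetical names used for illustration and are not part of the LargeLanguageModel API shown above.

for epoch in range(num_epochs):
    dataset.set_data_dir(data_dir)            # point the dataset at the training shards
    dataset.set_batch_size(batch_size)
    train_loss = model.train_epoch(dataset)   # hypothetical: one full pass over the training data

    dataset.set_data_dir(valid_data_dir)      # switch to the held-out shards
    dataset.set_batch_size(valid_batch_size)  # smaller batches to avoid running out of memory
    valid_loss = model.evaluate(dataset)      # hypothetical: loss/perplexity on held-out data

    model.save(save_dir)                      # hypothetical: checkpoint after every epoch
    print('epoch %d: train %.3f, valid %.3f' % (epoch, train_loss, valid_loss))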
Example #2
import os

os.makedirs(directoryOutLogs)  # directoryOutLogs is defined before this excerpt

num_words = None

seq_len = 25
batch_size = 16
valid_batch_size = 16  ## Needs to be smaller due to memory issues
embed_size = 64
num_epochs = 20
hidden_size = 64
num_layers = 1

dataset = Dataset(data_dir, num_words)
dataset.set_batch_size(batch_size)
dataset.set_seq_len(seq_len)
dataset.save(dataset_specific_info)

params = {}

# add one to the vocabulary size to account for the 0 token used for padding
params['vocab_size'] = dataset.vocab_size + 1
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['valid_batch_size'] = valid_batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size
params['directoryOutLogs'] = directoryOutLogs

model = LanguageModel(params)
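
The + 1 on vocab_size leaves index 0 free as a padding token. A generic Keras illustration of that convention is shown below; the original LanguageModel may build its embedding differently, this only shows why the extra row is needed.

import tensorflow as tf

# params['vocab_size'] already includes the extra row for the 0 padding token,
# and mask_zero tells downstream layers to ignore padded timesteps.
embedding = tf.keras.layers.Embedding(
    input_dim=params['vocab_size'],
    output_dim=params['embed_size'],
    mask_zero=True,
)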
Example #3
valid_data_dir = '/mnt/raid1/billion-word-corpus/1-billion-word-language-modeling-benchmark/heldout-monolingual.tokenized.shuffled/'
save_dir = '/home/ab455/language-model/checkpoints/'
num_words = None

seq_len = 25
batch_size = 192
valid_batch_size = 16  ## Needs to be smaller due to memory issues
embed_size = 128
num_epochs = 20
hidden_size = 256
num_layers = 1

dataset = Dataset(data_dir, num_words)
dataset.set_batch_size(batch_size)
dataset.set_seq_len(seq_len)
dataset.save('./checkpoints/')

params = {}
params['vocab_size'] = dataset.vocab_size
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size

model = LanguageModel(params)
model.compile()
eval_softmax = 5
for epoch in range(num_epochs):
    dataset.set_data_dir(data_dir)
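
For a rough sense of model size, the parameter count for this configuration can be estimated by hand, assuming the recurrent layer is a standard LSTM (the snippet does not say which cell LanguageModel uses) and using a placeholder vocabulary size, since the real value of dataset.vocab_size depends on the corpus.

vocab_size = 100_000  # placeholder only; the real value comes from dataset.vocab_size

embedding_params = vocab_size * embed_size                                  # lookup table
lstm_params = 4 * (hidden_size * (embed_size + hidden_size) + hidden_size)  # 4 gates, weights + biases
output_params = hidden_size * vocab_size + vocab_size                       # projection back to the vocabulary

total = embedding_params + lstm_params + output_params
print('%s parameters' % format(total, ','))  # roughly 38.9M with the placeholder vocabulary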