Example #1
# NOTE: this excerpt is truncated; the definition of work_dir and the imports
# of Normalization, NormalizationIter and LanguageModel appear earlier in the
# original script.
import os
import nltk

if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# get text corpus
nltk.download('brown')
sents = nltk.corpus.brown.sents()[:100]

# preprocessing
normalizer = Normalization(sents, min_count=15)
training_data = NormalizationIter(normalizer, sents)
lm = LanguageModel(tokenized_sentences=training_data,
                   input_layer_size=64,
                   hidden_layer_size=128)
print()

# train model
lm.train(training_data, epochs=5, backup_directory=work_dir, log_interval=20)
print()

# test trained model
normalized_sentence = normalizer.normalize(sents[0])
print('normalized sentence:')
print(' '.join(normalized_sentence))
print('probability: ', lm.sentence_log_probability(normalized_sentence))
print()
start_tag = normalized_sentence[0]
end_tag = normalized_sentence[-1]
print('sample:')
print(' '.join(lm.sample([start_tag], end_tag=end_tag)))
print()

# save, load and test loaded model
lm_file = os.path.join(work_dir, 'test_model.bin')
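# The rest of this example is cut off in the excerpt. A minimal sketch of the
# save/load step, assuming the save() and lm_file constructor API shown in
# Example #3 (lm_loaded is a hypothetical name):
lm.save(lm_file)
lm_loaded = LanguageModel(lm_file=lm_file)
print('loaded model probability: ',
      lm_loaded.sentence_log_probability(normalized_sentence))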
Example #3
# (all_sents_normalized, cs1 and cs2 are defined earlier in the original
# script; this excerpt starts mid-way through it)
cs3 = ['well', 'good']

acs = AdvancedCorpusSplitter(all_sents_normalized, cs1 + cs2 + cs3)

# load a previously saved model if one exists; otherwise train a new one and save it
if os.path.isfile(lm_file):
    lm = LanguageModel(lm_file=lm_file)

else:
    lm = LanguageModel(verbose=True,
                       tokenized_sentences=acs,
                       input_layer_size=128,
                       hidden_layer_size=512)

    cost_log = lm.train(acs,
                        epochs=10,
                        backup_directory=work_dir,
                        return_cost=True,
                        log_interval=1000)
    lm.save(lm_file)
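    # cost_log is assumed here to be a sequence of logged training costs
    # (returned because return_cost=True); printing the last entry gives a
    # quick convergence check
    print('last logged cost:', list(cost_log)[-1])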

# sampling

print()
print('sampling...')

def print_samples(language_model, start_tokens, end_tag, num_samples, temperatures):
    for t in temperatures:
        print('temperature: ', t)
        for i in range(num_samples):
            # drop the first and last tokens (the start/end boundary tags)
            print(' '.join(language_model.sample(start_tokens, end_tag, temperature=t)[1:-1]))
        print()
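
# Hypothetical usage of print_samples: the start/end boundary tags would come
# from the normalizer (as start_tag/end_tag in Example #1), and the
# temperature values here are only illustrative.
print_samples(lm, [start_tag], end_tag, num_samples=3, temperatures=[0.5, 1.0, 1.5])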