Example #1
0
    # NOTE(review): this is the interior of a `for t in [...]:` loop whose
    # header lies above this excerpt — `t` is the sampling temperature.
    print('temperature: ', t)
    print()
    # Draw 20 samples at this temperature; `resample` presumably retries up to
    # `trials` times to produce a sample of at least `min_tokens` tokens.
    for i in range(20):
        print(
            resample(embeddings,
                     h_size,
                     min_tokens=min_tokens,
                     trials=100,
                     t=t))
    print()

# apply lstm language model to confusion_set

print('apply lstm language model to confusion set...')
print()

# Build one cross-validation set and one test set per token of the confusion
# pair, in the same order the tokens appear in `confusion_set`.
cv_sets = [list(helper.acs(sents, preserve_tokens, cv_token=token))
           for token in confusion_set[:2]]
test_sets = [list(helper.acs(sents, preserve_tokens, test_token=token))
             for token in confusion_set[:2]]
cv_0, cv_1 = cv_sets
test_0, test_1 = test_sets

# Score both splits with the language model.
cv_predictions = lm_predictions(cv_0, cv_1, confusion_set, embeddings)
test_predictions = lm_predictions(test_0, test_1, confusion_set, embeddings)

# find best temperature and threshold combination and print results

lm_results(cv_predictions, test_predictions, cv_0, cv_1, test_0, test_1,
           embeddings, confusion_set)

print()
print('done')
Example #2
0
    # Give up after `trials` unsuccessful attempts and return a sentinel
    # string in place of a sample (function header lies above this excerpt).
    return 'NO SAMPLE IN %d STEPS' % trials

# training

print('start training...')
print()
log_steps = 100   # report mean training error every `log_steps` iterations
save_steps = 100  # NOTE(review): unused in this excerpt — presumably drives checkpointing below the cut

weights_changed = False  # becomes True once any gradient step has been applied

for e in range(max_epochs):
    # Resume support: skip epochs that were already completed in a prior run.
    if e < start_epoch:
        continue
    error = 0
    for i, (inp, tar) in enumerate(helper.char_sequence_generator(helper.acs(sents, preserve_tokens), embeddings)):

        # Resume support: within the restart epoch, skip iterations already done.
        if e == start_epoch and i < start_iteration:
            continue

        cost = train(inp, tar)
        error += cost
        weights_changed = True

        if (i+1) % log_steps == 0:
            # Report the average error over the last `log_steps` iterations
            # plus a sample from the model, then reset the accumulator.
            error /= log_steps
            errors.append(error)
            print('epoch: %d\titerations: %d\terror: %f' %(e, (i+1), error))
            print(resample(embeddings, h_size))
            print()
            error = 0
Example #3
0
# training

print('start training...')
print()
log_steps = 100   # report mean training error every `log_steps` iterations
save_steps = 100  # NOTE(review): unused in this excerpt — presumably drives checkpointing below the cut

weights_changed = False  # becomes True once any gradient step has been applied

for e in range(max_epochs):
    # Resume support: skip epochs that were already completed in a prior run.
    if e < start_epoch:
        continue
    error = 0
    for i, (inp, tar) in enumerate(
            helper.char_sequence_generator(helper.acs(sents, preserve_tokens),
                                           embeddings)):

        # Resume support: within the restart epoch, skip iterations already done.
        if e == start_epoch and i < start_iteration:
            continue

        cost = train(inp, tar)
        error += cost
        weights_changed = True

        if (i + 1) % log_steps == 0:
            # Report the average error over the last `log_steps` iterations
            # plus a sample from the model.
            # NOTE(review): excerpt ends here — the `error = 0` reset
            # presumably follows below the cut.
            error /= log_steps
            errors.append(error)
            print('epoch: %d\titerations: %d\terror: %f' % (e, (i + 1), error))
            print(resample(embeddings, h_size))
            print()
Example #4
0
# sampling demo: draw text from the trained model at several temperatures

min_tokens = 5  # discard samples shorter than this many tokens

print('generate samples')  # fixed typo in output: was 'genrate samples'
print('minimum number of tokens per sample: ', min_tokens)
print()
for t in [0.8, 1.0, 1.2]:  # low temperature = conservative, high = diverse
    print('temperature: ', t)
    print()
    # Draw 20 samples per temperature; `resample` presumably retries up to
    # `trials` times to produce a sample of at least `min_tokens` tokens.
    for i in range(20):
        print(resample(embeddings, h_size, min_tokens=min_tokens, trials=100, t=t))
    print()
# apply lstm language model to confusion_set

print('apply lstm language model to confusion set...')
print()

# Materialize a cross-validation split and a test split for each of the two
# confusion-set tokens, preserving the original construction order.
cv_0, cv_1 = (list(helper.acs(sents, preserve_tokens, cv_token=token))
              for token in (confusion_set[0], confusion_set[1]))
test_0, test_1 = (list(helper.acs(sents, preserve_tokens, test_token=token))
                  for token in (confusion_set[0], confusion_set[1]))

# Score both splits with the language model.
cv_predictions = lm_predictions(cv_0, cv_1, confusion_set, embeddings)
test_predictions = lm_predictions(test_0, test_1, confusion_set, embeddings)

# find best temperature and threshold combination and print results

lm_results(cv_predictions, test_predictions, cv_0, cv_1, test_0, test_1, embeddings, confusion_set)

print()
print('done')
Example #5
0
# minimum occurrence of tokens in training data
# tokens with fewer occurrences will be substituted to 'U' for unknown
# 'U' can also serve as substitute for unseen tokens at test time
min_occurrence = 20

### END SETTINGS ###

# init

# Create the working directory; exist_ok=True avoids the check-then-create
# race of the original `os.path.exists` guard and is a no-op when it exists.
os.makedirs(work_dir, exist_ok=True)

# Each corpus line holds whitespace-separated token|annotation pairs; keep
# only the normalized, lower-cased token part of each pair.
# NOTE(review): file is opened with the platform default encoding — confirm
# the corpus matches it or pass encoding= explicitly.
with open(corpus_file) as f:
    sents = [[helper.normalization(twp.split('|')[0].lower()) for twp in line.split()]
             for line in f]

train_sents = list(helper.acs(sents, preserve_tokens))
token_embeddings = helper.TokenEmbeddings(train_sents, min_occurrence)

# Resume from a previous run when a checkpoint is identified.
# NOTE(review): this truthiness test also treats epoch 0 / iteration 0 as
# "no checkpoint" — confirm a resume at iteration 0 is never required.
if timestamp and start_epoch and start_iteration:
    errors = helper.load_errors(
        '%s-%d-%d.errors' % (timestamp, start_epoch, start_iteration), work_dir)
    load_weights = '%s-%d-%d.weights' % (timestamp, start_epoch, start_iteration)
    print('init previous states...')
    print('timestamp: ', timestamp)
    print('start_epoch: ', start_epoch)
    print('start_iteration: ', start_iteration)
else:
    # Fresh run: empty error history, start from scratch, new run timestamp.
    errors = []
    start_epoch = 0
    start_iteration = 0
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    load_weights = None