/
train_bimodal_auto_encoder.py
executable file
·119 lines (93 loc) · 6.51 KB
/
train_bimodal_auto_encoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from Models.PhonAutoEncoder import PhonAutoEncoder
from Models.BiModalAutoEncoder import BiModalAutoEncoder
from Eval.functions import plot_2D_embeddings, plot_metric, write_preds_to_file, results_to_disk
from Eval.functions import get_f1_and_classification_report
from TrainingData.data_sets.save_load_data import save_bimodal_data_set, load_text_data_set
from TrainingData.load_corpus.load_corpus_data import get_target_words
from Phonology.get_phonology_vectors import get_primary_stress_vectors
def _evaluate_and_record(embeddings, method, condition, stage, epoch, newfile=False):
    """Score `embeddings` with `method` and pass the scores on to `results_to_disk`.

    :param embeddings: word -> vector mapping handed to `get_f1_and_classification_report`
    :param method: name of the classification method, e.g. 'majority_vote', 'stratified' or '10-NN'
    :param condition: name of the training condition the results belong to
    :param stage: label of the training stage written alongside the scores
    :param epoch: epoch value (or explanatory string) written alongside the scores
    :param newfile: forwarded to `results_to_disk`; the caller passes True for the first entry of a condition
    """
    micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, method)
    results_to_disk(micro_f1, macro_f1, classification_report, epoch=epoch,
                    condition=condition, training_stage=stage, newfile=newfile)


def run():
    """Train a bimodal auto encoder for every training condition and evaluate, plot and save the results.

    A phonological auto encoder is trained once on the target vocabulary. Then, for each of the conditions
    'context', 'context_stress' and 'context_phonology', a bimodal auto encoder is built, baseline and 10-NN
    results on its input vectors are written out, the model is trained, and the resulting hidden embeddings
    are evaluated, plotted and saved as a new data set.
    """
    data_set = load_text_data_set('CDS')

    # exclude the following words, all longer than three syllables, which is not supported by the system used to
    # construct phonological feature vectors
    # NOTE(review): 'helicopter' is listed twice; kept as-is because the handling of duplicates in
    # get_target_words is not visible from here -- confirm before removing
    more_than_three_syllables = ['actually', 'anybody', 'investigator', 'helicopter', 'interesting', 'cookie_monster',
                                 'definitely', 'refrigerator', 'oh_my_goodness', 'humpty_dumpty', 'interested',
                                 'everybody', 'father_christmas', 'helicopter', 'alligator', 'caterpillar', "everybody's",
                                 'hippopotamus']
    # also exclude the empty string, which occasionally occurs in the corpus
    empty_string = ['']
    vocabulary = get_target_words(2000, tags={'n', 'v', 'adj', 'fn'},
                                  exclude_words=more_than_three_syllables + empty_string)

    # size of the hidden layer of the phonological auto encoder; the same value is passed to the bimodal auto
    # encoder as the size of the phonological feature vectors, so it is defined exactly once
    phon_hidden_size = 30

    # auto encoder for reducing the dimensionality of the phonological feature vectors
    phon_ae = PhonAutoEncoder(vocabulary=vocabulary,
                              epochs=200,
                              learning_rate=0.1,
                              n_hidden=phon_hidden_size,
                              corruption_level=0.1,
                              batch_size=1)
    phon_ae.train()

    # value written to the results file for every evaluation that happens before training
    not_trained = 'model was not trained at this stage'

    for training_condition in ['context', 'context_stress', 'context_phonology']:

        # if 'phonology' is part of the model name, get dimensionality-reduced phonological feature vectors
        if 'phonology' in training_condition:
            phon_vs = phon_ae.get_hidden_vectors()
            phon_vs_size = phon_hidden_size
        else:
            phon_vs = None
            phon_vs_size = 0

        # if 'stress' is part of the model name, get primary stress feature vectors
        if 'stress' in training_condition:
            stress_vs = get_primary_stress_vectors(vocabulary)
            stress_vs_size = 3
        else:
            stress_vs = None
            stress_vs_size = 0

        # auto encoder for projecting left-context and phonological / lexical stress feature vector into a shared
        # dimensionality-reduced space
        bm_ae = BiModalAutoEncoder(data_set=data_set,
                                   context_feature_size=2000,
                                   phon_vectors=phon_vs,
                                   phon_feature_size=phon_vs_size,
                                   stress_vectors=stress_vs,
                                   stress_feature_size=stress_vs_size,
                                   learning_rate=0.01,
                                   n_hidden=500,
                                   corruption_level=0.1,
                                   batch_size=1)

        # vectors of left-context frequencies, plus phonological and / or lexical stress features
        input_embeddings = dict(zip(bm_ae.vocabulary, bm_ae.embeddings_matrix.get_value()))

        # write predicted categories to file -- files can be used for additional processing, e.g. significance testing
        # on predictions made at two different stages in the training process
        for method, file_suffix in [('majority_vote', 'majority_vote'),
                                    ('stratified', 'stratified_sampling'),
                                    ('10-NN', '10-NN_before_stage1')]:
            write_preds_to_file(input_embeddings, method, '%s_%s' % (training_condition, file_suffix))

        # results by training stage: the majority vote baseline creates a new file, the stratified sampling baseline
        # and the 10-NN results obtained from the input vectors are appended to it (order is kept as in the
        # original script, all predictions are written before any results are computed)
        for method, stage, newfile in [('majority_vote', 'BASELINE 1: MAJORITY VOTE', True),
                                       ('stratified', 'BASELINE 2: STRATIFIED SAMPLING', False),
                                       ('10-NN', 'BEFORE STAGE 1', False)]:
            _evaluate_and_record(input_embeddings, method, training_condition, stage,
                                 epoch=not_trained, newfile=newfile)

        # train the bimodal auto encoder
        bm_ae.train()

        # get the dimensionality-reduced embeddings after training (vectors of hidden unit activation values)
        embeddings = bm_ae.embeddings_over_epochs[-1]

        # append 10-NN results obtained from hidden embeddings to file
        _evaluate_and_record(embeddings, '10-NN', training_condition, 'AFTER STAGE 1',
                             epoch=bm_ae.epochs, newfile=False)

        # plot word embeddings, reduced to two dimensions via the T-SNE algorithm
        plot_2D_embeddings(embeddings, training_condition, training_stage='after_stage_1')

        # plot micro F1 over all training epochs
        plot_metric(plot_name='%s_auto_encoder' % training_condition, plot_type='micro_f1_over_epochs',
                    ys=bm_ae.f1_over_epochs, label='micro_f1')

        # plot training error over all training epochs
        plot_metric(plot_name='%s_auto_encoder' % training_condition, plot_type='training_error_over_epochs',
                    ys=bm_ae.training_error_over_epochs, label='training_error')

        # create and save a new data set with the new embeddings -- for further training with the softmax model
        save_bimodal_data_set(bm_ae, data_set_name=training_condition, embeddings_dict=embeddings)
# script entry point: start the training pipeline only when this file is executed directly, so that importing
# the module does not trigger training
if __name__ == '__main__':
    run()