from TrainingData.load_corpus.load_corpus_data import get_target_words

# the remaining helpers used below (load_text_data_set, PhonAutoEncoder,
# BiModalAutoEncoder, write_preds_to_file, get_f1_and_classification_report,
# results_to_disk, plot_2D_embeddings, plot_metric, save_bimodal_data_set) are
# project-internal; their module paths are not shown in this file


def run():

    data_set = load_text_data_set("CDS")

    # exclude the following words, all longer than three syllables; more than three syllables is not supported by
    # the system used to construct phonological feature vectors
    more_than_three_syllables = [
        "actually",
        "anybody",
        "investigator",
        "helicopter",
        "interesting",
        "cookie_monster",
        "definitely",
        "refrigerator",
        "oh_my_goodness",
        "humpty_dumpty",
        "interested",
        "everybody",
        "father_christmas",
        "alligator",
        "caterpillar",
        "everybody's",
        "hippopotamus",
    ]

    # also exclude the empty string, which occasionally occurs in the corpus
    empty_string = [""]

    vocabulary = get_target_words(
        2000, tags={"n", "v", "adj", "fn"}, exclude_words=more_than_three_syllables + empty_string
    )
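
    # quick sanity check (illustrative; the 'word-pos_tag' entry format is assumed
    # from the extraction script below):
    assert all(w and "-" in w for w in vocabulary), "expected non-empty 'word-pos_tag' entries"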

    # auto encoder for reducing the dimensionality of the phonological feature vectors
    phon_ae = PhonAutoEncoder(
        vocabulary=vocabulary, epochs=200, learning_rate=0.1, n_hidden=30, corruption_level=0.1, batch_size=1
    )

    phon_ae.train()
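
    # note on corruption_level=0.1: in the usual denoising-auto-encoder recipe, each
    # input unit is zeroed with probability 0.1 before reconstruction; a numpy
    # stand-in for that step (illustrative, not PhonAutoEncoder's internals):
    #     corrupted = x * (np.random.rand(*x.shape) > 0.1)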

    for training_condition in ["context", "context_stress", "context_phonology"]:

        # if 'phonology' is part of the training condition, get dimensionality-reduced phonological feature vectors
        if "phonology" in training_condition:
            phon_vs = phon_ae.get_hidden_vectors()
            phon_vs_size = 30
        else:
            phon_vs = None
            phon_vs_size = 0

        # if 'stress' is part of the training condition, get dimensionality-reduced lexical stress feature vectors
        if "stress" in training_condition:
            stress_vs = get_primary_stress_vectors(vocabulary)
            stress_vs_size = 3
        else:
            stress_vs = None
            stress_vs_size = 0
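
        # stress_vs_size=3 matches the three-syllable cap enforced above: one unit per
        # syllable position, so a primary-stress vector is presumably one-hot, e.g.
        # stress on the second syllable of a trisyllabic word -> [0, 1, 0]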

        # auto encoder for projecting left-context and phonological / lexical stress feature vectors into a shared
        # dimensionality-reduced space
        bm_ae = BiModalAutoEncoder(
            data_set=data_set,
            context_feature_size=2000,
            phon_vectors=phon_vs,
            phon_feature_size=phon_vs_size,
            stress_vectors=stress_vs,
            stress_feature_size=stress_vs_size,
            learning_rate=0.01,
            n_hidden=500,
            corruption_level=0.1,
            batch_size=1,
        )
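
        # the concatenated input dimensionality per condition therefore works out to:
        #     context:           2000 + 0  + 0 = 2000
        #     context_stress:    2000 + 0  + 3 = 2003
        #     context_phonology: 2000 + 30 + 0 = 2030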

        # vectors of left-context frequencies, plus phonological and / or lexical stress features
        input_embeddings = dict(zip(bm_ae.vocabulary, bm_ae.embeddings_matrix.get_value()))

        # write predicted categories to file -- files can be used for additional processing, e.g. significance testing
        # on predictions made at two different stages in the training process
        write_preds_to_file(input_embeddings, "majority_vote", "%s_majority_vote" % training_condition)
        write_preds_to_file(input_embeddings, "stratified", "%s_stratified_sampling" % training_condition)
        write_preds_to_file(input_embeddings, "10-NN", "%s_10-NN_before_stage1" % training_condition)

        # create a new file with results by training stage -- write majority vote baseline results to this file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "majority_vote")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BASELINE 1: MAJORITY VOTE",
            newfile=True,
        )
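
        # for reference: majority vote always predicts the most frequent tag; given the
        # vocabulary composition noted in the extraction script below (1010 nouns out of
        # 2000 words), that corresponds to a micro F1 of roughly 1010 / 2000 = 0.505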

        # append stratified sampling baseline results to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "stratified")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BASELINE 2: STRATIFIED SAMPLING",
            newfile=False,
        )

        # append 10-NN results obtained from input vectors to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "10-NN")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BEFORE STAGE 1",
            newfile=False,
        )

        # train the bimodal auto encoder
        bm_ae.train()

        # get the dimensionality-reduced embeddings after training (vectors of hidden unit activation values)
        embeddings = bm_ae.embeddings_over_epochs[-1]

        # append 10-NN results obtained from hidden embeddings to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, "10-NN")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch=bm_ae.epochs,
            condition=training_condition,
            training_stage="AFTER STAGE 1",
            newfile=False,
        )

        # plot word embeddings, reduced to two dimensions via the t-SNE algorithm
        plot_2D_embeddings(embeddings, training_condition, training_stage="after_stage_1")
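
        # plot_2D_embeddings presumably wraps a t-SNE projection; an equivalent
        # stand-alone reduction with scikit-learn (an assumed dependency) would be:
        #     from sklearn.manifold import TSNE
        #     points_2d = TSNE(n_components=2).fit_transform(np.vstack(list(embeddings.values())))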

        # plot micro F1 over all training epochs
        plot_metric(
            plot_name="%s_auto_encoder" % training_condition,
            plot_type="micro_f1_over_epochs",
            ys=bm_ae.f1_over_epochs,
            label="micro_f1",
        )

        # plot training error over all training epochs
        plot_metric(
            plot_name="%s_auto_encoder" % training_condition,
            plot_type="training_error_over_epochs",
            ys=bm_ae.training_error_over_epochs,
            label="training_error",
        )

        # create and save a new data set with the new embeddings -- for further training with the softmax model
        save_bimodal_data_set(bm_ae, data_set_name=training_condition, embeddings_dict=embeddings)
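

# a conventional entry point (assumed; run() may instead be invoked from elsewhere):
if __name__ == "__main__":
    run()


# the code below is the data-set extraction script: presumably run once, before
# run() above, to create the 'CDS' data set that load_text_data_set() loads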
from TrainingData.data_sets.extract_from_corpus import create_and_save_data_set
from TrainingData.load_corpus.load_corpus_data import get_target_words


# exclude the following words, all longer than three syllables; more than three syllables is not supported by the
# system used to construct phonological feature vectors
more_than_three_syllables = ['actually', 'anybody', 'investigator', 'helicopter', 'interesting', 'cookie_monster',
                             'definitely', 'refrigerator', 'oh_my_goodness', 'humpty_dumpty', 'interested',
                             'everybody', 'father_christmas', 'alligator', 'caterpillar', "everybody's",
                             'hippopotamus']

# also exclude the empty string, which occasionally occurs in the corpus
empty_string = ['']


# get vocabulary -- this is a list of the 2000 most frequent word strings,
# with each word given in the format 'word-pos_tag'.
# there are four POS tags: 'v' (verb), 'n' (noun), 'adj' (adjective), 'fn' (function word / closed-class word)
vocabulary = get_target_words(2000, tags={'n', 'v', 'adj', 'fn'},
                              exclude_words=more_than_three_syllables + empty_string)

# the vocabulary composition by POS tag is as follows:
# nouns:        1010
# adjectives:    166
# closed class:  522
# verbs:         302


# go through the corpus and extract the text-based part of the data set, consisting of (1) vectors with frequency
# counts for each target word's left context and (2) words sampled from the right context; save the extracted data
# set to disk
create_and_save_data_set('CDS', vocabulary=vocabulary, window_size=3)
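

# minimal sketch of the left-context counting idea behind the extracted vectors
# (hypothetical helper; not create_and_save_data_set's actual implementation):
from collections import Counter


def left_context_counts(tokens, target, window_size=3):
    """Frequency counts over the window_size tokens to the left of each
    occurrence of `target` in a token list (illustrative only)."""
    counts = Counter()
    for i, tok in enumerate(tokens):
        if tok == target:
            counts.update(tokens[max(0, i - window_size):i])
    return counts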