Example #1
    def train(self):

        print "There are %s training examples." % self.embeddings_matrix.get_value(borrow=True).shape[0]
        print

        # list of indices, each pointing to a row in the embeddings matrix
        train_set_idxs, train_fn = self._get_train_function()

        start_time = time.time()
        epoch = -1
        previous_accuracy = -numpy.inf
        while True:
            epoch += 1

            costs_over_batches = []
            for y_idx, idx in enumerate(train_set_idxs):
                cost = train_fn(idx, y_idx)
                costs_over_batches.append(cost)

            # store embeddings and evaluation metrics at each epoch
            training_loss = numpy.mean(costs_over_batches)
            embeddings = dict(zip(self.vocabulary, self.embeddings_matrix.get_value()))
            micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, '10-NN')
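            # NOTE (assumption): '10-NN' is taken to mean a 10-nearest-neighbour classifier over the
            # current embeddings; its micro F1 (equal to accuracy in this single-label setting)
            # doubles as the early-stopping criterion below.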

            print('epoch: %s -- loss: %s -- 10-NN accuracy: %s' % (epoch, training_loss, micro_f1))
            print(classification_report)
            print()

            if micro_f1 <= previous_accuracy:
                print('Accuracy did not increase relative to the last epoch.')
                print('Aborting training and discarding results from the current epoch.')
                break

            self.training_error_over_epochs.append(training_loss)
            self.f1_over_epochs.append(micro_f1)
            self.embeddings_over_epochs.append(embeddings)
            self.epochs = epoch
            previous_accuracy = micro_f1

            print('Time since beginning of training: %s minutes' % ((time.time() - start_time) / 60))
            print()

        print('Training took: %s minutes' % ((time.time() - start_time) / 60))
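
# The evaluation helper get_f1_and_classification_report is project-internal and not shown in this
# example. Below is a minimal sketch of what such a helper might look like, assuming '10-NN' denotes
# a 10-nearest-neighbour classifier scored with cross-validated predictions via scikit-learn, and
# that gold word categories come from a hypothetical word_categories dict; the actual implementation
# used in this project may differ.
import numpy
from sklearn.metrics import classification_report as sk_classification_report, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier

def knn_f1_sketch(embeddings, word_categories, n_neighbors=10):
    """Return micro F1, macro F1 and a classification report for k-NN category predictions."""
    words = sorted(embeddings)
    X = numpy.array([embeddings[w] for w in words])
    y = numpy.array([word_categories[w] for w in words])
    predictions = cross_val_predict(KNeighborsClassifier(n_neighbors=n_neighbors), X, y, cv=5)
    micro_f1 = f1_score(y, predictions, average='micro')
    macro_f1 = f1_score(y, predictions, average='macro')
    return micro_f1, macro_f1, sk_classification_report(y, predictions)
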
def run():

    for training_condition in ['context', 'context_stress', 'context_phonology']:

        # load data set with dimensionality-reduced word embeddings obtained from bimodal auto encoder
        # trained in 'train_bimodal_auto_encoder.py'
        data_set = load_bimodal_data_set(training_condition)

        # softmax model for predicting words from the right context
        softmax = SoftmaxPredictor(data_set=data_set,
                                   n_next_words=2000,
                                   learning_rate=0.008,
                                   input_size=500)
        # train the model
        softmax.train()

        # get word embeddings
        embeddings = softmax.embeddings_over_epochs[-1]

        # write predicted categories to file -- files can be used for additional processing, e.g. significance testing
        # on predictions made at two different stages in the training process
        write_preds_to_file(embeddings, '10-NN', '%s_10-NN_after_stage2' % training_condition)

        # append 10-NN results obtained from word embeddings to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, '10-NN')
        results_to_disk(micro_f1, macro_f1, classification_report, epoch=softmax.epochs,
                        condition=training_condition, training_stage='AFTER STAGE 2', newfile=False)

        # plot word embeddings, reduced to two dimensions via the t-SNE algorithm
        plot_2D_embeddings(embeddings, training_condition, training_stage='after_stage_2')

        # plot 10-NN accuracy (micro F1) over all training epochs
        plot_metric(plot_name='%s_softmax' % training_condition, plot_type='micro_f1_over_epochs',
                    ys=softmax.f1_over_epochs, label='micro_f1')

        # plot training error over all training epochs
        plot_metric(plot_name='%s_softmax' % training_condition, plot_type='training_error_over_epochs',
                    ys=softmax.training_error_over_epochs, label='training_error')
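
# plot_metric and write_preds_to_file are project-internal helpers. A minimal matplotlib-based
# sketch of plot_metric is given below for illustration; the file-naming scheme and styling are
# assumptions, not the actual helper.
import matplotlib.pyplot as plt

def plot_metric_sketch(plot_name, plot_type, ys, label):
    """Plot a per-epoch metric and save it as '<plot_name>_<plot_type>.png'."""
    plt.figure()
    plt.plot(range(len(ys)), ys, label=label)
    plt.xlabel("epoch")
    plt.ylabel(label)
    plt.legend()
    plt.savefig("%s_%s.png" % (plot_name, plot_type))
    plt.close()
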
def run():

    data_set = load_text_data_set("CDS")

    # exclude the following words, all longer than three syllables; the system used to construct
    # the phonological feature vectors does not support words longer than that
    more_than_three_syllables = [
        "actually",
        "anybody",
        "investigator",
        "helicopter",
        "interesting",
        "cookie_monster",
        "definitely",
        "refrigerator",
        "oh_my_goodness",
        "humpty_dumpty",
        "interested",
        "everybody",
        "father_christmas",
        "helicopter",
        "alligator",
        "caterpillar",
        "everybody's",
        "hippopotamus",
    ]

    # also exclude the empty string, which occasionally occurs in the corpus
    empty_string = [""]

    vocabulary = get_target_words(
        2000, tags={"n", "v", "adj", "fn"}, exclude_words=more_than_three_syllables + empty_string
    )

    # auto encoder for reducing the dimensionality of the phonological feature vectors
    phon_ae = PhonAutoEncoder(
        vocabulary=vocabulary, epochs=200, learning_rate=0.1, n_hidden=30, corruption_level=0.1, batch_size=1
    )

    phon_ae.train()

    for training_condition in ["context", "context_stress", "context_phonology"]:

        # if 'phonology' is part of the model name, get dimensionality-reduced phonological feature vectors
        if "phonology" in training_condition:
            phon_vs = phon_ae.get_hidden_vectors()
            phon_vs_size = 30
        else:
            phon_vs = None
            phon_vs_size = 0

        # if 'stress' is part of the model name, get dimensionality-reduced lexical stress feature vectors
        if "stress" in training_condition:
            stress_vs = get_primary_stress_vectors(vocabulary)
            stress_vs_size = 3
        else:
            stress_vs = None
            stress_vs_size = 0

        # auto encoder for projecting left-context and phonological / lexical stress feature vectors into a shared
        # dimensionality-reduced space
        bm_ae = BiModalAutoEncoder(
            data_set=data_set,
            context_feature_size=2000,
            phon_vectors=phon_vs,
            phon_feature_size=phon_vs_size,
            stress_vectors=stress_vs,
            stress_feature_size=stress_vs_size,
            learning_rate=0.01,
            n_hidden=500,
            corruption_level=0.1,
            batch_size=1,
        )

        # vectors of left-context frequencies, plus phonological and / or lexical stress features
        input_embeddings = dict(zip(bm_ae.vocabulary, bm_ae.embeddings_matrix.get_value()))

        # write predicted categories to file -- files can be used for additional processing, e.g. significance testing
        # on predictions made at two different stages in the training process
        write_preds_to_file(input_embeddings, "majority_vote", "%s_majority_vote" % training_condition)
        write_preds_to_file(input_embeddings, "stratified", "%s_stratified_sampling" % training_condition)
        write_preds_to_file(input_embeddings, "10-NN", "%s_10-NN_before_stage1" % training_condition)

        # create a new file with results by training stage -- write majority vote baseline results to this file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "majority_vote")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BASELINE 1: MAJORITY VOTE",
            newfile=True,
        )

        # append stratified sampling baseline results to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "stratified")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BASELINE 2: STRATIFIED SAMPLING",
            newfile=False,
        )

        # append 10-NN results obtained from input vectors to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(input_embeddings, "10-NN")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch="model was not trained at this stage",
            condition=training_condition,
            training_stage="BEFORE STAGE 1",
            newfile=False,
        )

        # train the bimodal auto encoder
        bm_ae.train()

        # get the dimensionality-reduced embeddings after training (vectors of hidden unit activation values)
        embeddings = bm_ae.embeddings_over_epochs[-1]

        # append 10-NN results obtained from hidden embeddings to file
        micro_f1, macro_f1, classification_report = get_f1_and_classification_report(embeddings, "10-NN")
        results_to_disk(
            micro_f1,
            macro_f1,
            classification_report,
            epoch=bm_ae.epochs,
            condition=training_condition,
            training_stage="AFTER STAGE 1",
            newfile=False,
        )

        # plot word embeddings, reduced to two dimensions via the t-SNE algorithm
        plot_2D_embeddings(embeddings, training_condition, training_stage="after_stage_1")

        # plot 10-NN accuracy (micro F1) over all training epochs
        plot_metric(
            plot_name="%s_auto_encoder" % training_condition,
            plot_type="micro_f1_over_epochs",
            ys=bm_ae.f1_over_epochs,
            label="micro_f1",
        )

        # plot training error over all training epochs
        plot_metric(
            plot_name="%s_auto_encoder" % training_condition,
            plot_type="training_error_over_epochs",
            ys=bm_ae.training_error_over_epochs,
            label="training_error",
        )

        # create and save a new data set with the new embeddings -- for further training with the softmax model
        save_bimodal_data_set(bm_ae, data_set_name=training_condition, embeddings_dict=embeddings)
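
# The two run() functions above belong to different training scripts: the second one trains the
# phonological and bimodal auto-encoders (stage 1) and saves the resulting data set, while the
# first one trains the softmax predictor on that data set (stage 2). A hypothetical driver that
# executes the stages in order might look like this; train_softmax_predictor is an assumed module
# name, whereas train_bimodal_auto_encoder.py is referenced in the comments above.
if __name__ == "__main__":
    import train_bimodal_auto_encoder
    import train_softmax_predictor  # hypothetical name for the stage-2 (softmax) script

    train_bimodal_auto_encoder.run()  # stage 1: learn dimensionality-reduced input embeddings
    train_softmax_predictor.run()     # stage 2: predict right-context words from those embeddings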